From d19c9b6638babc5a95988f957014604e827814f5 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Thu, 9 Jul 2020 16:41:04 -0600 Subject: [PATCH 001/106] initial integration of "batched" trtri & trmm --- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 101 ++++++++++++++++++- 1 file changed, 97 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 6f230780cc..0820032904 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -57,8 +57,11 @@ #include "KokkosBlas3_trmm.hpp" #include "KokkosBlas_trtri.hpp" -#include "KokkosSparse_sptrsv.hpp" +#include "KokkosBatched_Trtri_Decl.hpp" +#include "KokkosBatched_Trtri_Serial_Impl.hpp" + +#include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { namespace Experimental { @@ -1082,6 +1085,89 @@ void sptrsv_supernodal_symbolic( /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ /* Auxiliary functions for numeric computation */ +/* ========================================================================================= */ + template + struct TriSupernodalTrtriFunctor { + + bool lower_tri; + bool unit_diag; + bool invert_offdiag; + const input_size_type *nb; + row_map_type hr; + index_type hc; + values_type hv; + + KOKKOS_INLINE_FUNCTION + TriSupernodalTrtriFunctor(bool lower_tri_, bool unit_diag_, bool invert_offdiag_, const input_size_type *nb_, + row_map_type& hr_, index_type& hc_, values_type& hv_) : + lower_tri(lower_tri_), + unit_diag(unit_diag_), + invert_offdiag(invert_offdiag_), + nb(nb_), + hr(hr_), + hc(hc_), + hv(hv_) + {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int s) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using 
range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + using Uplo = KokkosBatched::Uplo; + using Diag = KokkosBatched::Diag; + using Side = KokkosBatched::Side; + using Trans = KokkosBatched::Trans; + + int j1 = nb[s]; + int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + + if (lower_tri) { + if (unit_diag) { + KokkosBatched::SerialTrtri::invoke(Ljj); + } else { + KokkosBatched::SerialTrtri::invoke(Ljj); + } + } else { + if (unit_diag) { + KokkosBatched::SerialTrtri::invoke(Ljj); + } else { + KokkosBatched::SerialTrtri::invoke(Ljj); + } + } + if (nsrow > nscol && invert_offdiag) { + const scalar_t one (1.0); + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + if (lower_tri) { + if (unit_diag) { + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } else { + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } + } else { + if (unit_diag) { + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } else { + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } + } + } + } + }; /* ========================================================================================= */ template @@ -1090,9 +1176,9 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, row_map_type& hr, index_type& hc, values_type& hv) { using execution_space = typename values_type::execution_space; - using memory_space = typename execution_space::memory_space; - using values_view_t = typename values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; using range_type = Kokkos::pair; const scalar_t one (1.0); @@ -1115,6 +1201,12 @@ 
invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, // ---------------------------------------------------------- // now let's invert some blocks + #if 1 + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (lower, unit_diag, invert_offdiag, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, nsuper), sptrsv_tritri_functor); + #else for (int s2 = 0; s2 < nsuper; s2++) { int j1 = nb[s2]; int nsrow = hr(j1+1) - hr(j1); @@ -1144,6 +1236,7 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, time2 += timer.seconds (); } } + #endif #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE std::cout << " invert_supernodes" << std::endl; From 6009eb79047b7350a6364aa71e271cf24412fb39 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Wed, 15 Jul 2020 02:17:25 -0600 Subject: [PATCH 002/106] > template with upper/lower and unit/non-unit > use KokkosBlas for larger blocks, and use batchedBlas for smaller ones --- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 161 +++++++++++-------- 1 file changed, 94 insertions(+), 67 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 0820032904..16b55d0876 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -61,6 +61,9 @@ #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" +#include "KokkosBatched_Trmm_Decl.hpp" +#include "KokkosBatched_Trmm_Serial_Impl.hpp" + #include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { @@ -903,6 +906,7 @@ void sptrsv_supernodal_symbolic( host_graph_t graphU_host, KernelHandle *kernelHandleU) { #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + int nrows = graphL_host.numRows (); double time_seconds = 0.0; Kokkos::Timer timer; Kokkos::Timer tic; @@ -949,7 +953,6 @@ void sptrsv_supernodal_symbolic( int nsuper_merged = nsuper; #ifdef 
KOKKOS_SPTRSV_SUPERNODE_PROFILE tic.reset (); - int nrows = graphL_host.numRows (); check_supernode_sizes("Original L-structure", nrows, nsuper, supercols_merged, graphL_host); check_supernode_sizes("Original U-structure", nrows, nsuper, supercols_merged, graphU_host); #endif @@ -1086,23 +1089,22 @@ void sptrsv_supernodal_symbolic( /* Auxiliary functions for numeric computation */ /* ========================================================================================= */ - template + template struct TriSupernodalTrtriFunctor { - bool lower_tri; - bool unit_diag; bool invert_offdiag; + integer_view_host_t supernode_ids; const input_size_type *nb; row_map_type hr; index_type hc; values_type hv; KOKKOS_INLINE_FUNCTION - TriSupernodalTrtriFunctor(bool lower_tri_, bool unit_diag_, bool invert_offdiag_, const input_size_type *nb_, + TriSupernodalTrtriFunctor(bool invert_offdiag_, integer_view_host_t supernode_ids_, const input_size_type *nb_, row_map_type& hr_, index_type& hc_, values_type& hv_) : - lower_tri(lower_tri_), - unit_diag(unit_diag_), invert_offdiag(invert_offdiag_), + supernode_ids(supernode_ids_), nb(nb_), hr(hr_), hc(hc_), @@ -1110,7 +1112,7 @@ void sptrsv_supernodal_symbolic( {} KOKKOS_INLINE_FUNCTION - void operator() (const int s) const { + void operator() (const int i) const { using execution_space = typename values_type::execution_space; using memory_space = typename execution_space::memory_space; using values_view_t = typename values_type::non_const_type; @@ -1118,11 +1120,10 @@ void sptrsv_supernodal_symbolic( using range_type = Kokkos::pair; using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; - using Uplo = KokkosBatched::Uplo; - using Diag = KokkosBatched::Diag; using Side = KokkosBatched::Side; using Trans = KokkosBatched::Trans; + int s = supernode_ids(i); int j1 = nb[s]; int nsrow = hr(j1+1) - hr(j1); int nscol = nb[s +1] - nb[s]; @@ -1131,40 +1132,13 @@ void sptrsv_supernodal_symbolic( Kokkos::View viewL (&hv(nnzD), nsrow, nscol); 
auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); - if (lower_tri) { - if (unit_diag) { - KokkosBatched::SerialTrtri::invoke(Ljj); - } else { - KokkosBatched::SerialTrtri::invoke(Ljj); - } - } else { - if (unit_diag) { - KokkosBatched::SerialTrtri::invoke(Ljj); - } else { - KokkosBatched::SerialTrtri::invoke(Ljj); - } - } if (nsrow > nscol && invert_offdiag) { const scalar_t one (1.0); auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); - if (lower_tri) { - if (unit_diag) { - KokkosBatched::SerialTrmm:: - invoke(one, Ljj, Lij); - } else { - KokkosBatched::SerialTrmm:: - invoke(one, Ljj, Lij); - } - } else { - if (unit_diag) { - KokkosBatched::SerialTrmm:: - invoke(one, Ljj, Lij); - } else { - KokkosBatched::SerialTrmm:: - invoke(one, Ljj, Lij); - } - } + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); } } }; @@ -1180,6 +1154,7 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, using values_view_t = typename values_type::non_const_type; using scalar_t = typename values_view_t::value_type; using range_type = Kokkos::pair; + using integer_view_host_t = Kokkos::View; const scalar_t one (1.0); @@ -1195,53 +1170,105 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, // quick return if (!invert_diag) return; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE Kokkos::Timer timer; double time1 = 0.0; double time2 = 0.0; + double time3 = 0.0; + #endif // ---------------------------------------------------------- // now let's invert some blocks - #if 1 - using range_policy = Kokkos::RangePolicy; - TriSupernodalTrtriFunctor - sptrsv_tritri_functor (lower, unit_diag, invert_offdiag, nb, hr, hc, hv); - Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, nsuper), sptrsv_tritri_functor); - #else + // > first go through all the supernode columns + // > use KokkosBlas on large blocks, and keep track of small blocks 
+ // > to call batchedBlas on them + int num_batchs = 0; + int size_unblocked = handle->get_supernode_size_unblocked(); + integer_view_host_t supernode_ids ("supernode_batch", nsuper); for (int s2 = 0; s2 < nsuper; s2++) { - int j1 = nb[s2]; - int nsrow = hr(j1+1) - hr(j1); int nscol = nb[s2+1] - nb[s2]; - auto nnzD = hr (j1); - char uplo_char = (lower ? 'L' : 'U'); - char diag_char = (unit_diag ? 'U' : 'N'); - - Kokkos::View - viewL (&hv(nnzD), nsrow, nscol); - auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + if (nscol >= size_unblocked) { + int j1 = nb[s2]; + int nsrow = hr(j1+1) - hr(j1); - timer.reset (); - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); - time1 += timer.seconds (); + auto nnzD = hr (j1); + char uplo_char = (lower ? 'L' : 'U'); + char diag_char = (unit_diag ? 'U' : 'N'); - if (nsrow > nscol && invert_offdiag) { - char side_char = 'R'; - char tran_char = 'N'; - auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE timer.reset (); - KokkosBlas::trmm (&side_char, &uplo_char, - &tran_char, &diag_char, - one, Ljj, Lij); - time2 += timer.seconds (); + #endif + KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time1 += timer.seconds (); + #endif + + if (nsrow > nscol && invert_offdiag) { + char side_char = 'R'; + char tran_char = 'N'; + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + KokkosBlas::trmm (&side_char, &uplo_char, + &tran_char, &diag_char, + one, Ljj, Lij); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time2 += timer.seconds (); + #endif + } + } + else { + supernode_ids (num_batchs) = s2; + num_batchs ++; } } - #endif + // now call batchedBLAS + if (num_batchs > 0) { + using 
range_policy = Kokkos::RangePolicy; + using Uplo = KokkosBatched::Uplo; + using Diag = KokkosBatched::Diag; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + if (lower) { + if (unit_diag) { + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } else { + if (unit_diag) { + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time3 = timer.seconds (); + #endif + } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE std::cout << " invert_supernodes" << std::endl; + std::cout << " + num supernodes = " << nsuper << " num batchs = " << num_batchs << std::endl; std::cout << " > Time for inversion::trtri : " << time1 << std::endl; std::cout << " > Time for inversion::trmm : " << time2 << std::endl; + std::cout << " > Time for batchs : " << time3 << std::endl; #endif } From 24c9a862b3d92abed6ec270c7c4ca822789a8fe5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 17 Aug 2020 17:06:04 -0600 Subject: [PATCH 003/106] cm_test_all_sandia: add option to enable view bounds checking --- scripts/cm_test_all_sandia | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index dc905965a6..481e713225 100755 --- 
a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -20,6 +20,7 @@ print_help() { echo "--spack: Run spack builds rather than direct CMake tests" echo "" echo "--debug: Run tests in debug. Defaults to False" + echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds." echo "--test-script: Test this script, not Kokkos" echo "--skip-hwloc: Do not do hwloc tests" echo "--num=N: Number of jobs to run in parallel" @@ -273,6 +274,9 @@ do --debug*) DEBUG=True ;; + --boundscheck*) + KOKKOS_BOUNDS_CHECK="--boundscheck" + ;; --build-only*) BUILD_ONLY=True ;; @@ -1243,13 +1247,13 @@ single_build_and_test() { # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} 
--with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples 
$extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 if [[ "$MACHINE" = bowman* ]] || [[ "$MACHINE" = white* ]]; then From 21eccbf2b1d49977483a3e9f954473eaf8f5c3ad Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 20 Aug 2020 00:15:35 -0600 Subject: [PATCH 004/106] cm_test_all_sandia: update testing for c++14, remove old compilers add workaround for intel/17 modules --- scripts/cm_test_all_sandia | 42 +++++++------------------------------- scripts/update_lib.sh | 26 +++++++++++++++++++++++ 2 files changed, 33 
insertions(+), 35 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 481e713225..51b394c24e 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -203,7 +203,7 @@ CXX_FLAGS_EXTRA="" LD_FLAGS_EXTRA="" KOKKOS_OPTIONS="" -CXX_STANDARD="11" +CXX_STANDARD="14" GCC_VARIANTS="+blas+lapack +openmp+serial" CLANG_VARIANTS="+blas+lapack +openmp+serial" @@ -213,8 +213,6 @@ PGI_VARIANTS="$GCC_VARIANTS" SPACK_VARIANTS=("cuda 10.0 $CUDA_VARIANTS std=14" "cuda 10.1 $CUDA_VARIANTS" "cuda 9.2 $CUDA_VARIANTS" - "gcc 4.8.4 $GCC_VARIANTS" - "gcc 4.9.3 $GCC_VARIANTS" "gcc 5.3.0 $GCC_VARIANTS" "gcc 6.1.0 $GCC_VARIANTS" "gcc 7.2.0 $GCC_VARIANTS std=14" @@ -222,8 +220,6 @@ SPACK_VARIANTS=("cuda 10.0 $CUDA_VARIANTS std=14" "gcc 8.3.0 $GCC_VARIANTS std=14" "gcc 9.1 $GCC_VARIANTS std=17" "gcc 9.2.0 $GCC_VARIANTS std=17" - "intel 15.0.2 $INTEL_VARIANTS" - "intel 16.0.1 $INTEL_VARIANTS" "intel 17.0.1 $INTEL_VARIANTS" "intel 18.0.5 $INTEL_VARIANTS" "intel 19.0.5 $INTEL_VARIANTS std=14" @@ -425,13 +421,9 @@ if [ "$MACHINE" = "sems" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" @@ 
-460,7 +452,6 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST "Serial,Pthread" icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/4.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" @@ -470,7 +461,6 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST "Serial,Pthread" icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/4.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" @@ -478,13 +468,9 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ 
$CLANG_WARNING_FLAGS" "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" @@ -615,8 +601,8 @@ elif [ "$MACHINE" = "bowman" ]; then OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.2.199 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -672,14 +658,12 @@ elif [ "$MACHINE" = "blake" ]; then COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" ) fi @@ -717,18 +701,14 @@ elif [ "$MACHINE" = "apollo" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" 
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" "cuda/9.1 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" "cuda/9.1 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" @@ -746,8 +726,6 @@ elif [ "$MACHINE" = "apollo" ]; then "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" ) @@ -791,7 +769,6 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "intel/19.0.5 $BASE_MODULE_LIST 
"Pthread" icpc $INTEL_WARNING_FLAGS" @@ -805,7 +782,6 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "intel/19.0.5 $BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS" @@ -820,8 +796,6 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then "cuda/9.2 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CLANG8_CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" @@ -829,8 +803,6 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 
ce3693409c..95e6cb597c 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,6 +1,32 @@ #!/bin/bash if [ "$1" = bowman ]; then + ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap gcc/4.9.3 gcc/6.2.0 + module list + fi export LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib/gcc/x86_64-pc-linux-gnu/6.2.0:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LIBRARY_PATH export LD_LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib64:/home/projects/x86-64-knl/gcc/6.2.0/lib:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LD_LIBRARY_PATH fi +if [ "$1" = blake ]; then + ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap gcc/4.9.3 gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev ]; then + ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev-2 ]; then + ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module list + fi +fi From 6dd10175bbb765e4897d0ebf4f2af740a4703327 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 20 Aug 2020 12:20:51 -0600 Subject: [PATCH 005/106] test scripts update --- scripts/cm_test_all_sandia | 1 + scripts/update_lib.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 51b394c24e..a40aa1e29f 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -658,6 +658,7 @@ elif [ "$MACHINE" = "blake" ]; then COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 95e6cb597c..ea10ae42c1 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,8 +1,8 @@ #!/bin/bash if [ "$1" = bowman ]; then - ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.* ]]; then module swap gcc/4.9.3 gcc/6.2.0 module list fi @@ -10,21 +10,21 @@ if [ "$1" = bowman ]; then export LD_LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib64:/home/projects/x86-64-knl/gcc/6.2.0/lib:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LD_LIBRARY_PATH fi if [ "$1" = blake ]; then - ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then module swap gcc/4.9.3 gcc/6.4.0 module list fi 
fi if [ "$1" = kokkos-dev ]; then - ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.8.4 sems-gcc/6.4.0 module list fi fi if [ "$1" = kokkos-dev-2 ]; then - ICPCVER="$(icpc --version | grep 17 | cut -d ' ' -f 3)" + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.8.4 sems-gcc/6.4.0 module list From 380cfcb6870ae4faa0bfe5be7c7f44e2272f97b2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 20 Aug 2020 14:16:47 -0600 Subject: [PATCH 006/106] Fix spot-checks with CXX14 --- src/batched/KokkosBatched_SetTriangular_Internal.hpp | 2 +- unit_test/batched/Test_Batched_TeamVectorSolveUTV.hpp | 2 +- unit_test/batched/Test_Batched_TeamVectorSolveUTV2.hpp | 2 +- unit_test/batched/Test_Batched_TeamVectorUTV.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/batched/KokkosBatched_SetTriangular_Internal.hpp b/src/batched/KokkosBatched_SetTriangular_Internal.hpp index cc9e95393f..fd74915ea9 100644 --- a/src/batched/KokkosBatched_SetTriangular_Internal.hpp +++ b/src/batched/KokkosBatched_SetTriangular_Internal.hpp @@ -50,7 +50,7 @@ namespace KokkosBatched { const int jdist = j+ dist; Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, m), - [&](const int &i) { + [=](const int &i) { if (i >= jdist) A[i*as0+j*as1] = alpha; }); diff --git a/unit_test/batched/Test_Batched_TeamVectorSolveUTV.hpp b/unit_test/batched/Test_Batched_TeamVectorSolveUTV.hpp index 92990bc4be..93b7dddc14 100644 --- a/unit_test/batched/Test_Batched_TeamVectorSolveUTV.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorSolveUTV.hpp @@ -74,7 +74,7 @@ namespace Test { } else { Kokkos::parallel_for (Kokkos::TeamVectorRange(member, m*m), - [&](const int &ij) { + [=](const int &ij) { const int i = ij/m, j = ij%m; value_type tmp(0); for (int l=0;l Date: Thu, 20 Aug 2020 16:20:53 -0600 Subject: [PATCH 
007/106] cm_test_all_sandia: Disable failing IBM toolchain --- scripts/cm_test_all_sandia | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index a40aa1e29f..1c75b93d52 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -503,11 +503,12 @@ elif [ "$MACHINE" = "white" ]; then # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + # Don't run the IBM toolchain with CXX14 on white + # "ibm/16.1.1 $IBM_MODULE_LIST "Serial" xlC $IBM_WARNING_FLAGS" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.1 $IBM_MODULE_LIST "Serial" xlC $IBM_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.105 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) From ccef0a1d9d13835c438dadb5cbbf6b45d86006d3 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 21 Aug 2020 16:00:03 -0700 Subject: [PATCH 008/106] update_lib: fix module swap for kokkos-dev-2 --- scripts/update_lib.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index ea10ae42c1..fcec9ea9f2 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,5 +1,7 @@ #!/bin/bash +echo "CALLING UPDATE_LIB" + if [ "$1" = bowman ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.* ]]; then @@ -25,8 +27,10 @@ if [ "$1" = kokkos-dev ]; then fi if [ "$1" = kokkos-dev-2 ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + echo "KOKKOSDEV2" + echo " ICPCVER=$ICPCVER" if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.8.4 
sems-gcc/6.4.0 + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list fi fi From 3e7f99f616258c1f2de4025cd9067c8e7fbc7267 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 21 Aug 2020 17:04:10 -0600 Subject: [PATCH 009/106] cleanup echos --- scripts/update_lib.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index fcec9ea9f2..69a179cd56 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -27,8 +27,6 @@ if [ "$1" = kokkos-dev ]; then fi if [ "$1" = kokkos-dev-2 ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - echo "KOKKOSDEV2" - echo " ICPCVER=$ICPCVER" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list From 294cdef094cbea86c6442015b9a0766e060e0ec3 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 19 Aug 2020 21:40:58 -0600 Subject: [PATCH 010/106] Adding Changelog for Release 3.2.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.2 Update Kokkos version macros to 3.2.0 --- CHANGELOG.md | 20 ++++++++++++++++++++ CMakeLists.txt | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74990b8415..0380691d0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Change Log +## [3.2.00](https://github.com/kokkos/kokkos-kernels/tree/3.2.00) (2020-08-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.01...3.2.00) + +**Implemented enhancements:** + +- Add CudaUVMSpace specializations for cuBLAS IAMAX and SCAL [\#758](https://github.com/kokkos/kokkos-kernels/issues/758) +- Add wiki examples [\#735](https://github.com/kokkos/kokkos-kernels/issues/735) +- Support complex\_float, complex\_double in cuSPARSE SPMV wrapper [\#726](https://github.com/kokkos/kokkos-kernels/issues/726) +- Add performance tests for trmm and trtri [\#711](https://github.com/kokkos/kokkos-kernels/issues/711) +- SpAdd requires output values to be zero-initialized, 
but this shouldnt be needed [\#694](https://github.com/kokkos/kokkos-kernels/issues/694) +- SpAdd doesnt merge entries correctly [\#685](https://github.com/kokkos/kokkos-kernels/issues/685) +- cusparse SpMV merge algorithm [\#670](https://github.com/kokkos/kokkos-kernels/issues/670) +- TPL support for SpMV [\#614](https://github.com/kokkos/kokkos-kernels/issues/614) +- Add two BLAS/LAPACK calls needed by: Sptrsv supernode \#552 [\#589](https://github.com/kokkos/kokkos-kernels/issues/589) +- HashmapAccumulator has several unused members, misnamed parameters [\#508](https://github.com/kokkos/kokkos-kernels/issues/508) + +**Fixed bugs:** + +- Nightly test failure: spgemm unit tests failing on White \(Power8\) [\#780](https://github.com/kokkos/kokkos-kernels/issues/780) +- supernodal does not build with UVM enabled [\#633](https://github.com/kokkos/kokkos-kernels/issues/633) ## [3.1.00](https://github.com/kokkos/kokkos-kernels/tree/3.1.00) (2020-04-14) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.0.00...3.1.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index d2164a2794..d70710325a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 1) + SET(KokkosKernels_VERSION_MINOR 2) SET(KokkosKernels_VERSION_PATCH 0) ENDIF() From 873f503c8c6ac8a71a768bc442c02a2874085eb9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sat, 8 Aug 2020 13:02:44 -0600 Subject: [PATCH 011/106] Distance-2 maximal independent set. -Unit test, perf test -Fast version (based on Luby) and high-quality version (based on ECL) -Fast version yields ~5x speedup vs. CUSP MIS2 (only other implementation I know of) -Bitonic sorting functions now take a comparator object. Not currently used by MIS2 but may be in the future. Backward compatible interface. 
--- perf_test/graph/CMakeLists.txt | 5 + perf_test/graph/KokkosGraph_mis_d2.cpp | 310 ++++++++ src/common/KokkosKernels_Handle.hpp | 4 +- src/common/KokkosKernels_Sorting.hpp | 40 +- src/common/KokkosKernels_SparseUtils.hpp | 73 +- src/graph/KokkosGraph_Distance1Color.hpp | 2 - src/graph/KokkosGraph_MIS2.hpp | 93 +++ .../impl/KokkosGraph_Distance2MIS_impl.hpp | 737 ++++++++++++++++++ ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 6 +- .../impl/KokkosSparse_partitioning_impl.hpp | 34 +- unit_test/cuda/Test_Cuda_Graph_mis2.cpp | 2 + .../Test_Graph_graph_color_distance2.hpp | 1 + unit_test/graph/Test_Graph_mis2.hpp | 190 +++++ unit_test/openmp/Test_OpenMP_Graph_mis2.cpp | 2 + unit_test/serial/Test_Serial_Graph_mis2.cpp | 2 + unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 70 +- unit_test/threads/Test_Threads_Graph_mis2.cpp | 3 + 17 files changed, 1463 insertions(+), 111 deletions(-) create mode 100644 perf_test/graph/KokkosGraph_mis_d2.cpp create mode 100644 src/graph/KokkosGraph_MIS2.hpp create mode 100644 src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp create mode 100644 unit_test/cuda/Test_Cuda_Graph_mis2.cpp create mode 100644 unit_test/graph/Test_Graph_mis2.hpp create mode 100644 unit_test/openmp/Test_OpenMP_Graph_mis2.cpp create mode 100644 unit_test/serial/Test_Serial_Graph_mis2.cpp create mode 100644 unit_test/threads/Test_Threads_Graph_mis2.cpp diff --git a/perf_test/graph/CMakeLists.txt b/perf_test/graph/CMakeLists.txt index bf7ae17082..134a7acc2e 100644 --- a/perf_test/graph/CMakeLists.txt +++ b/perf_test/graph/CMakeLists.txt @@ -11,6 +11,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosGraph_color_d2.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + graph_mis_d2 + SOURCES KokkosGraph_mis_d2.cpp + ) + #Below will probably fail on GPUs. 
#KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp new file mode 100644 index 0000000000..52e680e855 --- /dev/null +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -0,0 +1,310 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spadd.hpp" +#include "KokkosGraph_MIS2.hpp" +#include "KokkosKernels_default_types.hpp" + +using namespace KokkosGraph; + +struct MIS2Parameters +{ + int repeat = 1; + bool verbose = false; + int use_threads = 0; + int use_openmp = 0; + int use_cuda = 0; + int use_serial = 0; + const char* mtx_file = NULL; + MIS2_Algorithm algo = MIS2_FAST; +}; + +void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) +{ + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Required Parameters:" << std::endl + << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl + << std::endl + << spaces << " Device type (the following are enabled in this build):" << std::endl +#ifdef KOKKOS_ENABLE_SERIAL + << spaces << " --serial Execute serially." 
<< std::endl +#endif +#ifdef KOKKOS_ENABLE_THREADS + << spaces << " --threads Use posix threads.\n" +#endif +#ifdef KOKKOS_ENABLE_OPENMP + << spaces << " --openmp Use OpenMP.\n" +#endif +#ifdef KOKKOS_ENABLE_CUDA + << spaces << " --cuda Use CUDA.\n" +#endif + << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces << " --algo alg alg: fast, quality" << std::endl + << spaces << " --repeat Set number of test repetitions (Default: 1) " << std::endl + << spaces << " --verbose Enable verbose mode (record and print timing + extra information)" << std::endl + << spaces << " --help Print out command line help." << std::endl + << spaces << " " << std::endl; +} + +static char* getNextArg(int& i, int argc, char** argv) +{ + i++; + if(i >= argc) + { + std::cerr << "Error: expected additional command-line argument!\n"; + exit(1); + } + return argv[i]; +} + +int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) +{ + bool got_required_param_amtx = false; + for(int i = 1; i < argc; ++i) + { + if(0 == strcasecmp(argv[i], "--threads")) + { + params.use_threads = 1; + } + else if(0 == strcasecmp(argv[i], "--serial")) + { + params.use_serial = 1; + } + else if(0 == strcasecmp(argv[i], "--openmp")) + { + params.use_openmp = 1; + } + else if(0 == strcasecmp(argv[i], "--cuda")) + { + params.use_cuda = 1; + } + else if(0 == strcasecmp(argv[i], "--repeat")) + { + params.repeat = atoi(getNextArg(i, argc, argv)); + if(params.repeat <= 0) + { + std::cout << "*** Repeat count must be positive, defaulting to 1.\n"; + params.repeat = 1; + } + } + else if(0 == strcasecmp(argv[i], "--amtx")) + { + got_required_param_amtx = true; + params.mtx_file = getNextArg(i, argc, argv); + } + else if(0 == strcasecmp(argv[i], "--algo")) + { + const char* algName = getNextArg(i, argc, argv); + if(!strcasecmp(algName, "fast")) + params.algo = MIS2_FAST; + else if(!strcasecmp(algName, "quality")) + params.algo = MIS2_QUALITY; + else + throw std::invalid_argument("Algorithm not valid: 
must be 'fast' or 'quality'"); + } + else if(0 == strcasecmp(argv[i], "--verbose")) + { + params.verbose = true; + } + else if(0 == strcasecmp(argv[i], "--help") || 0 == strcasecmp(argv[i], "-h")) + { + print_options(std::cout, argv[0]); + return 1; + } + else + { + std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + } + + if(!got_required_param_amtx) + { + std::cout << "Missing required parameter amtx" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} + +template +void run_mis2(const MIS2Parameters& params) +{ + using size_type = default_size_type; + using lno_t = default_lno_t; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using lno_view_t = typename crsMat_t::index_type::non_const_type; + using KKH = KokkosKernels::Experimental::KokkosKernelsHandle; + + Kokkos::Timer t; + crsMat_t A_in = KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + std::cout << "I/O time: " << t.seconds() << " s\n"; + t.reset(); + //Symmetrize the matrix just in case + crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in); + crsMat_t A; + KKH kkh; + kkh.create_spadd_handle(false); + KokkosSparse::spadd_symbolic(&kkh, A_in, At_in, A); + KokkosSparse::spadd_numeric(&kkh, 1.0, A_in, 1.0, At_in, A); + kkh.destroy_spadd_handle(); + std::cout << "Time to symmetrize: " << t.seconds() << " s\n"; + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + lno_t numVerts = A.numRows(); + + std::cout << "Num verts: " << numVerts << '\n' + << "Num edges: " << A.nnz() << '\n'; + + lno_view_t mis; + + t.reset(); + for(int rep = 0; rep < params.repeat; rep++) + { + mis = 
KokkosGraph::Experimental::graph_d2_mis(rowmap, entries, params.algo); + exec_space().fence(); + } + double totalTime = t.seconds(); + std::cout << "MIS-2 average time: " << totalTime / params.repeat << '\n'; + std::cout << "MIS size: " << mis.extent(0) << '\n'; + + if(params.verbose) + { + std::cout << "Vertices in independent set:\n"; + KokkosKernels::Impl::print_1Dview(mis); + } +} + +int main(int argc, char *argv[]) +{ + MIS2Parameters params; + + if(parse_inputs(params, argc, argv)) + { + return 1; + } + + if(params.mtx_file == NULL) + { + std::cerr << "Provide a matrix file" << std::endl; + return 0; + } + + Kokkos::initialize(); + + bool run = false; + + #if defined(KOKKOS_ENABLE_OPENMP) + if(params.use_openmp) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_THREADS) + if(params.use_threads) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_CUDA) + if(params.use_cuda) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_SERIAL) + if(params.use_serial) + { + run_mis2(params); + run = true; + } + #endif + + if(!run) + { + std::cerr << "*** ERROR: did not run, none of the supported device types were selected.\n"; + } + + Kokkos::finalize(); + + return 0; +} diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 1713e7c460..d0ffa6ca85 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -609,10 +609,10 @@ class KokkosKernelsHandle } } - void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t verts_per_cluster) { + void create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t verts_per_cluster) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; - this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, verts_per_cluster); + this->gsHandle = new ClusterGaussSeidelHandleType(KokkosSparse::CLUSTER_BALLOON, verts_per_cluster); } void 
destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index bdb93c71b1..d9346aba61 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -250,7 +250,7 @@ struct DefaultComparator //Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. //If n is not a power-of-two, will implicitly pretend @@ -277,7 +277,6 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -316,7 +315,7 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) //Sort "values", while applying the same swaps to "perm" template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem) +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. 
//If n is not a power-of-two, will implicitly pretend @@ -343,7 +342,6 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -389,19 +387,20 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember template struct BitonicSingleTeamFunctor { - BitonicSingleTeamFunctor(View& v_) : v(v_) {} + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - TeamBitonicSort(v.data(), v.extent(0), t); + TeamBitonicSort(v.data(), v.extent(0), t, comp); }; View v; + Comparator comp; }; //Functor that sorts equally sized chunks on each team template struct BitonicChunkFunctor { - BitonicChunkFunctor(View& v_, Ordinal chunkSize_) : v(v_), chunkSize(chunkSize_) {} + BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal chunk = t.league_rank(); @@ -409,9 +408,10 @@ struct BitonicChunkFunctor Ordinal n = chunkSize; if(chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - TeamBitonicSort(v.data() + chunkStart, n, t); + TeamBitonicSort(v.data() + chunkStart, n, t, comp); }; View v; + Comparator comp; Ordinal chunkSize; }; @@ -420,8 +420,8 @@ template> -void bitonicSort(View v) +void bitonicSort(View v, const Comparator& comp = Comparator()) { typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; Ordinal n = v.extent(0); //If n is small, just sort on a single team - if(n <= Ordinal(1) << 16) + if(n <= Ordinal(1) << 12) { Kokkos::parallel_for(team_policy(1, 
Kokkos::AUTO()), - BitonicSingleTeamFunctor(v)); + BitonicSingleTeamFunctor(v, comp)); } else { @@ -552,16 +552,16 @@ void bitonicSort(View v) Ordinal numTeams = npot / chunkSize; //First, sort within teams Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicChunkFunctor(v, chunkSize)); + BitonicChunkFunctor(v, comp, chunkSize)); for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase1Functor(v, boxSize, teamsPerBox)); + BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) { Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase2Functor(v, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + BitonicPhase2Functor(v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); } } } diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 16a336f200..6f0c7ed647 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -858,13 +858,84 @@ inline size_t kk_is_d1_coloring_valid( struct ColorChecker cc(num_rows, xadj, adj, v_colors, team_work_chunk_size); size_t num_conf = 0; - Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValie", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , + Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValid", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), cc, num_conf); MyExecSpace().fence(); return num_conf; } +template +struct MinMaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MinMaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lminmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg < lminmax.min_val) + lminmax.min_val = deg; + if(deg > 
lminmax.max_val) + lminmax.max_val = deg; + } + rowmap_t rowmap; +}; + +template +struct MaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg > lmax) + lmax = deg; + } + rowmap_t rowmap; +}; + +template +ordinal_t graph_max_degree(const rowmap_t& rowmap) +{ + using Reducer = Kokkos::Max; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + return 0; + ordinal_t val; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MaxDegreeFunctor(rowmap), + Reducer(val)); + return val; +} + +template +void graph_min_max_degree(const rowmap_t& rowmap, ordinal_t& min_degree, ordinal_t& max_degree) +{ + using Reducer = Kokkos::MinMax; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + { + min_degree = 0; + max_degree = 0; + return; + } + typename Reducer::value_type result; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MinMaxDegreeFunctor(rowmap), + Reducer(result)); + min_degree = result.min_val; + max_degree = result.max_val; +} + template struct SortCrsMatrixFunctor { diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp index 83070c6e66..2e9a4bc03d 100644 --- a/src/graph/KokkosGraph_Distance1Color.hpp +++ b/src/graph/KokkosGraph_Distance1Color.hpp @@ -44,8 +44,6 @@ #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP -#include - #include "KokkosGraph_Distance1ColorHandle.hpp" #include "KokkosGraph_Distance1Color_impl.hpp" #include "KokkosKernels_Utils.hpp" diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp new file mode 100644 index 0000000000..98908c40a9 --- /dev/null +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -0,0 +1,93 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_HPP + +#include "KokkosGraph_Distance2MIS_impl.hpp" + +namespace KokkosGraph{ + +enum MIS2_Algorithm +{ + MIS2_QUALITY, + MIS2_FAST +}; + +namespace Experimental{ + +// Compute a distance-2 maximal independent set, given a symmetric CRS graph. +// Returns a list of the vertices in the set. +// +// Column indices >= num_verts are ignored. + +template +typename colinds_t::non_const_type +graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //zero vertices means the MIS is empty. + return typename colinds_t::non_const_type(); + } + switch(algo) + { + case MIS2_QUALITY: + { + Impl::D2_MIS_ECL mis(rowmap, colinds); + return mis.compute(); + } + case MIS2_FAST: + { + Impl::D2_MIS_Luby mis(rowmap, colinds); + return mis.compute(); + } + } + throw std::invalid_argument("graph_d2_mis: invalid algorithm"); +} + +} // end namespace Experimental +} // end namespace KokkosGraph + +#endif diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp new file mode 100644 index 0000000000..4c54a459f8 --- /dev/null +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -0,0 +1,737 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "Kokkos_Bitset.hpp" +#include "KokkosKernels_SparseUtils.hpp" +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +/* + * 100% asynchronous algorithm ideas: + * -For each row in team worklist: + * -Determine if any neighboring columns are OUT_SET, as well as whether all col statuses match my row status exactly + * -If any neighbors are OUT_SET: + * -Mark row permanently as OUT_SET. + * -Mark all neighboring columns for status update, since their minimum status may now have increased. + * -If all neighbor statuses match this row's status, mark this row permanently as IN_SET. Then mark all neighboring columns as OUT_SET. + * -Process all pending column updates (atomic_maxing the status with new one, if multiple threads may get the same column) + * + * -Invariants: + * -Row status changes exactly once (to either IN_SET or OUT_SET). After this, it never needs to be proccessed again. + * -Col status can change multiple times, but it can only increase (up to OUT_SET) + * -Therefore, when a column is updated, it converges to the true minimum status over rows + * + * What if a row R 2 hops away becomes IN_SET, and this row doesn't observe the columns changing to OUT_SET? + * -It's OK, since at no time can this row observe a mutual neighbor exactly matching its status. It will match R's status, and then it will be OUT_SET). + * What if a column's updated status is based on out of date information? + * -The minimum is computed as: any are IN_SET? 
OUT_SET : min(neighbors) + * -This quantity may only increase, since rows can only change to IN_SET or OUT_SET, and in either case it increases + * -So it's OK, since if it's out of date, it can only be _lower_ than it should be, never allowing a vertex to become IN_SET that shouldn't + * + * Problem still to solve: with priorities chosen only once, will still converge slowly. Need a way to have teams working + * independently, but still have globally consistent rounds where row statuses change. + */ + +template +struct D2_MIS_Luby +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_mem = typename team_pol::member_type; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. 
+ // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_Luby(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + allWorklists = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); + } + + struct RefreshRowStatus + { + RefreshRowStatus(const status_view_t& rowStatus_, const lno_view_t& worklist_, lno_t nvBits_, int round) + : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) + { + hashedRound = xorshiftHash(round); + } + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Combine vertex and round to get some pseudorandom priority bits that change each round + status_t priority = xorshiftHash(i + hashedRound); + //Generate unique status per row, with IN_SET < status < OUT_SET, + int priorityBits = sizeof(status_t) * 8 - nvBits; + status_t priorityMask = 1; + priorityMask <<= priorityBits; + priorityMask--; + status_t newStatus = (status_t) (i + 1) + ((priority & priorityMask) << nvBits); + if(newStatus == OUT_SET) + newStatus--; + rowStatus(i) = newStatus; + } + + status_view_t rowStatus; + lno_view_t worklist; + int nvBits; + uint32_t hashedRound; + }; + + 
struct RefreshColStatus + { + RefreshColStatus(const status_view_t& colStatus_, const lno_view_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //iterate over {i} union the neighbors of i (skipping columns >= nv), to find + //minimum status. + status_t s = OUT_SET; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + } + + status_view_t colStatus; + lno_view_t worklist; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct DecideSetFunctor + { + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? 
i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_view_t worklist; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const lno_view_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + lno_view_t worklist; + }; + + struct CompactWorklistFunctor + { + CompactWorklistFunctor(const lno_view_t& src_, const lno_view_t& dst_, const status_view_t& status_) + : src(src_), dst(dst_), status(status_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const + { + lno_t i = src(w); + status_t s = status(i); + if(s != IN_SET && s != OUT_SET) + { + //next worklist needs to contain i + if(finalPass) + dst(lNumInSet) = i; + lNumInSet++; + } + } + + 
lno_view_t src; + lno_view_t dst; + status_view_t status; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + lno_view_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); + lno_view_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); + lno_view_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; + while(true) + { + //Compute new row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + //Compute new col statuses + Kokkos::parallel_for(range_pol(0, colWorkLen), RefreshColStatus(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts)); + //Decide row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), DecideSetFunctor(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist)); + //Compact row worklist + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); + if(rowWorkLen == 0) + break; + std::swap(rowWorklist, thirdWorklist); + //Compact col worklist + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), colWorkLen); + std::swap(colWorklist, thirdWorklist); + round++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. 
+ lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + Kokkos::View allWorklists; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; + lno_t minDegree; + lno_t maxDegree; +}; + +template +struct D2_MIS_ECL +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. 
+ // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_ECL(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), + worklist1(Kokkos::ViewAllocateWithoutInitializing("WL1"), numVerts), + worklist2(Kokkos::ViewAllocateWithoutInitializing("WL2"), numVerts) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); + //Compute row statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitRowStatus(rowStatus, rowmap, numVerts, nvBits, minDegree, maxDegree)); + //Compute col statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); + } + + struct InitRowStatus + { + InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, lno_t nv_, lno_t nvBits_, lno_t minDeg_, lno_t maxDeg_) + : rowStatus(rowStatus_), rowmap(rowmap_), nv(nv_), nvBits(nvBits_), minDeg(minDeg_), maxDeg(maxDeg_), invDegRange(maxDeg > minDeg ? 1.f / (maxDeg - minDeg) : 0.f) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //Generate unique status per row, with IN_SET < status < OUT_SET, + int degBits = sizeof(status_t) * 8 - nvBits; + if(degBits == 0) + { + //no 
space to store degree information. Algorithm will still work but will + //probably produce a lower quality MIS. + rowStatus(i) = i + 1; + return; + } + status_t maxDegRange = (((status_t) 1) << degBits) - 2; + lno_t deg = rowmap(i + 1) - rowmap(i); + float degScore = (float) (deg - minDeg) * invDegRange; + rowStatus(i) = (status_t) (i + 1) + (((status_t) (degScore * maxDegRange)) << nvBits); + } + + status_view_t rowStatus; + rowmap_t rowmap; + lno_t nv; + int nvBits; + lno_t minDeg; + lno_t maxDeg; + float invDegRange; + }; + + struct InitColStatus + { + InitColStatus(const status_view_t& colStatus_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : colStatus(colStatus_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = rowStatus(i); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != i && nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + colStatus(i) = s; + } + + status_view_t colStatus; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct IterateStatusFunctor + { + IterateStatusFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_, const bitset_t& colUpdateBitset_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), colUpdateBitset(colUpdateBitset_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. 
+ status_t s = rowStatus(i); + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + bool statusChanged = neiOut || !neiMismatchS; + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i which have status s. + //This will increase the minimum to the next smallest row, + //so that another nearby vertex can be added to the set. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + rowStatus(i) = IN_SET; + } + if(statusChanged) + { + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei < nv && colStatus(nei) == s) + colUpdateBitset.set(nei); + } + } + //else: still undecided + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_view_t worklist; + bitset_t colUpdateBitset; + }; + + struct UpdateWorklistFunctor + { + UpdateWorklistFunctor(const status_view_t& rowStatus_, const lno_view_t& oldWorklist_, const lno_view_t& newWorklist_) + : rowStatus(rowStatus_), oldWorklist(oldWorklist_), newWorklist(newWorklist_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, bool finalPass) const + { + //processing row i + lno_t i = oldWorklist(w); + //Bit i will be set when it's decided IN_SET/OUT_SET. + //If clear, vertex i needs to be processed still. 
+ status_t s = rowStatus(i); + if(s != IN_SET && s != OUT_SET) + { + if(finalPass) + newWorklist(lcount) = i; + lcount++; + } + } + + status_view_t rowStatus; + lno_view_t oldWorklist; + lno_view_t newWorklist; + }; + + struct ColRefreshWorklist + { + ColRefreshWorklist(const bitset_t& colUpdateBitset_, const lno_view_t& refreshList_) + : colUpdateBitset(colUpdateBitset_), refreshList(refreshList_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, bool finalPass) const + { + if(colUpdateBitset.test(i)) + { + if(finalPass) + { + refreshList(lindex) = i; + colUpdateBitset.reset(i); + } + lindex++; + } + } + + bitset_t colUpdateBitset; + lno_view_t refreshList; + }; + + struct RefreshColStatus + { + RefreshColStatus(const lno_view_t& worklist_, const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : worklist(worklist_), rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t col = worklist(w); + status_t minNeiStat = OUT_SET; + size_type rowBegin = rowmap(col); + size_type rowEnd = rowmap(col + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? 
col : entries(j); + if(nei >= nv) + continue; + status_t neiStat = rowStatus(nei); + if(neiStat < minNeiStat) + minNeiStat = neiStat; + } + if(minNeiStat == IN_SET) + minNeiStat = OUT_SET; + colStatus(col) = minNeiStat; + } + + lno_view_t worklist; + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const lno_view_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + lno_view_t worklist; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); + lno_t workRemain = numVerts; + int numIter = 0; + while(workRemain) + { + //do another iteration + Kokkos::parallel_for(range_pol(0, workRemain), + IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, numVerts, worklist1, colUpdateBitset)); + //And refresh the column statuses using the other worklist. 
+ lno_t colsToRefresh; + Kokkos::parallel_scan(range_pol(0, numVerts), + ColRefreshWorklist(colUpdateBitset, worklist2), colsToRefresh); + Kokkos::parallel_for(range_pol(0, colsToRefresh), + RefreshColStatus(worklist2, rowStatus, colStatus, rowmap, entries, numVerts)); + //then build the next worklist with a scan. Also get the length of the next worklist. + lno_t newWorkRemain = 0; + Kokkos::parallel_scan(range_pol(0, workRemain), + UpdateWorklistFunctor(rowStatus, worklist1, worklist2), + newWorkRemain); + //Finally, flip the worklists + std::swap(worklist1, worklist2); + workRemain = newWorkRemain; + numIter++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. + lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; + lno_t minDegree; + lno_t maxDegree; + //Bitset representing columns whose status needs to be recomputed + //These bits are cleared after each refresh. 
+ bitset_t colUpdateBitset; + lno_view_t worklist1; + lno_view_t worklist2; +}; + +}}} + +#endif diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index f2cdee87bb..9671c2339d 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -782,7 +782,10 @@ namespace KokkosSparse{ raw_sym_adj = raw_colinds_t(sym_adj.data(), sym_adj.extent(0)); } nnz_view_t vertClusters; - auto clusterAlgo = gsHandle->get_clustering_algo(); + //auto clusterAlgo = gsHandle->get_clustering_algo(); + BalloonClustering balloon(num_rows, raw_sym_xadj, raw_sym_adj); + vertClusters = balloon.run(clusterSize); + /* if(clusterAlgo == CLUSTER_DEFAULT) clusterAlgo = CLUSTER_BALLOON; switch(clusterAlgo) @@ -814,6 +817,7 @@ namespace KokkosSparse{ default: throw std::runtime_error("Clustering algo " + std::to_string((int) clusterAlgo) + " is not implemented"); } + */ #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index ddfcb70f92..86c34a1eee 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -105,6 +105,8 @@ struct RCM typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::Device device_t; + typedef Kokkos::RangePolicy range_policy_t ; typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; @@ -119,30 +121,6 @@ struct RCM const_lno_row_view_t rowmap; const_lno_nnz_view_t colinds; - template - struct MaxDegreeFunctor - { - typedef typename std::remove_cv::type size_type; - MaxDegreeFunctor(Rowmap& rowmap_) : r(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, size_type& lmax) const - { - size_type ideg = r(i + 1) 
- r(i); - if(ideg > lmax) - lmax = ideg; - } - Rowmap r; - }; - - //simple parallel reduction to find max degree in graph - size_type find_max_degree() - { - size_type maxDeg = 0; - Kokkos::parallel_reduce(range_policy_t(0, numRows), MaxDegreeFunctor(rowmap), Kokkos::Max(maxDeg)); - //max degree should be computed as an offset_t, - //but must fit in a nnz_lno_t - return maxDeg; - } - //radix sort keys according to their corresponding values ascending. //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting template @@ -401,7 +379,7 @@ struct RCM nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) { //need to know maximum degree to allocate scratch space for threads - maxDeg = find_max_degree(); + maxDeg = KokkosKernels::Impl::graph_max_degree(rowmap); //view for storing the visit timestamps nnz_view_t visit("BFS visited nodes", numRows); const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); @@ -532,8 +510,10 @@ struct RCM nnz_lno_t maxDegree = 0; //parallel_bfs will compute maxDegree auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - nnz_lno_t maxLevelSize = 0; - Kokkos::parallel_reduce(range_policy_t(0, numLevels), MaxDegreeFunctor(xadj), Kokkos::Max(maxLevelSize)); + //xadj determines where each level set starts and begins, + //so its max 'degree' gives the size of the largest level + nnz_lno_t maxLevelSize = KokkosKernels::Impl::graph_max_degree(xadj); + std::cout << "Maximum size of a level set: " << maxLevelSize << '\n'; //visit (to be returned) contains the RCM numberings of each row nnz_view_t visit("RCM labels", numRows); //Populate visit wth LNO_MAX so that the "min-labeled neighbor" diff --git a/unit_test/cuda/Test_Cuda_Graph_mis2.cpp b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp new file mode 100644 index 0000000000..00148fd653 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 7dac558bff..cc3931083b 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_Distance2Color.hpp" +#include "KokkosGraph_MIS2.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp new file mode 100644 index 0000000000..ca0d801002 --- /dev/null +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -0,0 +1,190 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +#include "KokkosGraph_MIS2.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" + +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +using namespace KokkosGraph; +using namespace KokkosGraph::Experimental; + +namespace Test { + +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. 
+ bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} +} + +template +void test_dist2_mis(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_QUALITY}; + for(auto algo : algos) + { + auto mis = graph_d2_mis(symRowmap, symEntries, algo); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + bool success = Test::verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost); + EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int) algo << ") produced invalid set."; + } +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_dist2_mis(5000, 5000 * 20, 1000, 10); \ + test_dist2_mis(50, 50 * 10, 40, 10); \ + test_dist2_mis(5, 5 * 3, 5, 0); \ + } + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif +#endif diff --git a/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp new file mode 100644 index 0000000000..8622411b0f --- /dev/null +++ 
b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Graph_mis2.cpp b/unit_test/serial/Test_Serial_Graph_mis2.cpp new file mode 100644 index 0000000000..38db82cfc9 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 5645f1d2ce..a8d7d46a40 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -81,7 +81,6 @@ int run_gauss_seidel( bool is_symmetric_graph, int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. int cluster_size = 1, - ClusteringAlgorithm cluster_algorithm = CLUSTER_DEFAULT, bool classic = false) // only with two-stage, true for sptrsv instead of richardson { typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -101,7 +100,7 @@ int run_gauss_seidel( kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); if(gs_algorithm == GS_CLUSTER) - kh.create_gs_handle(cluster_algorithm, cluster_size); + kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, cluster_size); else if(gs_algorithm == GS_TWOSTAGE) { // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); @@ -284,19 +283,16 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ int clusterSizes[3] = {2, 5, 34}; for(int csize = 0; csize < 3; csize++) { - for(int algo = 0; algo < (int) NUM_CLUSTERING_ALGORITHMS; algo++) + for(int apply_type = 0; apply_type < apply_count; ++apply_type) { - for(int apply_type = 0; apply_type < apply_count; ++apply_type) - { - Kokkos::Impl::Timer timer1; - //Zero out X before solving - Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel( - input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], (ClusteringAlgorithm) algo); - KokkosBlas::axpby(one, solution_x, -one, x_vector); - mag_t 
result_norm_res = KokkosBlas::nrm2(x_vector); - EXPECT_LT(result_norm_res, initial_norm_res); - } + Kokkos::Impl::Timer timer1; + //Zero out X before solving + Kokkos::deep_copy(x_vector, zero); + run_gauss_seidel( + input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize]); + KokkosBlas::axpby(one, solution_x, -one, x_vector); + mag_t result_norm_res = KokkosBlas::nrm2(x_vector); + EXPECT_LT(result_norm_res, initial_norm_res); } } //*** Two-stage version **** @@ -312,10 +308,9 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ //*** Two-stage version (classic) **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -435,10 +430,9 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ for(int apply_type = 0; apply_type < apply_count; ++apply_type) { //Zero out X before solving - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -455,43 +449,6 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } } -template -void test_rcm(lno_t numRows, size_type nnzPerRow, lno_t bandwidth) -{ - using namespace Test; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef typename 
crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_row_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef KokkosKernelsHandle - KernelHandle; - srand(245); - size_type nnzTotal = nnzPerRow * numRows; - lno_t nnzVariance = nnzPerRow / 4; - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numRows, nnzTotal, nnzVariance, bandwidth); - lno_row_view_t symRowmap; - lno_nnz_view_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numRows, A.graph.row_map, A.graph.entries, symRowmap, symEntries); - typedef KokkosSparse::Impl::RCM rcm_t; - rcm_t rcm(numRows, symRowmap, symEntries); - lno_nnz_view_t rcmOrder = rcm.rcm(); - //perm(i) = the node with timestamp i - //make sure that perm is in fact a permutation matrix (contains each row exactly once) - Kokkos::View rcmHost("RCM row ordering", numRows); - Kokkos::deep_copy(rcmHost, rcmOrder); - std::set rowSet; - for(lno_t i = 0; i < numRows; i++) - rowSet.insert(rcmHost(i)); - if((lno_t) rowSet.size() != numRows) - { - std::cerr << "Only got back " << rowSet.size() << " unique row IDs!\n"; - return; - } -} - template void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { const scalar_t zero = Kokkos::Details::ArithTraits::zero(); @@ -659,9 +616,6 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR TEST_F( TestCategory, sparse ## _ ## gauss_seidel_zero_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_sgs_zero_rows(); \ } \ -TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_rcm(10000, 50, 2000); \ -} \ TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_balloon_clustering(5000, 100, 2000); \ } \ diff --git a/unit_test/threads/Test_Threads_Graph_mis2.cpp 
b/unit_test/threads/Test_Threads_Graph_mis2.cpp new file mode 100644 index 0000000000..cbf15a7662 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_mis2.cpp @@ -0,0 +1,3 @@ +#include +#include + From 27017573646ff8fec103089733f16021efc5eaa6 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 31 Aug 2020 21:33:04 -0600 Subject: [PATCH 012/106] Added MIS2 to wiki example for coloring --- .../wiki/graph/KokkosGraph_wiki_coloring.cpp | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index 7e561f5883..cd86e1741c 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -7,6 +8,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosGraph_Distance2Color.hpp" +#include "KokkosGraph_MIS2.hpp" //Greedy Graph Coloring // -Generate the graph for a rectangular grid, with a 9-point stencil @@ -70,6 +72,28 @@ namespace ColoringDemo } } + template + void printMIS(MISView misList) + { + //Read colors on host + auto misHost = Kokkos::create_mirror_view_and_copy(HostSpace(), misList); + std::set mis; + for(Offset i = 0; i < (Offset) misList.extent(0); i++) + mis.insert(misHost(i)); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + if(mis.find(vertex) == mis.end()) + printf(". "); + else + printf("# "); + } + putchar('\n'); + } + } + //Build the graph on host, allocate these views on device and copy the graph to them. //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. 
void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) @@ -157,6 +181,15 @@ int main(int argc, char* argv[]) //Clean up handle.destroy_distance2_graph_coloring_handle(); } + //Step 4: Run distance-2 MIS. + { + //Run coloring + auto misDevice = KokkosGraph::Experimental::graph_d2_mis(rowmapDevice, colindsDevice); + std::cout << "9-pt stencil: Distance-2 MIS: contains " + << misDevice.extent(0) << " out of " << ColoringDemo::numVertices << " vertices.\n"; + ColoringDemo::printMIS(misDevice); + putchar('\n'); + } } Kokkos::finalize(); return 0; From c0e816e9f20b8647de68d9438928d59e78c6f9d9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 31 Aug 2020 21:33:26 -0600 Subject: [PATCH 013/106] Added MIS2 verification, if verbose enabled --- perf_test/graph/KokkosGraph_mis_d2.cpp | 71 ++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index 52e680e855..da9fb549d6 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -79,6 +80,67 @@ struct MIS2Parameters MIS2_Algorithm algo = MIS2_FAST; }; +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. 
+ bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} + void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) { std::string spaces(indent, ' '); @@ -245,6 +307,15 @@ void run_mis2(const MIS2Parameters& params) { std::cout << "Vertices in independent set:\n"; KokkosKernels::Impl::print_1Dview(mis); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + if(verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost)) + std::cout << "MIS-2 is correct.\n"; + else + std::cout << "*** MIS-2 not correct! ***\n"; } } From a3ed1b3959de5d211f44b48095326813c979e36e Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 14:24:13 -0600 Subject: [PATCH 014/106] Added MIS2_BALANCED algorithm Much closer to FAST than QUALITY in run time, but higher quality than FAST. Almost as good as QUALITY on some graphs. 
--- example/wiki/graph/CMakeLists.txt | 5 + .../graph/KokkosGraph_wiki_9pt_stencil.hpp | 133 +++++++ .../wiki/graph/KokkosGraph_wiki_coloring.cpp | 147 +------- example/wiki/graph/KokkosGraph_wiki_mis2.cpp | 40 +++ perf_test/graph/KokkosGraph_mis_d2.cpp | 2 + src/graph/KokkosGraph_MIS2.hpp | 12 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 330 +++++++++++++++++- unit_test/graph/Test_Graph_mis2.hpp | 2 +- 8 files changed, 521 insertions(+), 150 deletions(-) create mode 100644 example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp create mode 100644 example/wiki/graph/KokkosGraph_wiki_mis2.cpp diff --git a/example/wiki/graph/CMakeLists.txt b/example/wiki/graph/CMakeLists.txt index a8ddec070d..25175ec08e 100644 --- a/example/wiki/graph/CMakeLists.txt +++ b/example/wiki/graph/CMakeLists.txt @@ -8,3 +8,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES KokkosGraph_wiki_coloring.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_mis2 + SOURCES KokkosGraph_wiki_mis2.cpp + ) + diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp new file mode 100644 index 0000000000..4561300dea --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -0,0 +1,133 @@ +#ifndef WIKI_9PT_STENCIL_H +#define WIKI_9PT_STENCIL_H + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_default_types.hpp" +#include "KokkosKernels_Handle.hpp" +#include +#include +#include +#include +#include + +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using DeviceSpace = typename ExecSpace::memory_space; +using Kokkos::HostSpace; +using RowmapType = Kokkos::View; +using ColindsType = Kokkos::View; +using Handle = KokkosKernels::Experimental:: + KokkosKernelsHandle; + +namespace GraphDemo +{ + constexpr Ordinal gridX = 15; + constexpr Ordinal gridY = 25; + constexpr Ordinal numVertices = gridX * gridY; + + //Helper to get the 
vertex ID given grid coordinates + Ordinal getVertexID(Ordinal x, Ordinal y) + { + return y * gridX + x; + } + + //Inverse of getVertexID + void getVertexPos(Ordinal vert, Ordinal& x, Ordinal& y) + { + x = vert % gridX; + y = vert / gridX; + } + + //Helper to print out colors in the shape of the grid + template + void printColoring(ColorView colors, Ordinal numColors) + { + //Read colors on host + auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); + int numDigits = ceil(log10(numColors + 1)); + //Print out the grid, with columns aligned and at least one space between numbers + std::ostringstream numFmtStream; + numFmtStream << '%' << numDigits + 1 << 'd'; + std::string numFmt = numFmtStream.str(); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + int color = colorsHost(vertex); + printf(numFmt.c_str(), color); + } + putchar('\n'); + } + } + + template + void printMIS(MISView misList) + { + //Read colors on host + auto misHost = Kokkos::create_mirror_view_and_copy(HostSpace(), misList); + std::set mis; + for(Offset i = 0; i < (Offset) misList.extent(0); i++) + mis.insert(misHost(i)); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + if(mis.find(vertex) == mis.end()) + printf(". "); + else + printf("# "); + } + putchar('\n'); + } + } + + //Build the graph on host, allocate these views on device and copy the graph to them. + //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. 
+ void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) + { + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector colinds; + rowmap[0] = 0; + for(Ordinal vert = 0; vert < numVertices; vert++) + { + Ordinal x, y; + getVertexPos(vert, x, y); + //Loop over the neighbors in a 3x3 region + for(Ordinal ny = y - 1; ny <= y + 1; ny++) + { + for(Ordinal nx = x - 1; nx <= x + 1; nx++) + { + //exclude the edge to self + if(nx == x && ny == y) + continue; + //exclude vertices that would be outside the grid + if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) + continue; + //add the neighbor to colinds, forming an edge + colinds.push_back(getVertexID(nx, ny)); + } + } + //mark where the current row ends + rowmap[vert + 1] = colinds.size(); + } + Offset numEdges = colinds.size(); + //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> colindsHost(colinds.data(), numEdges); + //Allocate owning views on device with the correct size. 
+ rowmapDevice = RowmapType("Rowmap", numVertices + 1); + colindsDevice = ColindsType("Colinds", numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapDevice, rowmapHost); + Kokkos::deep_copy(colindsDevice, colindsHost); + } +} + +#endif diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index cd86e1741c..56639dad3a 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -1,14 +1,6 @@ -#include -#include -#include -#include -#include -#include "Kokkos_Core.hpp" -#include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_Handle.hpp" +#include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosGraph_Distance2Color.hpp" -#include "KokkosGraph_MIS2.hpp" //Greedy Graph Coloring // -Generate the graph for a rectangular grid, with a 9-point stencil @@ -19,136 +11,16 @@ // -Different constraint: two vertices separated by a path of length 1 OR 2 // must have different colors) -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; -using ExecSpace = Kokkos::DefaultExecutionSpace; -using DeviceSpace = typename ExecSpace::memory_space; -using Kokkos::HostSpace; -using RowmapType = Kokkos::View; -using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental:: - KokkosKernelsHandle; - -namespace ColoringDemo -{ - constexpr Ordinal gridX = 15; - constexpr Ordinal gridY = 25; - constexpr Ordinal numVertices = gridX * gridY; - - //Helper to get the vertex ID given grid coordinates - Ordinal getVertexID(Ordinal x, Ordinal y) - { - return y * gridX + x; - } - - //Inverse of getVertexID - void getVertexPos(Ordinal vert, Ordinal& x, Ordinal& y) - { - x = vert % gridX; - y = vert / gridX; - } - - //Helper to print out colors in the shape of the grid - template - void printColoring(ColorView colors, Ordinal numColors) - { - //Read 
colors on host - auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); - int numDigits = ceil(log10(numColors + 1)); - //Print out the grid, with columns aligned and at least one space between numbers - std::ostringstream numFmtStream; - numFmtStream << '%' << numDigits + 1 << 'd'; - std::string numFmt = numFmtStream.str(); - for(Ordinal y = 0; y < gridY; y++) - { - for(Ordinal x = 0; x < gridX; x++) - { - Ordinal vertex = getVertexID(x, y); - int color = colorsHost(vertex); - printf(numFmt.c_str(), color); - } - putchar('\n'); - } - } - - template - void printMIS(MISView misList) - { - //Read colors on host - auto misHost = Kokkos::create_mirror_view_and_copy(HostSpace(), misList); - std::set mis; - for(Offset i = 0; i < (Offset) misList.extent(0); i++) - mis.insert(misHost(i)); - for(Ordinal y = 0; y < gridY; y++) - { - for(Ordinal x = 0; x < gridX; x++) - { - Ordinal vertex = getVertexID(x, y); - if(mis.find(vertex) == mis.end()) - printf(". "); - else - printf("# "); - } - putchar('\n'); - } - } - - //Build the graph on host, allocate these views on device and copy the graph to them. - //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. 
- void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) - { - //Generate the graph on host (use std::vector to not need to know - //how many entries ahead of time) - std::vector rowmap(numVertices + 1); - std::vector colinds; - rowmap[0] = 0; - for(Ordinal vert = 0; vert < numVertices; vert++) - { - Ordinal x, y; - getVertexPos(vert, x, y); - //Loop over the neighbors in a 3x3 region - for(Ordinal ny = y - 1; ny <= y + 1; ny++) - { - for(Ordinal nx = x - 1; nx <= x + 1; nx++) - { - //exclude the edge to self - if(nx == x && ny == y) - continue; - //exclude vertices that would be outside the grid - if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) - continue; - //add the neighbor to colinds, forming an edge - colinds.push_back(getVertexID(nx, ny)); - } - } - //mark where the current row ends - rowmap[vert + 1] = colinds.size(); - } - Offset numEdges = colinds.size(); - //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory - //The nonowning host views just alias the std::vectors. - Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> colindsHost(colinds.data(), numEdges); - //Allocate owning views on device with the correct size. - rowmapDevice = RowmapType("Rowmap", numVertices + 1); - colindsDevice = ColindsType("Colinds", numEdges); - //Copy the graph from host to device - Kokkos::deep_copy(rowmapDevice, rowmapHost); - Kokkos::deep_copy(colindsDevice, colindsHost); - } -} - int main(int argc, char* argv[]) { Kokkos::initialize(); { - using ColoringDemo::numVertices; + using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; //Step 1: Generate the graph on host, allocate space on device, and copy. //See function "generate9pt" below. - ColoringDemo::generate9pt(rowmapDevice, colindsDevice); + GraphDemo::generate9pt(rowmapDevice, colindsDevice); //Step 2: Create handle and run distance-1 coloring. 
{ Handle handle; @@ -160,7 +32,7 @@ int main(int argc, char* argv[]) auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-1 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_graph_coloring_handle(); @@ -176,20 +48,11 @@ int main(int argc, char* argv[]) auto colors = handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_distance2_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-2 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_distance2_graph_coloring_handle(); } - //Step 4: Run distance-2 MIS. - { - //Run coloring - auto misDevice = KokkosGraph::Experimental::graph_d2_mis(rowmapDevice, colindsDevice); - std::cout << "9-pt stencil: Distance-2 MIS: contains " - << misDevice.extent(0) << " out of " << ColoringDemo::numVertices << " vertices.\n"; - ColoringDemo::printMIS(misDevice); - putchar('\n'); - } } Kokkos::finalize(); return 0; diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp new file mode 100644 index 0000000000..c158231282 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -0,0 +1,40 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run distance-2 MIS and print the results, with three different algorithms + { + //Run coloring + auto misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); + std::cout << "Distance-2 MIS, FAST algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_BALANCED); + std::cout << "Distance-2 MIS, BALANCED algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); + std::cout << "Distance-2 MIS, QUALITY algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index da9fb549d6..1ae68fff00 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -226,6 +226,8 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) params.algo = MIS2_FAST; else if(!strcasecmp(algName, "quality")) params.algo = MIS2_QUALITY; + else if(!strcasecmp(algName, "balanced")) + params.algo = MIS2_BALANCED; else throw std::invalid_argument("Algorithm not valid: must be 'fast' or 'quality'"); } diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index 98908c40a9..5cb6e8a4d0 100644 --- a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -52,7 +52,8 @@ namespace KokkosGraph{ enum MIS2_Algorithm { MIS2_QUALITY, - MIS2_FAST + MIS2_FAST, + 
MIS2_BALANCED }; namespace Experimental{ @@ -75,12 +76,17 @@ graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm al { case MIS2_QUALITY: { - Impl::D2_MIS_ECL mis(rowmap, colinds); + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); return mis.compute(); } case MIS2_FAST: { - Impl::D2_MIS_Luby mis(rowmap, colinds); + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); + return mis.compute(); + } + case MIS2_BALANCED: + { + Impl::D2_MIS_BlendedPriority mis(rowmap, colinds); return mis.compute(); } } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 4c54a459f8..69a9095f29 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -81,7 +81,7 @@ namespace Impl { */ template -struct D2_MIS_Luby +struct D2_MIS_RandomPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -115,7 +115,7 @@ struct D2_MIS_Luby static constexpr status_t IN_SET = 0; static constexpr status_t OUT_SET = ~IN_SET; - D2_MIS_Luby(const rowmap_t& rowmap_, const entries_t& entries_) + D2_MIS_RandomPriority(const rowmap_t& rowmap_, const entries_t& entries_) : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) { status_t i = numVerts + 1; @@ -375,7 +375,329 @@ struct D2_MIS_Luby }; template -struct D2_MIS_ECL +struct D2_MIS_BlendedPriority +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. 
+ using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_mem = typename team_pol::member_type; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. + // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_BlendedPriority(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. 
+ // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + allWorklists = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); + } + + struct RefreshRowStatus + { + RefreshRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, const lno_view_t& worklist_, lno_t nvBits_, int round, float k_, lno_t minDeg_, lno_t maxDeg_) + : rowStatus(rowStatus_), rowmap(rowmap_), worklist(worklist_), nvBits(nvBits_), k(k_), minDeg(minDeg_) + { + hashedRound = xorshiftHash(round); + if(maxDeg_ == minDeg_) + { + //section doesn't matter at all + invDegRange = 1.0f; + } + else + { + invDegRange = 1.0f / (maxDeg_ - minDeg_); + } + } + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + int degBits = sizeof(status_t) * 8 - nvBits; + if(degBits == 0) + { + //no space to store degree information. Algorithm will still work but will + //probably produce a lower quality MIS. 
+ rowStatus(i) = i + 1; + return; + } + //Combine vertex and round to get some pseudorandom priority bits that change each round + status_t maxDegRange = (((status_t) 1) << degBits) - 2; + lno_t deg = rowmap(i + 1) - rowmap(i); + //Compute degree-based score and random score + float degScore = (float) (deg - minDeg) * invDegRange; + float randScore = (xorshiftHash(i + hashedRound) & 0xFFFF) / 65536.f; + //Then linearly interpolate using k + float finalScore = k * randScore + (1.f - k) * degScore; + rowStatus(i) = (status_t) (i + 1) + (((status_t) (finalScore * maxDegRange)) << nvBits); + } + + status_view_t rowStatus; + rowmap_t rowmap; + lno_view_t worklist; + int nvBits; + uint32_t hashedRound; + float k; + lno_t minDeg; + float invDegRange; + }; + + struct RefreshColStatus + { + RefreshColStatus(const status_view_t& colStatus_, const lno_view_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = OUT_SET; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? 
i : entries(j); + if(nei <= nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + } + + status_view_t colStatus; + lno_view_t worklist; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct DecideSetFunctor + { + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. 
+ rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_view_t worklist; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const lno_view_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + lno_view_t worklist; + }; + + struct CompactWorklistFunctor + { + CompactWorklistFunctor(const lno_view_t& src_, const lno_view_t& dst_, const status_view_t& status_) + : src(src_), dst(dst_), status(status_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const + { + lno_t i = src(w); + status_t s = status(i); + if(s != IN_SET && s != OUT_SET) + { + //next worklist needs to contain i + if(finalPass) + dst(lNumInSet) = i; + lNumInSet++; + } + } + + lno_view_t src; + lno_view_t dst; + status_view_t status; + }; + + lno_view_t compute() + { + //Compute min and max degree of graph + lno_t minDegree; + lno_t maxDegree; + KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); + //Initialize first worklist to 0...numVerts + lno_view_t rowWorklist 
= Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); + lno_view_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); + lno_view_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; + //k is the linear interpolation constant for priority: 0 means completely degree-based, 1 means completely random + float k = 0.f; + while(true) + { + //Compute new row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowmap, rowWorklist, nvBits, round, k * k, minDegree, maxDegree)); + //Compute new col statuses + Kokkos::parallel_for(range_pol(0, colWorkLen), RefreshColStatus(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts)); + //Decide row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), DecideSetFunctor(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist)); + //Compact row worklist + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); + if(rowWorkLen == 0) + break; + std::swap(rowWorklist, thirdWorklist); + //Compact col worklist + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), colWorkLen); + std::swap(colWorklist, thirdWorklist); + round++; + k += 0.02f; + if(k > 1.0f) + k = 1.0f; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. 
+ lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + Kokkos::View allWorklists; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; + lno_t minDegree; + lno_t maxDegree; +}; + +template +struct D2_MIS_FixedPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -398,7 +720,7 @@ struct D2_MIS_ECL static constexpr status_t IN_SET = 0; static constexpr status_t OUT_SET = ~IN_SET; - D2_MIS_ECL(const rowmap_t& rowmap_, const entries_t& entries_) + D2_MIS_FixedPriority(const rowmap_t& rowmap_, const entries_t& entries_) : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), worklist1(Kokkos::ViewAllocateWithoutInitializing("WL1"), numVerts), worklist2(Kokkos::ViewAllocateWithoutInitializing("WL2"), numVerts) diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index ca0d801002..3bd41fbf22 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -147,7 +147,7 @@ void test_dist2_mis(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_si auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); //For each algorithm, compute and verify the MIS std::vector algos - = {MIS2_FAST, MIS2_QUALITY}; + = {MIS2_FAST, MIS2_BALANCED, MIS2_QUALITY}; for(auto algo : algos) { auto mis = graph_d2_mis(symRowmap, symEntries, algo); From f3ecb30806e8a84172a15a80619551a679f9f055 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 15:12:23 
-0600 Subject: [PATCH 015/106] Added MIS-2 based coarsening Added a wiki example for it, still need to add a test --- example/wiki/graph/CMakeLists.txt | 5 + .../graph/KokkosGraph_wiki_coarsening.cpp | 28 +++++ src/graph/KokkosGraph_MIS2.hpp | 15 +++ .../impl/KokkosGraph_Distance2MIS_impl.hpp | 118 ++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 example/wiki/graph/KokkosGraph_wiki_coarsening.cpp diff --git a/example/wiki/graph/CMakeLists.txt b/example/wiki/graph/CMakeLists.txt index 25175ec08e..f1122958c2 100644 --- a/example/wiki/graph/CMakeLists.txt +++ b/example/wiki/graph/CMakeLists.txt @@ -13,3 +13,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES KokkosGraph_wiki_mis2.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_coarsening + SOURCES KokkosGraph_wiki_coarsening.cpp + ) + diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp new file mode 100644 index 0000000000..e27ab47f44 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -0,0 +1,28 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run MIS-2 based coarsening and print the result + { + std::cout << "Coarsened vertex labels:\n"; + Ordinal numClusters = 0; + auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( + rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_BALANCED); + //coarsening labels can be printed in the same way as colors + GraphDemo::printColoring(labels, numClusters); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index 5cb6e8a4d0..5636974db5 100644 --- a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -93,6 +93,21 @@ graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm al throw std::invalid_argument("graph_d2_mis: invalid algorithm"); } +template +typename colinds_t::non_const_type +graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename colinds_t::non_const_value_type& numClusters, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //there are no vertices to label + return typename colinds_t::non_const_type(); + } + auto mis2 = graph_d2_mis(rowmap, colinds, algo); + numClusters = mis2.extent(0); + Impl::D2_MIS_Coarsening coarsening(rowmap, colinds, mis2); + return coarsening.compute(); +} + } // end namespace Experimental } // end namespace KokkosGraph diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 69a9095f29..b4279ec8ea 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -1054,6 +1054,124 @@ struct D2_MIS_FixedPriority lno_view_t worklist2; }; +template +struct D2_MIS_Coarsening +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using 
size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), + numVerts(rowmap.extent(0) - 1), + labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts), + clusterSizes(Kokkos::ViewAllocateWithoutInitializing("Cluster Sizes"), mis2.extent(0)) + { + Kokkos::deep_copy(labels, (lno_t) -1); + } + + //Phase 1 (over 0...numClusters) labels roots and immediate neighbors of roots. + struct Phase1Functor + { + Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_, const lno_view_t& clusterSizes_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_), clusterSizes(clusterSizes_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + lno_t root = mis2(i); + size_type rowBegin = rowmap(root); + size_type rowEnd = rowmap(root + 1); + clusterSizes(i) = (rowEnd - rowBegin); + labels(root) = i; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != root && nei < numVerts) + { + labels(nei) = i; + } + } + } + + rowmap_t rowmap; + entries_t entries; + lno_view_t mis2; + lno_t numVerts; + lno_view_t labels; + lno_view_t clusterSizes; + }; + + //Phase 2 (over 0...numVerts) joins unlabeled vertices to the smallest adjacent cluster + struct Phase2Functor + { + Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_, const lno_view_t& clusterSizes_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), 
numVerts(numVerts_), labels(labels_), clusterSizes(clusterSizes_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + if(labels(i) != (lno_t) -1) + return; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + //smallest cluster and + lno_t cluster = -1; + lno_t clusterSize = numVerts + 1; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei == i || nei >= numVerts) + continue; + lno_t neiCluster = labels(nei); + if(neiCluster != -1 && neiCluster != cluster) + { + //check if this cluster is smaller + lno_t neiClusterSize = clusterSizes(neiCluster); + if(neiClusterSize < clusterSize) + { + cluster = neiCluster; + clusterSize = neiClusterSize; + } + } + } + labels(i) = cluster; + } + + rowmap_t rowmap; + entries_t entries; + lno_view_t mis2; + lno_t numVerts; + lno_view_t labels; + lno_view_t clusterSizes; + }; + + lno_view_t compute() + { + lno_t numClusters = mis2.extent(0); + Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels, clusterSizes)); + Kokkos::parallel_for(range_pol(0, numVerts), Phase2Functor(rowmap, entries, mis2, numVerts, labels, clusterSizes)); + return labels; + } + + //Phase 2 joins remaining vertices to the smallest neighboring + + rowmap_t rowmap; + entries_t entries; + lno_view_t mis2; + lno_t numVerts; + lno_view_t labels; + lno_view_t clusterSizes; +}; + }}} #endif From 70c11f828f316e99d39eaa819fc4ac26c6cafd35 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 15:43:16 -0600 Subject: [PATCH 016/106] Finished MI2 coarsening test, change phase2 heuristic Not easy to keep cluster sizes up to date during phase2, and still be deterministic. Instead, it now joins leftover vertices based on a pseudorandom value that is a function of both vertex ID and neighboring cluster ID. 
--- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 39 +++++++------- unit_test/graph/Test_Graph_mis2.hpp | 54 +++++++++++++++++-- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index b4279ec8ea..d0a68c6f1f 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -1072,8 +1072,7 @@ struct D2_MIS_Coarsening D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_) : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(rowmap.extent(0) - 1), - labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts), - clusterSizes(Kokkos::ViewAllocateWithoutInitializing("Cluster Sizes"), mis2.extent(0)) + labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts) { Kokkos::deep_copy(labels, (lno_t) -1); } @@ -1081,8 +1080,8 @@ struct D2_MIS_Coarsening //Phase 1 (over 0...numClusters) labels roots and immediate neighbors of roots. 
struct Phase1Functor { - Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_, const lno_view_t& clusterSizes_) - : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_), clusterSizes(clusterSizes_) + Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const @@ -1090,7 +1089,6 @@ struct D2_MIS_Coarsening lno_t root = mis2(i); size_type rowBegin = rowmap(root); size_type rowEnd = rowmap(root + 1); - clusterSizes(i) = (rowEnd - rowBegin); labels(root) = i; for(size_type j = rowBegin; j < rowEnd; j++) { @@ -1107,14 +1105,22 @@ struct D2_MIS_Coarsening lno_view_t mis2; lno_t numVerts; lno_view_t labels; - lno_view_t clusterSizes; }; + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + //Phase 2 (over 0...numVerts) joins unlabeled vertices to the smallest adjacent cluster struct Phase2Functor { - Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_, const lno_view_t& clusterSizes_) - : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_), clusterSizes(clusterSizes_) + Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const @@ -1123,9 +1129,8 @@ struct D2_MIS_Coarsening return; size_type rowBegin = rowmap(i); size_type rowEnd = rowmap(i + 1); - //smallest cluster and lno_t cluster = -1; - lno_t clusterSize 
= numVerts + 1; + uint32_t minScore = ~(uint32_t) 0; for(size_type j = rowBegin; j < rowEnd; j++) { lno_t nei = entries(j); @@ -1135,11 +1140,11 @@ struct D2_MIS_Coarsening if(neiCluster != -1 && neiCluster != cluster) { //check if this cluster is smaller - lno_t neiClusterSize = clusterSizes(neiCluster); - if(neiClusterSize < clusterSize) + uint32_t score = xorshiftHash(i + xorshiftHash(neiCluster)); + if(score < minScore) { cluster = neiCluster; - clusterSize = neiClusterSize; + minScore = score; } } } @@ -1151,25 +1156,21 @@ struct D2_MIS_Coarsening lno_view_t mis2; lno_t numVerts; lno_view_t labels; - lno_view_t clusterSizes; }; lno_view_t compute() { lno_t numClusters = mis2.extent(0); - Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels, clusterSizes)); - Kokkos::parallel_for(range_pol(0, numVerts), Phase2Functor(rowmap, entries, mis2, numVerts, labels, clusterSizes)); + Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels)); + Kokkos::parallel_for(range_pol(0, numVerts), Phase2Functor(rowmap, entries, mis2, numVerts, labels)); return labels; } - //Phase 2 joins remaining vertices to the smallest neighboring - rowmap_t rowmap; entries_t entries; lno_view_t mis2; lno_t numVerts; lno_view_t labels; - lno_view_t clusterSizes; }; }}} diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index 3bd41fbf22..e82eabf882 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -124,7 +124,7 @@ bool verifyD2MIS( } template -void test_dist2_mis(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; using crsMat = KokkosSparse::CrsMatrix; @@ -159,12 +159,56 @@ void test_dist2_mis(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_si } 
} +template +void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_BALANCED, MIS2_QUALITY}; + for(auto algo : algos) + { + lno_t numClusters = 0; + auto labels = graph_mis2_coarsen(symRowmap, symEntries, numClusters, algo); + auto labelsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); + //Not a strong test, but sanity check the number of clusters returned + EXPECT_TRUE(numClusters >= 1 && numClusters <= numVerts); + //Check that every label is in the range [0, numClusters) + for(lno_t i = 0; i < numVerts; i++) + EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + } +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, graph##_##graph_color_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_mis2(5000, 5000 * 20, 1000, 10); \ + 
test_mis2(50, 50 * 10, 40, 10); \ + test_mis2(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ - test_dist2_mis(5000, 5000 * 20, 1000, 10); \ - test_dist2_mis(50, 50 * 10, 40, 10); \ - test_dist2_mis(5, 5 * 3, 5, 0); \ + test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ + test_mis2_coarsening(50, 50 * 10, 40, 10); \ + test_mis2_coarsening(5, 5 * 3, 5, 0); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) From b0bd2dec8f5b564b66d57f33399dd2f1a020da09 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 17:28:24 -0600 Subject: [PATCH 017/106] Enable MIS-2 based coarsening for cluster GS --- src/common/KokkosKernels_Handle.hpp | 4 +- src/graph/KokkosGraph_MIS2.hpp | 22 ++--- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 97 ++++++++++--------- .../KokkosSparse_gauss_seidel_handle.hpp | 10 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 23 ++--- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 27 +++--- 6 files changed, 87 insertions(+), 96 deletions(-) diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index d0ffa6ca85..9d43ba670c 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -609,10 +609,10 @@ class KokkosKernelsHandle } } - void create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t verts_per_cluster) { + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; - this->gsHandle = new ClusterGaussSeidelHandleType(KokkosSparse::CLUSTER_BALLOON, verts_per_cluster); + this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, hint_verts_per_cluster); } void destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index 5636974db5..4c267b434f 100644 --- 
a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -63,48 +63,48 @@ namespace Experimental{ // // Column indices >= num_verts are ignored. -template -typename colinds_t::non_const_type +template +lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) { if(rowmap.extent(0) <= 1) { //zero vertices means the MIS is empty. - return typename colinds_t::non_const_type(); + return lno_view_t(); } switch(algo) { case MIS2_QUALITY: { - Impl::D2_MIS_FixedPriority mis(rowmap, colinds); + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); return mis.compute(); } case MIS2_FAST: { - Impl::D2_MIS_RandomPriority mis(rowmap, colinds); + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); return mis.compute(); } case MIS2_BALANCED: { - Impl::D2_MIS_BlendedPriority mis(rowmap, colinds); + Impl::D2_MIS_BlendedPriority mis(rowmap, colinds); return mis.compute(); } } throw std::invalid_argument("graph_d2_mis: invalid algorithm"); } -template -typename colinds_t::non_const_type +template +labels_t graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename colinds_t::non_const_value_type& numClusters, MIS2_Algorithm algo = MIS2_FAST) { if(rowmap.extent(0) <= 1) { //there are no vertices to label - return typename colinds_t::non_const_type(); + return labels_t(); } - auto mis2 = graph_d2_mis(rowmap, colinds, algo); + labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); numClusters = mis2.extent(0); - Impl::D2_MIS_Coarsening coarsening(rowmap, colinds, mis2); + Impl::D2_MIS_Coarsening coarsening(rowmap, colinds, mis2); return coarsening.compute(); } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index d0a68c6f1f..e8fdb4d0f1 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -80,7 +80,7 @@ namespace Impl { * independently, but still have globally consistent rounds where 
row statuses change. */ -template +template struct D2_MIS_RandomPriority { using exec_space = typename device_t::execution_space; @@ -89,13 +89,14 @@ struct D2_MIS_RandomPriority using const_bitset_t = Kokkos::ConstBitset; using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - using lno_view_t = typename entries_t::non_const_type; //The type of status/priority values. using status_t = typename std::make_unsigned::type; using status_view_t = Kokkos::View; using range_pol = Kokkos::RangePolicy; using team_pol = Kokkos::TeamPolicy; using team_mem = typename team_pol::member_type; + using all_worklists_t = Kokkos::View; + using worklist_t = Kokkos::View; KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) { @@ -135,7 +136,7 @@ struct D2_MIS_RandomPriority struct RefreshRowStatus { - RefreshRowStatus(const status_view_t& rowStatus_, const lno_view_t& worklist_, lno_t nvBits_, int round) + RefreshRowStatus(const status_view_t& rowStatus_, const worklist_t& worklist_, lno_t nvBits_, int round) : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) { hashedRound = xorshiftHash(round); @@ -158,14 +159,14 @@ struct D2_MIS_RandomPriority } status_view_t rowStatus; - lno_view_t worklist; + worklist_t worklist; int nvBits; uint32_t hashedRound; }; struct RefreshColStatus { - RefreshColStatus(const status_view_t& colStatus_, const lno_view_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) {} @@ -193,7 +194,7 @@ struct D2_MIS_RandomPriority } status_view_t colStatus; - lno_view_t worklist; + worklist_t worklist; status_view_t rowStatus; rowmap_t rowmap; 
entries_t entries; @@ -202,7 +203,7 @@ struct D2_MIS_RandomPriority struct DecideSetFunctor { - DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_) + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_) : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) {} @@ -253,7 +254,7 @@ struct D2_MIS_RandomPriority rowmap_t rowmap; entries_t entries; lno_t nv; - lno_view_t worklist; + worklist_t worklist; }; struct CountInSet @@ -289,19 +290,19 @@ struct D2_MIS_RandomPriority struct InitWorklistFunctor { - InitWorklistFunctor(const lno_view_t& worklist_) + InitWorklistFunctor(const worklist_t& worklist_) : worklist(worklist_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { worklist(i) = i; } - lno_view_t worklist; + worklist_t worklist; }; struct CompactWorklistFunctor { - CompactWorklistFunctor(const lno_view_t& src_, const lno_view_t& dst_, const status_view_t& status_) + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) : src(src_), dst(dst_), status(status_) {} @@ -318,19 +319,19 @@ struct D2_MIS_RandomPriority } } - lno_view_t src; - lno_view_t dst; + worklist_t src; + worklist_t dst; status_view_t status; }; lno_view_t compute() { //Initialize first worklist to 0...numVerts - lno_view_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + worklist_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); - lno_view_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); Kokkos::parallel_for(range_pol(0, numVerts), 
InitWorklistFunctor(colWorklist)); - lno_view_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); int round = 0; lno_t rowWorkLen = numVerts; lno_t colWorkLen = numVerts; @@ -366,7 +367,7 @@ struct D2_MIS_RandomPriority lno_t numVerts; status_view_t rowStatus; status_view_t colStatus; - Kokkos::View allWorklists; + all_worklists_t allWorklists; //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: // ceil(log_2(numVerts + 1)) int nvBits; @@ -374,7 +375,7 @@ struct D2_MIS_RandomPriority lno_t maxDegree; }; -template +template struct D2_MIS_BlendedPriority { using exec_space = typename device_t::execution_space; @@ -383,13 +384,14 @@ struct D2_MIS_BlendedPriority using const_bitset_t = Kokkos::ConstBitset; using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - using lno_view_t = typename entries_t::non_const_type; //The type of status/priority values. 
using status_t = typename std::make_unsigned::type; using status_view_t = Kokkos::View; using range_pol = Kokkos::RangePolicy; using team_pol = Kokkos::TeamPolicy; using team_mem = typename team_pol::member_type; + using all_worklists_t = Kokkos::View; + using worklist_t = Kokkos::View; KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) { @@ -424,12 +426,12 @@ struct D2_MIS_BlendedPriority // This counts up monotonically as vertices are eliminated (given status OUT_SET) rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); - allWorklists = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); + allWorklists = all_worklists_t(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); } struct RefreshRowStatus { - RefreshRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, const lno_view_t& worklist_, lno_t nvBits_, int round, float k_, lno_t minDeg_, lno_t maxDeg_) + RefreshRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, const worklist_t& worklist_, lno_t nvBits_, int round, float k_, lno_t minDeg_, lno_t maxDeg_) : rowStatus(rowStatus_), rowmap(rowmap_), worklist(worklist_), nvBits(nvBits_), k(k_), minDeg(minDeg_) { hashedRound = xorshiftHash(round); @@ -468,7 +470,7 @@ struct D2_MIS_BlendedPriority status_view_t rowStatus; rowmap_t rowmap; - lno_view_t worklist; + worklist_t worklist; int nvBits; uint32_t hashedRound; float k; @@ -478,7 +480,7 @@ struct D2_MIS_BlendedPriority struct RefreshColStatus { - RefreshColStatus(const status_view_t& colStatus_, const lno_view_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) 
: colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) {} @@ -506,7 +508,7 @@ struct D2_MIS_BlendedPriority } status_view_t colStatus; - lno_view_t worklist; + worklist_t worklist; status_view_t rowStatus; rowmap_t rowmap; entries_t entries; @@ -515,7 +517,7 @@ struct D2_MIS_BlendedPriority struct DecideSetFunctor { - DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_) + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_) : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) {} @@ -566,7 +568,7 @@ struct D2_MIS_BlendedPriority rowmap_t rowmap; entries_t entries; lno_t nv; - lno_view_t worklist; + worklist_t worklist; }; struct CountInSet @@ -602,19 +604,19 @@ struct D2_MIS_BlendedPriority struct InitWorklistFunctor { - InitWorklistFunctor(const lno_view_t& worklist_) + InitWorklistFunctor(const worklist_t& worklist_) : worklist(worklist_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { worklist(i) = i; } - lno_view_t worklist; + worklist_t worklist; }; struct CompactWorklistFunctor { - CompactWorklistFunctor(const lno_view_t& src_, const lno_view_t& dst_, const status_view_t& status_) + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) : src(src_), dst(dst_), status(status_) {} @@ -631,8 +633,8 @@ struct D2_MIS_BlendedPriority } } - lno_view_t src; - lno_view_t dst; + worklist_t src; + worklist_t dst; status_view_t status; }; @@ -643,11 +645,11 @@ struct D2_MIS_BlendedPriority lno_t maxDegree; KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); //Initialize first worklist to 0...numVerts - lno_view_t rowWorklist = 
Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + worklist_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); - lno_view_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); - lno_view_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); int round = 0; lno_t rowWorkLen = numVerts; lno_t colWorkLen = numVerts; @@ -688,7 +690,7 @@ struct D2_MIS_BlendedPriority lno_t numVerts; status_view_t rowStatus; status_view_t colStatus; - Kokkos::View allWorklists; + all_worklists_t allWorklists; //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: // ceil(log_2(numVerts + 1)) int nvBits; @@ -696,7 +698,7 @@ struct D2_MIS_BlendedPriority lno_t maxDegree; }; -template +template struct D2_MIS_FixedPriority { using exec_space = typename device_t::execution_space; @@ -705,7 +707,6 @@ struct D2_MIS_FixedPriority using const_bitset_t = Kokkos::ConstBitset; using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - using lno_view_t = typename entries_t::non_const_type; //The type of status/priority values. 
using status_t = typename std::make_unsigned::type; using status_view_t = Kokkos::View; @@ -1054,7 +1055,7 @@ struct D2_MIS_FixedPriority lno_view_t worklist2; }; -template +template struct D2_MIS_Coarsening { using exec_space = typename device_t::execution_space; @@ -1069,7 +1070,7 @@ struct D2_MIS_Coarsening using status_view_t = Kokkos::View; using range_pol = Kokkos::RangePolicy; - D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_) + D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_) : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(rowmap.extent(0) - 1), labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts) @@ -1080,7 +1081,7 @@ struct D2_MIS_Coarsening //Phase 1 (over 0...numClusters) labels roots and immediate neighbors of roots. struct Phase1Functor { - Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_) + Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) {} @@ -1102,9 +1103,9 @@ struct D2_MIS_Coarsening rowmap_t rowmap; entries_t entries; - lno_view_t mis2; + labels_t mis2; lno_t numVerts; - lno_view_t labels; + labels_t labels; }; KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) @@ -1119,7 +1120,7 @@ struct D2_MIS_Coarsening //Phase 2 (over 0...numVerts) joins unlabeled vertices to the smallest adjacent cluster struct Phase2Functor { - Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const lno_view_t& mis2_, lno_t numVerts_, const lno_view_t& labels_) + Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), 
labels(labels_) {} @@ -1153,12 +1154,12 @@ struct D2_MIS_Coarsening rowmap_t rowmap; entries_t entries; - lno_view_t mis2; + labels_t mis2; lno_t numVerts; - lno_view_t labels; + labels_t labels; }; - lno_view_t compute() + labels_t compute() { lno_t numClusters = mis2.extent(0); Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels)); @@ -1168,9 +1169,9 @@ struct D2_MIS_Coarsening rowmap_t rowmap; entries_t entries; - lno_view_t mis2; + labels_t mis2; lno_t numVerts; - lno_view_t labels; + labels_t labels; }; }}} diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index e4ded70d54..7d137f4590 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -57,20 +57,16 @@ namespace KokkosSparse{ enum GSAlgorithm{GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER, GS_TWOSTAGE}; enum GSDirection{GS_FORWARD, GS_BACKWARD, GS_SYMMETRIC}; - enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_BALLOON, CLUSTER_CUTHILL_MCKEE, CLUSTER_DO_NOTHING, NUM_CLUSTERING_ALGORITHMS}; + enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_MIS2, CLUSTER_BALLOON, NUM_CLUSTERING_ALGORITHMS}; inline const char* getClusterAlgoName(ClusteringAlgorithm ca) { switch(ca) { - case CLUSTER_DEFAULT: - return "Default"; case CLUSTER_BALLOON: return "Balloon"; - case CLUSTER_CUTHILL_MCKEE: - return "Cuthill-McKee"; - case CLUSTER_DO_NOTHING: - return "No-op"; + case CLUSTER_MIS2: + return "MIS(2)"; default:; } return "INVALID CLUSTERING ALGORITHM"; diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 9671c2339d..bca3bd725a 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -56,6 +56,7 @@ #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include 
"KokkosSparse_partitioning_impl.hpp" +#include "KokkosGraph_MIS2.hpp" namespace KokkosSparse{ namespace Impl{ @@ -782,20 +783,15 @@ namespace KokkosSparse{ raw_sym_adj = raw_colinds_t(sym_adj.data(), sym_adj.extent(0)); } nnz_view_t vertClusters; - //auto clusterAlgo = gsHandle->get_clustering_algo(); - BalloonClustering balloon(num_rows, raw_sym_xadj, raw_sym_adj); - vertClusters = balloon.run(clusterSize); - /* + auto clusterAlgo = gsHandle->get_clustering_algo(); if(clusterAlgo == CLUSTER_DEFAULT) - clusterAlgo = CLUSTER_BALLOON; + clusterAlgo = CLUSTER_MIS2; switch(clusterAlgo) { - case CLUSTER_CUTHILL_MCKEE: + case CLUSTER_MIS2: { - RCM rcm(num_rows, raw_sym_xadj, raw_sym_adj); - nnz_view_t cmOrder = rcm.cuthill_mckee(); - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), ReorderedClusteringFunctor(vertClusters, cmOrder, clusterSize)); + vertClusters = KokkosGraph::Experimental::graph_mis2_coarsen + (raw_sym_xadj, raw_sym_adj, numClusters, KokkosGraph::MIS2_FAST); break; } case CLUSTER_BALLOON: @@ -804,12 +800,6 @@ namespace KokkosSparse{ vertClusters = balloon.run(clusterSize); break; } - case CLUSTER_DO_NOTHING: - { - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), NopVertClusteringFunctor(vertClusters, clusterSize)); - break; - } case CLUSTER_DEFAULT: { throw std::logic_error("Logic to choose default clustering algorithm is incorrect"); @@ -817,7 +807,6 @@ namespace KokkosSparse{ default: throw std::runtime_error("Clustering algo " + std::to_string((int) clusterAlgo) + " is not implemented"); } - */ #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index a8d7d46a40..9993d46e22 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ 
b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -81,7 +81,8 @@ int run_gauss_seidel( bool is_symmetric_graph, int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. int cluster_size = 1, - bool classic = false) // only with two-stage, true for sptrsv instead of richardson + bool classic = false, // only with two-stage, true for sptrsv instead of richardson + ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; @@ -100,7 +101,7 @@ int run_gauss_seidel( kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); if(gs_algorithm == GS_CLUSTER) - kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, cluster_size); + kh.create_gs_handle(clusterAlgo, cluster_size); else if(gs_algorithm == GS_TWOSTAGE) { // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); @@ -281,18 +282,22 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; + std::vector clusteringAlgos = {CLUSTER_MIS2, CLUSTER_BALLOON}; for(int csize = 0; csize < 3; csize++) { - for(int apply_type = 0; apply_type < apply_count; ++apply_type) + for(auto clusterAlgo : clusteringAlgos) { - Kokkos::Impl::Timer timer1; - //Zero out X before solving - Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel( - input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize]); - KokkosBlas::axpby(one, solution_x, -one, x_vector); - mag_t result_norm_res = KokkosBlas::nrm2(x_vector); - EXPECT_LT(result_norm_res, initial_norm_res); + for(int apply_type = 0; apply_type < apply_count; ++apply_type) + { + Kokkos::Impl::Timer timer1; + //Zero out X before solving + Kokkos::deep_copy(x_vector, zero); + run_gauss_seidel( + input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, clusterAlgo); + KokkosBlas::axpby(one, solution_x, -one, 
x_vector); + mag_t result_norm_res = KokkosBlas::nrm2(x_vector); + EXPECT_LT(result_norm_res, initial_norm_res); + } } } //*** Two-stage version **** From f7432dcff57e2a47340bc2f833a1ab159f2a9e0b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 17:56:32 -0600 Subject: [PATCH 018/106] Fix shadow warning --- src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index e8fdb4d0f1..35ce0d1794 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -371,8 +371,6 @@ struct D2_MIS_RandomPriority //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: // ceil(log_2(numVerts + 1)) int nvBits; - lno_t minDegree; - lno_t maxDegree; }; template @@ -694,8 +692,6 @@ struct D2_MIS_BlendedPriority //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: // ceil(log_2(numVerts + 1)) int nvBits; - lno_t minDegree; - lno_t maxDegree; }; template From af8fc721875fcd8c34b2a3caaca46ee926751743 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 20:25:47 -0600 Subject: [PATCH 019/106] Remove hybrid algorithm but leave the interpolated priority function commented out in case it's useful for paper experiments --- example/wiki/graph/KokkosGraph_wiki_mis2.cpp | 6 - perf_test/graph/KokkosGraph_mis_d2.cpp | 2 - src/graph/KokkosGraph_MIS2.hpp | 8 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 347 ++---------------- unit_test/graph/Test_Graph_mis2.hpp | 2 +- 5 files changed, 29 insertions(+), 336 deletions(-) diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp index c158231282..416164981b 100644 --- a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -20,12 +20,6 @@ int main(int argc, char* 
argv[]) << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); - misDevice = KokkosGraph::Experimental::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_BALANCED); - std::cout << "Distance-2 MIS, BALANCED algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; - GraphDemo::printMIS(misDevice); - putchar('\n'); misDevice = KokkosGraph::Experimental::graph_d2_mis( rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); std::cout << "Distance-2 MIS, QUALITY algorithm: contains " diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index 1ae68fff00..da9fb549d6 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -226,8 +226,6 @@ int parse_inputs(MIS2Parameters &params, int argc, char **argv) params.algo = MIS2_FAST; else if(!strcasecmp(algName, "quality")) params.algo = MIS2_QUALITY; - else if(!strcasecmp(algName, "balanced")) - params.algo = MIS2_BALANCED; else throw std::invalid_argument("Algorithm not valid: must be 'fast' or 'quality'"); } diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index 4c267b434f..c578a97271 100644 --- a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -52,8 +52,7 @@ namespace KokkosGraph{ enum MIS2_Algorithm { MIS2_QUALITY, - MIS2_FAST, - MIS2_BALANCED + MIS2_FAST }; namespace Experimental{ @@ -84,11 +83,6 @@ graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm al Impl::D2_MIS_RandomPriority mis(rowmap, colinds); return mis.compute(); } - case MIS2_BALANCED: - { - Impl::D2_MIS_BlendedPriority mis(rowmap, colinds); - return mis.compute(); - } } throw std::invalid_argument("graph_d2_mis: invalid algorithm"); } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 35ce0d1794..688ff74137
100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -373,326 +373,33 @@ struct D2_MIS_RandomPriority int nvBits; }; -template -struct D2_MIS_BlendedPriority -{ - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; - using bitset_t = Kokkos::Bitset; - using const_bitset_t = Kokkos::ConstBitset; - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - //The type of status/priority values. - using status_t = typename std::make_unsigned::type; - using status_view_t = Kokkos::View; - using range_pol = Kokkos::RangePolicy; - using team_pol = Kokkos::TeamPolicy; - using team_mem = typename team_pol::member_type; - using all_worklists_t = Kokkos::View; - using worklist_t = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) - { - uint32_t x = in; - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - // Priority values 0 and max are special, they mean the vertex is - // in the independent set or eliminated from consideration, respectively. - // Values in between represent a priority for being added to the set, - // based on degree and vertex ID as a tiebreak - // (higher priority = less preferred to being in the independent set) - - static constexpr status_t IN_SET = 0; - static constexpr status_t OUT_SET = ~IN_SET; - - D2_MIS_BlendedPriority(const rowmap_t& rowmap_, const entries_t& entries_) - : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) - { - status_t i = numVerts + 1; - nvBits = 0; - while(i) - { - i >>= 1; - nvBits++; - } - //Each value in rowStatus represents the status and priority of each row. - //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. 
- // This counts up monotonically as vertices are eliminated (given status OUT_SET) - rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); - colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); - allWorklists = all_worklists_t(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); - } - - struct RefreshRowStatus - { - RefreshRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, const worklist_t& worklist_, lno_t nvBits_, int round, float k_, lno_t minDeg_, lno_t maxDeg_) - : rowStatus(rowStatus_), rowmap(rowmap_), worklist(worklist_), nvBits(nvBits_), k(k_), minDeg(minDeg_) - { - hashedRound = xorshiftHash(round); - if(maxDeg_ == minDeg_) - { - //section doesn't matter at all - invDegRange = 1.0f; - } - else - { - invDegRange = 1.0f / (maxDeg_ - minDeg_); - } - } - - KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const - { - lno_t i = worklist(w); - int degBits = sizeof(status_t) * 8 - nvBits; - if(degBits == 0) - { - //no space to store degree information. Algorithm will still work but will - //probably produce a lower quality MIS. 
- rowStatus(i) = i + 1; - return; - } - //Combine vertex and round to get some pseudorandom priority bits that change each round - status_t maxDegRange = (((status_t) 1) << degBits) - 2; - lno_t deg = rowmap(i + 1) - rowmap(i); - //Compute degree-based score and random score - float degScore = (float) (deg - minDeg) * invDegRange; - float randScore = (xorshiftHash(i + hashedRound) & 0xFFFF) / 65536.f; - //Then linearly interpolate using k - float finalScore = k * randScore + (1.f - k) * degScore; - rowStatus(i) = (status_t) (i + 1) + (((status_t) (finalScore * maxDegRange)) << nvBits); - } - - status_view_t rowStatus; - rowmap_t rowmap; - worklist_t worklist; - int nvBits; - uint32_t hashedRound; - float k; - lno_t minDeg; - float invDegRange; - }; - - struct RefreshColStatus - { - RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) - : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const - { - lno_t i = worklist(w); - //iterate over {i} union the neighbors of i, to find - //minimum status. - status_t s = OUT_SET; - size_type rowBegin = rowmap(i); - size_type rowEnd = rowmap(i + 1); - for(size_type j = rowBegin; j <= rowEnd; j++) - { - lno_t nei = (j == rowEnd) ? 
i : entries(j); - if(nei <= nv) - { - status_t neiStat = rowStatus(nei); - if(neiStat < s) - s = neiStat; - } - } - if(s == IN_SET) - s = OUT_SET; - colStatus(i) = s; - } - - status_view_t colStatus; - worklist_t worklist; - status_view_t rowStatus; - rowmap_t rowmap; - entries_t entries; - lno_t nv; - }; - - struct DecideSetFunctor - { - DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_) - : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const - { - lno_t i = worklist(w); - //Processing row i. - status_t s = rowStatus(i); - if(s == IN_SET || s == OUT_SET) - return; - //s is the status which must be the minimum among all neighbors - //to decide that i is IN_SET. - size_type rowBegin = rowmap(i); - size_type rowEnd = rowmap(i + 1); - bool neiOut = false; - bool neiMismatchS = false; - for(size_type j = rowBegin; j <= rowEnd; j++) - { - lno_t nei = (j == rowEnd) ? i : entries(j); - if(nei >= nv) - continue; - status_t neiStat = colStatus(nei); - if(neiStat == OUT_SET) - { - neiOut = true; - break; - } - else if(neiStat != s) - { - neiMismatchS = true; - } - } - if(neiOut) - { - //In order to make future progress, need to update the - //col statuses for all neighbors of i. 
- rowStatus(i) = OUT_SET; - } - else if(!neiMismatchS) - { - //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors - rowStatus(i) = IN_SET; - } - } - - status_view_t rowStatus; - status_view_t colStatus; - rowmap_t rowmap; - entries_t entries; - lno_t nv; - worklist_t worklist; - }; - - struct CountInSet - { - CountInSet(const status_view_t& rowStatus_) - : rowStatus(rowStatus_) - {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const - { - if(rowStatus(i) == IN_SET) - lNumInSet++; - } - status_view_t rowStatus; - }; - - struct CompactInSet - { - CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) - : rowStatus(rowStatus_), setList(setList_) - {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const - { - if(rowStatus(i) == IN_SET) - { - if(finalPass) - setList(lNumInSet) = i; - lNumInSet++; - } - } - status_view_t rowStatus; - lno_view_t setList; - }; - - struct InitWorklistFunctor - { - InitWorklistFunctor(const worklist_t& worklist_) - : worklist(worklist_) - {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const - { - worklist(i) = i; - } - worklist_t worklist; - }; - - struct CompactWorklistFunctor - { - CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) - : src(src_), dst(dst_), status(status_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const - { - lno_t i = src(w); - status_t s = status(i); - if(s != IN_SET && s != OUT_SET) - { - //next worklist needs to contain i - if(finalPass) - dst(lNumInSet) = i; - lNumInSet++; - } - } - - worklist_t src; - worklist_t dst; - status_view_t status; - }; - - lno_view_t compute() - { - //Compute min and max degree of graph - lno_t minDegree; - lno_t maxDegree; - KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); - //Initialize first worklist to 0...numVerts - worklist_t rowWorklist 
= Kokkos::subview(allWorklists, Kokkos::ALL(), 0); - Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); - worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); - Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); - worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); - int round = 0; - lno_t rowWorkLen = numVerts; - lno_t colWorkLen = numVerts; - //k is the linear interpolation constant for priority: 0 means completely degree-based, 1 means completely random - float k = 0.f; - while(true) - { - //Compute new row statuses - Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowmap, rowWorklist, nvBits, round, k * k, minDegree, maxDegree)); - //Compute new col statuses - Kokkos::parallel_for(range_pol(0, colWorkLen), RefreshColStatus(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts)); - //Decide row statuses - Kokkos::parallel_for(range_pol(0, rowWorkLen), DecideSetFunctor(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist)); - //Compact row worklist - Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); - if(rowWorkLen == 0) - break; - std::swap(rowWorklist, thirdWorklist); - //Compact col worklist - Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), colWorkLen); - std::swap(colWorklist, thirdWorklist); - round++; - k += 0.02f; - if(k > 1.0f) - k = 1.0f; - } - //now that every vertex has been decided IN_SET/OUT_SET, - //build a compact list of the vertices which are IN_SET. 
- lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); - lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); - return setList; - } - - rowmap_t rowmap; - entries_t entries; - lno_t numVerts; - status_view_t rowStatus; - status_view_t colStatus; - all_worklists_t allWorklists; - //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: - // ceil(log_2(numVerts + 1)) - int nvBits; -}; +// UNUSED CODE +// Version of RefreshRowStatus, which does linear interpolation between a degree-based score and a random score. +// By gradually increasing the interpolation coefficient in favor of random, the MIS can converge much faster than +// constant priorities. +// +// KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const +// { +// lno_t i = worklist(w); +// int degBits = sizeof(status_t) * 8 - nvBits; +// if(degBits == 0) +// { +// //no space to store degree information. Algorithm will still work but will +// //probably produce a lower quality MIS. 
+// rowStatus(i) = i + 1; +// return; +// } +// //Combine vertex and round to get some pseudorandom priority bits that change each round +// status_t maxDegRange = (((status_t) 1) << degBits) - 2; +// lno_t deg = rowmap(i + 1) - rowmap(i); +// //Compute degree-based score and random score +// float degScore = (float) (deg - minDeg) * invDegRange; +// float randScore = (xorshiftHash(i + hashedRound) & 0xFFFF) / 65536.f; +// //Then linearly interpolate using k +// float finalScore = k * randScore + (1.f - k) * degScore; +// rowStatus(i) = (status_t) (i + 1) + (((status_t) (finalScore * maxDegRange)) << nvBits); +// } +// */ template struct D2_MIS_FixedPriority diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index e82eabf882..037ff7b835 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -183,7 +183,7 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); //For each algorithm, compute and verify the MIS std::vector algos - = {MIS2_FAST, MIS2_BALANCED, MIS2_QUALITY}; + = {MIS2_FAST, MIS2_QUALITY}; for(auto algo : algos) { lno_t numClusters = 0; From 4e763a0d1efc262ede176c28f7d745458ced535b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 20:40:10 -0600 Subject: [PATCH 020/106] Remove MIS2_BALANCED in one place --- unit_test/graph/Test_Graph_mis2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index 037ff7b835..30d32fb2dc 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -147,7 +147,7 @@ void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_va auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); //For each algorithm, compute and verify the MIS std::vector algos - 
= {MIS2_FAST, MIS2_BALANCED, MIS2_QUALITY}; + = {MIS2_FAST, MIS2_QUALITY}; for(auto algo : algos) { auto mis = graph_d2_mis(symRowmap, symEntries, algo); From c1c92e85a1927bafa796c78c55beddeb85bd1dcb Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Sep 2020 21:39:09 -0600 Subject: [PATCH 021/106] Remove MIS2_BALANCED from another place --- example/wiki/graph/KokkosGraph_wiki_coarsening.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp index e27ab47f44..dded3fd258 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -16,7 +16,7 @@ int main(int argc, char* argv[]) std::cout << "Coarsened vertex labels:\n"; Ordinal numClusters = 0; auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( - rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_BALANCED); + rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_FAST); //coarsening labels can be printed in the same way as colors GraphDemo::printColoring(labels, numClusters); putchar('\n'); From 4ff4f869e14945913d99a4b3f591e92af610a225 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Tue, 8 Sep 2020 13:20:54 -0600 Subject: [PATCH 022/106] moving if(invert_offdiag) out of functor with tag --- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 98 ++++++++++++++++---- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 16b55d0876..42529155fa 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -1089,11 +1089,13 @@ void sptrsv_supernodal_symbolic( /* Auxiliary functions for numeric computation */ /* ========================================================================================= */ + struct Tag_SupTrtriFunctor{}; + struct 
Tag_SupTrtriTrmmFunctor{}; + template struct TriSupernodalTrtriFunctor { - bool invert_offdiag; integer_view_host_t supernode_ids; const input_size_type *nb; row_map_type hr; @@ -1101,9 +1103,8 @@ void sptrsv_supernodal_symbolic( values_type hv; KOKKOS_INLINE_FUNCTION - TriSupernodalTrtriFunctor(bool invert_offdiag_, integer_view_host_t supernode_ids_, const input_size_type *nb_, + TriSupernodalTrtriFunctor(integer_view_host_t supernode_ids_, const input_size_type *nb_, row_map_type& hr_, index_type& hc_, values_type& hv_) : - invert_offdiag(invert_offdiag_), supernode_ids(supernode_ids_), nb(nb_), hr(hr_), @@ -1111,8 +1112,9 @@ void sptrsv_supernodal_symbolic( hv(hv_) {} + // functor: just invert diagonal KOKKOS_INLINE_FUNCTION - void operator() (const int i) const { + void operator() (const Tag_SupTrtriFunctor&, const int i) const { using execution_space = typename values_type::execution_space; using memory_space = typename execution_space::memory_space; using values_view_t = typename values_type::non_const_type; @@ -1128,13 +1130,42 @@ void sptrsv_supernodal_symbolic( int nsrow = hr(j1+1) - hr(j1); int nscol = nb[s +1] - nb[s]; + // invert diagonal auto nnzD = hr (j1); Kokkos::View viewL (&hv(nnzD), nsrow, nscol); auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); KokkosBatched::SerialTrtri::invoke(Ljj); + } - if (nsrow > nscol && invert_offdiag) { + // functor: invert diagonal + apply inverse to off-diagonal + KOKKOS_INLINE_FUNCTION + void operator() (const Tag_SupTrtriTrmmFunctor&, const int i) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + using Side = KokkosBatched::Side; + using Trans = KokkosBatched::Trans; + + int s = 
supernode_ids(i); + int j1 = nb[s]; + int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + // invert diagonal + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); + + // apply invse to off-diagonal + //if (nsrow > nscol && invert_offdiag) + { const scalar_t one (1.0); auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); KokkosBatched::SerialTrmm:: @@ -1231,7 +1262,6 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, } // now call batchedBLAS if (num_batchs > 0) { - using range_policy = Kokkos::RangePolicy; using Uplo = KokkosBatched::Uplo; using Diag = KokkosBatched::Diag; #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE @@ -1239,23 +1269,55 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, #endif if (lower) { if (unit_diag) { - TriSupernodalTrtriFunctor - sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); - Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } } else { - TriSupernodalTrtriFunctor - sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); - Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + 
sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } } } else { if (unit_diag) { - TriSupernodalTrtriFunctor - sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); - Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } } else { - TriSupernodalTrtriFunctor - sptrsv_tritri_functor (invert_offdiag, supernode_ids, nb, hr, hc, hv); - Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } } } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE From 7da48ba402c143c363e5c0cbdea122564febdf0a Mon Sep 17 00:00:00 2001 From: Nathan 
Ellingwood Date: Wed, 9 Sep 2020 16:09:31 -0600 Subject: [PATCH 023/106] More test script cleanup / updates --- scripts/cm_test_all_sandia | 18 +++--------------- scripts/update_lib.sh | 5 ++++- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 1c75b93d52..44048528ea 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -692,7 +692,6 @@ elif [ "$MACHINE" = "apollo" ]; then CUDA101_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/9.0.69" - CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-gcc/6.1.0,/,cuda/9.1" NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" # HPX_MODULE_LIST="sems-env,sems-cmake/3.12.2,hpx/1.2.1,sems-gcc/6.1.0,binutils" # HPX3_MODULE_LIST="sems-env,sems-cmake/3.12.2,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" @@ -704,32 +703,21 @@ elif [ "$MACHINE" = "apollo" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" ) else # 
Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1 $CUDA101_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.2 $CUDA101_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/7.0 $CLANG7_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" ) fi diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 69a179cd56..5c4ec5d66a 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -4,7 +4,10 @@ echo "CALLING UPDATE_LIB" if [ "$1" = bowman ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.* ]]; then + if [[ "${ICPCVER}" = 17.0.* ]]; then + module swap gcc/4.7.4 gcc/6.2.0 + module list + elif [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.* ]]; then module swap gcc/4.9.3 gcc/6.2.0 module list fi From d56fa3fd7f90cac4fd49584baad2c4c9b17a6286 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Sun, 13 Sep 2020 23:46:05 -0600 Subject: [PATCH 024/106] 
HIP: fix ArithTraits to support HIP backend, see issue #807 The goal is to replace undefined calls to math functions with the HIP backend. --- src/Kokkos_ArithTraits.hpp | 104 +++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 44 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 23e6f5e125..2179cb14c8 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -691,13 +691,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -801,9 +801,11 @@ class ArithTraits { return sqrt (x); } static KOKKOS_FORCEINLINE_FUNCTION float nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN_F; //return nan (); //this returns 0??? 
+#elif defined(__HIP_DEVICE_COMPILE__) + return nanf(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -865,13 +867,13 @@ class ArithTraits > { } static bool isInf (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (real (x)) || isinf (imag (x)); } static bool isNan (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (real (x)) || isnan (imag (x)); @@ -1045,13 +1047,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -1126,9 +1128,11 @@ class ArithTraits { return ::atan (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN; //return nan (); // this returns 0 ??? +#elif defined(__HIP_DEVICE_COMPILE__) + return nan(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -1140,8 +1144,10 @@ class ArithTraits { // Backwards compatibility with Teuchos::ScalarTraits. 
typedef mag_type magnitudeType; typedef float halfPrecision; -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) typedef double doublePrecision; // CUDA doesn't support long double, unfortunately +#elif defined(__HIP_DEVICE_COMPILE__) + typedef double doublePrecision; // HIP does not support long double unfortunately #else typedef long double doublePrecision; #endif // __CUDA_ARCH__ @@ -1216,13 +1222,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION long double infinity() { return HUGE_VALL; } static bool isInf (const val_type& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static bool isNan (const val_type& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -2923,11 +2929,13 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (abs (x)))); + using std::sqrt; + using std::abs; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (abs (x)))); #else - return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { return static_cast ( ::log (static_cast (abs (x)))); @@ -3048,18 +3056,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); + using std::sqrt; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (x))); #else - return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const 
val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); -#else + using std::cbrtl; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ +#else + return static_cast ( ::cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -3184,7 +3194,15 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ + using std::sqrt; + using std::abs; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + // IEEE 754 promises that long double has at least 64 significand + // bits, so we can use it to represent any signed or unsigned + // 64-bit integer type exactly. However, CUDA does not implement + // long double for device functions. + return static_cast ( sqrt (static_cast (abs (x)))); +#else // Casting from a 64-bit integer type to double does result in a // loss of accuracy. However, it gives us a good first // approximation. For very large numbers, we may lose some @@ -3195,21 +3213,17 @@ class ArithTraits { // which it has to be, so we don't have to check) to ensure // correctness. It actually should suffice to check numbers // within 1 of the result. - return static_cast ( ::sqrt (static_cast (abs (x)))); -#else - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. 
- return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (abs (x)))); + using std::cbrtl; + using std::abs; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( cbrtl (static_cast (abs (x)))); #else - return static_cast ( ::cbrtl (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( cbrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -3334,18 +3348,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); -#else + using std::sqrt; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ +#else + return static_cast ( ::sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); + using std::cbrtl; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( cbrtl (static_cast (x))); #else - return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); From b0941738bea90c1fef618d2f159ecc31e532be5f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 14 Sep 2020 07:44:58 -0700 Subject: [PATCH 025/106] Fixing an issue on cuda-clang. Without the top namespace "::" specified, cbrt calls itself in infinite loop. Also moving some "using" statements around... 
--- src/Kokkos_ArithTraits.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 2179cb14c8..cf6c26590f 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -3064,8 +3064,8 @@ class ArithTraits { #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - using std::cbrtl; #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; return static_cast ( ::cbrtl (static_cast (x))); #else return static_cast ( ::cbrt (static_cast (x))); @@ -3194,9 +3194,9 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::sqrt; using std::abs; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST // IEEE 754 promises that long double has at least 64 significand // bits, so we can use it to represent any signed or unsigned // 64-bit integer type exactly. However, CUDA does not implement @@ -3213,16 +3213,16 @@ class ArithTraits { // which it has to be, so we don't have to check) to ensure // correctness. It actually should suffice to check numbers // within 1 of the result. 
- return static_cast ( sqrt (static_cast (abs (x)))); + return static_cast ( ::sqrt (static_cast (abs (x)))); #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::cbrtl; using std::abs; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST return static_cast ( cbrtl (static_cast (abs (x)))); #else - return static_cast ( cbrt (static_cast (abs (x)))); + return static_cast ( ::cbrt (static_cast (abs (x)))); #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { @@ -3348,19 +3348,19 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { - using std::sqrt; #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast ( ::sqrt (static_cast (x))); + using std::sqrt; + return static_cast ( sqrt (static_cast (x))); #else return static_cast ( ::sqrt (static_cast (x))); #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - using std::cbrtl; #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; return static_cast ( cbrtl (static_cast (x))); #else - return static_cast ( cbrt (static_cast (x))); + return static_cast ( ::cbrt (static_cast (x))); #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { From ffab9f756c35c1c8b37d929d50905ef76afa7a77 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 14 Sep 2020 10:01:42 -0600 Subject: [PATCH 026/106] Fixing undefined nanf("") function in the HIP code branch This now compiles fine on caraway --- src/Kokkos_ArithTraits.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index cf6c26590f..3a6ea1cca5 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -805,7 +805,7 @@ class ArithTraits { return CUDART_NAN_F; //return nan (); //this returns 0??? 
#elif defined(__HIP_DEVICE_COMPILE__) - return nanf(""); + return ::nanf(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -1132,7 +1132,7 @@ class ArithTraits { return CUDART_NAN; //return nan (); // this returns 0 ??? #elif defined(__HIP_DEVICE_COMPILE__) - return nan(""); + return ::nan(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ From 90b4cb75e698bc2f2545888dbef97fe3225bc0e4 Mon Sep 17 00:00:00 2001 From: jjwilke Date: Mon, 14 Sep 2020 09:08:24 -0700 Subject: [PATCH 027/106] fix install testing refactor for inline builds --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d70710325a..1149a2101d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,8 @@ IF (KokkosKernels_INSTALL_TESTING) ELSE() # Regular build, not install testing # Do all the regular option processing - IF (NOT KOKKOSKERNELS_HAS_TRILINOS) + IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) + # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") KOKKOS_CHECK(OPTIONS CUDA_UVM RETURN_VALUE KOKKOS_ENABLE_CUDA_UVM) From 8cdc613d143d9b9451bcfb9a94b6fe0701f96b10 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 15 Sep 2020 16:17:55 -0600 Subject: [PATCH 028/106] KokkosBatched - hip added in util --- src/batched/KokkosBatched_Util.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 372b5e1753..2347c63e87 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -276,6 +276,11 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 2; 
} #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -320,6 +325,11 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 1; } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> From bd11f23c2f637cc1c9c9cee5682b377d66ba06fa Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 15 Sep 2020 16:33:36 -0600 Subject: [PATCH 029/106] KokkosArithTraits - std complex is not available on hip either --- test_common/Test_Common_ArithTraits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 5e253a1820..ff2abd0acc 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -1598,7 +1598,7 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#ifndef KOKKOS_ENABLE_CUDA +#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); From 79a59ec51c88b7c8423d5ad41b248a8da796572b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 21 Sep 2020 11:41:24 -0600 Subject: [PATCH 030/106] Use team policy for fast MIS2 for CUDA Coalesces graph memory accesses, and expresses per-row operations as ThreadVector parallel reductions. 
--- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 162 ++++++++++++++---- 1 file changed, 127 insertions(+), 35 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 688ff74137..3392c30452 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -47,39 +47,13 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Bitset.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Utils.hpp" #include namespace KokkosGraph { namespace Experimental { namespace Impl { -/* - * 100% asynchronous algorithm ideas: - * -For each row in team worklist: - * -Determine if any neighboring columns are OUT_SET, as well as whether all col statuses match my row status exactly - * -If any neighbors are OUT_SET: - * -Mark row permanently as OUT_SET. - * -Mark all neighboring columns for status update, since their minimum status may now have increased. - * -If all neighbor statuses match this row's status, mark this row permanently as IN_SET. Then mark all neighboring columns as OUT_SET. - * -Process all pending column updates (atomic_maxing the status with new one, if multiple threads may get the same column) - * - * -Invariants: - * -Row status changes exactly once (to either IN_SET or OUT_SET). After this, it never needs to be proccessed again. - * -Col status can change multiple times, but it can only increase (up to OUT_SET) - * -Therefore, when a column is updated, it converges to the true minimum status over rows - * - * What if a row R 2 hops away becomes IN_SET, and this row doesn't observe the columns changing to OUT_SET? - * -It's OK, since at no time can this row observe a mutual neighbor exactly matching its status. It will match R's status, and then it will be OUT_SET). - * What if a column's updated status is based on out of date information? - * -The minimum is computed as: any are IN_SET? 
OUT_SET : min(neighbors) - * -This quantity may only increase, since rows can only change to IN_SET or OUT_SET, and in either case it increases - * -So it's OK, since if it's out of date, it can only be _lower_ than it should be, never allowing a vertex to become IN_SET that shouldn't - * - * Problem still to solve: with priorities chosen only once, will still converge slowly. Need a way to have teams working - * independently, but still have globally consistent rounds where row statuses change. - */ - template struct D2_MIS_RandomPriority { @@ -166,8 +140,8 @@ struct D2_MIS_RandomPriority struct RefreshColStatus { - RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) - : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, lno_t worklistLen_) + : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklistLen(worklistLen_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const @@ -181,7 +155,7 @@ struct D2_MIS_RandomPriority for(size_type j = rowBegin; j <= rowEnd; j++) { lno_t nei = (j == rowEnd) ? 
i : entries(j); - if(nei <= nv) + if(nei < nv) { status_t neiStat = rowStatus(nei); if(neiStat < s) @@ -193,20 +167,62 @@ struct D2_MIS_RandomPriority colStatus(i) = s; } + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using MinReducer = Kokkos::Min; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, status_t& ls) + { + lno_t nei = (j == rowLen) ? i : entries(rowBegin + j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < ls) + ls = neiStat; + } + }, MinReducer(s)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + }); + } + status_view_t colStatus; worklist_t worklist; status_view_t rowStatus; rowmap_t rowmap; entries_t entries; lno_t nv; + lno_t worklistLen; }; struct DecideSetFunctor { - DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_) - : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_) + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_, lno_t worklistLen_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), worklistLen(worklistLen_) {} + //Enum values to be used as flags, so that the team policy version can + //express the neighbor checking as an OR-reduction + enum + { + NEI_OUT_SET = 1, + NEI_DIFFERENT_STATUS = 2 + }; + KOKKOS_INLINE_FUNCTION void 
operator()(lno_t w) const { lno_t i = worklist(w); @@ -249,12 +265,59 @@ struct D2_MIS_RandomPriority } } + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using OrReducer = Kokkos::BOr; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + int flags = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, int& lflags) + { + lno_t nei = (j == rowLen) ? i : entries(rowBegin + j); + if(nei >= nv) + return; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + lflags |= NEI_OUT_SET; + else if(neiStat != s) + lflags |= NEI_DIFFERENT_STATUS; + }, OrReducer(flags)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(flags & NEI_OUT_SET) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. 
+ rowStatus(i) = OUT_SET; + } + else if(!(flags & NEI_DIFFERENT_STATUS)) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + }); + } + status_view_t rowStatus; status_view_t colStatus; rowmap_t rowmap; entries_t entries; lno_t nv; worklist_t worklist; + lno_t worklistLen; }; struct CountInSet @@ -332,17 +395,46 @@ struct D2_MIS_RandomPriority worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = execSpaceEnum == KokkosKernels::Impl::Exec_CUDA; + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; lno_t colWorkLen = numVerts; + int refreshColTeamSize = 0; + int decideSetTeamSize = 0; + if(useTeams) + { + //Compute the recommended team size for RefreshColStatus and DecideSetFunctor (will be constant) + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = KokkosKernels::Impl::get_suggested_team_size(refreshCol, vectorLength); + } + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + decideSetTeamSize = KokkosKernels::Impl::get_suggested_team_size(decideSet, vectorLength); + } + } while(true) { //Compute new row statuses Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); //Compute new col statuses - Kokkos::parallel_for(range_pol(0, colWorkLen), RefreshColStatus(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts)); - //Decide row statuses - Kokkos::parallel_for(range_pol(0, rowWorkLen), DecideSetFunctor(rowStatus, colStatus, rowmap, 
entries, numVerts, rowWorklist)); + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), refreshCol); + else + Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); + } + //Decide row statuses where enough information is available + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); + else + Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); + } //Compact row worklist Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); if(rowWorkLen == 0) From 1b40c8c4bbc0e21ef85db60897cbe63b59102e8d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 21 Sep 2020 11:41:23 -0700 Subject: [PATCH 031/106] Using avg degree threshold to choose team or range for MIS2 If avg degree >= 16, use teams. Otherwise use range. 
--- src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 3392c30452..773fd2f41f 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = execSpaceEnum == KokkosKernels::Impl::Exec_CUDA; + bool useTeams = (execSpaceEnum == KokkosKernels::Impl::Exec_CUDA) && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; @@ -405,14 +405,15 @@ struct D2_MIS_RandomPriority int decideSetTeamSize = 0; if(useTeams) { + team_pol temp(1, 1, vectorLength); //Compute the recommended team size for RefreshColStatus and DecideSetFunctor (will be constant) { RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); - refreshColTeamSize = KokkosKernels::Impl::get_suggested_team_size(refreshCol, vectorLength); + refreshColTeamSize = temp.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = KokkosKernels::Impl::get_suggested_team_size(decideSet, vectorLength); + decideSetTeamSize = temp.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while(true) From d01b6171b25af41e18bc6038c5bfb82a864865b7 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 21 Sep 2020 15:01:48 -0600 Subject: [PATCH 032/106] Rename 'temp' to 'dummyPolicy' more descriptive, much less likely 
to collide --- src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 773fd2f41f..0a5493df7d 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -405,15 +405,15 @@ struct D2_MIS_RandomPriority int decideSetTeamSize = 0; if(useTeams) { - team_pol temp(1, 1, vectorLength); + team_pol dummyPolicy(1, 1, vectorLength); //Compute the recommended team size for RefreshColStatus and DecideSetFunctor (will be constant) { RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); - refreshColTeamSize = temp.team_size_max(refreshCol, Kokkos::ParallelForTag()); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = temp.team_size_max(decideSet, Kokkos::ParallelForTag()); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while(true) From 81c1fe200025331c140016a98c83798aa04c7e03 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 25 Sep 2020 17:13:08 -0600 Subject: [PATCH 033/106] cm_generate_makefile: add hip and amd gpu support --- cm_generate_makefile.bash | 62 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 2bdb004ec2..bb246df3c6 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -21,8 +21,18 @@ get_kokkos_device_list() { for DEVICE_ in $PARSE_DEVICES_LST do UC_DEVICE=$(echo $DEVICE_ | tr "[:lower:]" "[:upper:]") + if [ "${UC_DEVICE}" == "CUDA" ]; then + WITH_CUDA_BACKEND=ON + fi + if [ "${UC_DEVICE}" == "HIP" ]; then + WITH_HIP_BACKEND=ON + fi 
KOKKOS_DEVICE_CMD="-DKokkos_ENABLE_${UC_DEVICE}=ON ${KOKKOS_DEVICE_CMD}" done + if [ "${WITH_CUDA_BACKEND}" == "ON" ] && [ "${WITH_HIP_BACKEND}" == "ON" ]; then + echo "Invalid configuration - Cuda and Hip cannot be simultaneously enabled" + exit + fi } get_kokkos_arch_list() { @@ -59,6 +69,24 @@ get_kokkos_cuda_option_list() { done } +get_kokkos_hip_option_list() { + echo parsing KOKKOS_HIP_OPTIONS=$KOKKOS_HIP_OPTIONS + KOKKOS_HIP_OPTION_CMD= + PARSE_HIP_LST=$(echo $KOKKOS_HIP_OPTIONS | tr "," "\n") + for HIP_ in $PARSE_HIP_LST + do + HIP_OPT_NAME= + if [ "${HIP_}" == "rdc" ]; then + HIP_OPT_NAME=HIP_RELOCATABLE_DEVICE_CODE + else + echo "${HIP_} is not a valid hip option..." + fi + if [ "${HIP_OPT_NAME}" != "" ]; then + KOKKOS_HIP_OPTION_CMD="-DKokkos_ENABLE_${HIP_OPT_NAME}=ON ${KOKKOS_HIP_OPTION_CMD}" + fi + done +} + get_kokkos_option_list() { echo parsing KOKKOS_OPTIONS=$KOKKOS_OPTIONS KOKKOS_OPTION_CMD= @@ -196,15 +224,21 @@ display_help_text() { echo "--prefix=/Install/Path: Path to install the KokkosKernels library." echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. 
Options are:" - echo " [AMD]" + echo " [AMD: CPU]" echo " AMDAVX = AMD CPU" - echo " EPYC = AMD EPYC Zen-Core CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [AMD: GPU]" + echo " VEGA900 = AMD GPU MI25 GFX900" + echo " VEGA906 = AMD GPU MI50/MI60 GFX906" + echo " VEGA908 = AMD GPU" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" @@ -264,6 +298,8 @@ display_help_text() { echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" + echo "--with-hip-options=[OPT]: Additional options to HIP:" + echo " rdc" echo "--with-scalars=[SCALARS]: Set scalars to be instantiated." echo " Options: float, double, complex_float, complex_double" echo "--with-ordinals=[ORDINALS]: Set ordinals to be instantiated." @@ -307,6 +343,10 @@ KOKKOS_MAKEINSTALL_J=4 KERNELS_DEFAULT_ETI_OPTION="" +# For tracking if Cuda and Hip devices are enabled simultaneously +WITH_CUDA_BACKEND=OFF +WITH_HIP_BACKEND=OFF + while [[ $# > 0 ]] do key="$1" @@ -340,6 +380,19 @@ do update_kokkos_devices Cuda CUDA_PATH="${key#*=}" ;; + --with-hip) + update_kokkos_devices Hip + HIP_PATH_HIPCC=$(command -v hipcc) + HIP_PATH=${HIP_PATH_HIPCC%/bin/hipcc} + ;; + # Catch this before '--with-hip*' + --with-hip-options*) + KOKKOS_HIP_OPTIONS="${key#*=}" + ;; + --with-hip*) + update_kokkos_devices Hip + HIP_PATH="${key#*=}" + ;; --with-openmp) update_kokkos_devices OpenMP ;; @@ -606,6 +659,7 @@ get_kokkos_device_list get_kokkos_option_list get_kokkos_arch_list get_kokkos_cuda_option_list +get_kokkos_hip_option_list get_kernels_scalar_list get_kernels_ordinals_list @@ -655,9 +709,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} 
-DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} 
${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J From 32a4e074bc9d4c06d375f060b62a0f005bb8bc0b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 30 Sep 2020 12:29:25 -0600 Subject: [PATCH 034/106] cm_test_all_sandia: add voltrino, rename waterman to weaver --- scripts/cm_test_all_sandia | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 44048528ea..fa09af3c69 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -119,8 +119,13 @@ if [[ "$HOSTNAME" =~ (white|ride).* ]]; then module load git fi -if [[ "$HOSTNAME" =~ waterman.* ]]; then - MACHINE=waterman +if [[ "$HOSTNAME" =~ weaver.* ]]; then + MACHINE=weaver + module load git +fi + +if [[ "$HOSTNAME" =~ .*voltrino.* ]]; then + MACHINE=voltrino module load git fi @@ -491,14 +496,14 @@ elif [ "$MACHINE" = "white" ]; then BASE_MODULE_LIST="cmake/3.12.3,/" IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.0" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.0" + CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,netlib/3.8.0/gcc/7.2.0" GCC74_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.3.4/gcc/7.4.0" CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" - IBM_MODULE_TPL_LIST="cmake/3.12.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.0" + IBM_MODULE_TPL_LIST="cmake/3.12.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" # Don't do pthread on white. 
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -508,7 +513,7 @@ elif [ "$MACHINE" = "white" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.105 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -521,12 +526,13 @@ elif [ "$MACHINE" = "white" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.105 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -536,15 +542,15 @@ elif [ "$MACHINE" = "white" ]; then SPACK_HOST_ARCH="+power8" SPACK_CUDA_ARCH="+kepler37" SPACK_CUDA_HOST_COMPILER="%gcc@7.2.0" -elif [ "$MACHINE" = "waterman" ]; then +elif [ "$MACHINE" = "weaver" ]; then MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True 
BASE_MODULE_LIST="cmake/3.12.3,/" IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.0" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.0" + CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.2.20/gcc/7.2.0" CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" @@ -555,14 +561,13 @@ elif [ "$MACHINE" = "waterman" ]; then # "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - # Don't do pthread on waterman + # Don't do pthread on weaver GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST "Serial" xlC $IBM_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.243 $CUDA10_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -573,15 +578,16 @@ elif [ "$MACHINE" = "waterman" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.3.0 
$BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -591,6 +597,21 @@ elif [ "$MACHINE" = "waterman" ]; then SPACK_HOST_ARCH="+power9" SPACK_CUDA_ARCH="+volta70" +elif [ "$MACHINE" = "voltrino" ]; then + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/19.05.5a,/,gcc/9.3.0" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=KNL" + fi elif [ "$MACHINE" = "bowman" ]; then MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" @@ -1051,7 +1072,7 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = waterman* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" # BLAS_LIBRARIES="openblas" @@ -1062,7 +1083,7 @@ setup_env() { 
KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD="--extra-linker-flags=-lgfortran,-lm" echo "TPL PATHS: KOKKOSKERNELS_TPL_PATH_CMD=$KOKKOSKERNELS_TPL_PATH_CMD" echo "TPL LIBS: KOKKOSKERNELS_TPL_LIBS_CMD=$KOKKOSKERNELS_TPL_LIBS_CMD" - elif ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = waterman* ]]) && [[ "$mod" = netlib* ]]; then + elif ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]]) && [[ "$mod" = netlib* ]]; then BLAS_LIBRARY_DIRS="${BLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${BLAS_ROOT}/lib" BLAS_LIBRARIES="blas" From 4c89f2824d97c00869dfedc4cff0b77e9052f180 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 7 Sep 2020 02:07:17 -0400 Subject: [PATCH 035/106] cuSPARSE 11: fix spgemm and spmv_struct_tunning, see issue #803 Fix for spmv_struct_tunning perf_test that uses cusparse SpMV. Temporary fix for spGEMM, this will require rewritting the interface for it to work properly but for the moment everything builds correctly. --- .../KokkosSparse_spmv_struct_tuning.cpp | 74 +++++++++++++++++-- .../KokkosSparse_spgemm_cuSPARSE_impl.hpp | 17 ++++- .../impl/KokkosSparse_spgemm_impl_def.hpp | 2 +- 3 files changed, 82 insertions(+), 11 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 60779f7fe5..afef5968f0 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -66,12 +66,6 @@ enum {STRUCT, UNSTR}; -#ifdef INT64 -typedef long long int LocalOrdinalType; -#else -typedef int LocalOrdinalType; -#endif - void print_help() { printf("SPMV_struct benchmark code written by Luc Berger-Vergiat.\n"); printf("Options:\n"); @@ -482,6 +476,73 @@ int main(int argc, char **argv) if(compare_cusparse) { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#ifdef CUSPARSE_VERSION + KokkosKernels::Experimental::Controls controls; + + cusparseIndexType_t myCusparseOffsetType = CUSPARSE_INDEX_32I; + cusparseIndexType_t 
myCusparseEntryType = CUSPARSE_INDEX_32I; + cudaDataType myCudaDataType = CUDA_R_64F; + + /* create matrix */ + cusparseSpMatDescr_t A_cusparse; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(&A_cusparse, A.numRows(), A.numCols(), A.nnz(), + (void*) A.graph.row_map.data(), + (void*) A.graph.entries.data(), + (void*) A.values.data(), + myCusparseOffsetType, + myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, + myCudaDataType)); + + /* create lhs and rhs */ + cusparseDnVecDescr_t vecX, vecY; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecX, x1.extent_int(0), (void*) x1.data(), myCudaDataType)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecY, y1.extent_int(0), (void*) y1.data(), myCudaDataType)); + + const double alpha = 1.0, beta = 1.0; + size_t bufferSize = 0; + void* dBuffer = NULL; + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize(controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, + alg, &bufferSize)); + CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); + + /* perform SpMV */ + Kokkos::Profiling::pushRegion("cuSparse spmv test"); + double min_time = 1.0e32; + double max_time = 0.0; + double ave_time = 0.0; + for(int i=0;imax_time) max_time = time; + if(time::value){ const idx *a_xadj = (int *)row_mapA.data(); @@ -143,6 +147,7 @@ namespace Impl{ throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; @@ -186,6 +191,9 @@ namespace Impl{ cin_nonzero_value_view_type valuesC){ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION) + throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); +#else typedef typename KernelHandle::nnz_lno_t idx; typedef typename KernelHandle::nnz_scalar_t value_type; @@ -289,6 +297,7 @@ namespace Impl{ throw std::runtime_error 
("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 595e216700..aa73c1e55b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -99,7 +99,7 @@ void KokkosSPGEMM Kokkos::Impl::Timer timer1; auto new_row_mapB_begin = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(0), b_row_cnt)); auto new_row_mapB_end = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(1), b_row_cnt + 1)); - row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("origianal row flops"), a_row_cnt); + row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("original row flops"), a_row_cnt); //get maximum row flops. maxNumRoughZeros = this->getMaxRoughRowNNZ(a_row_cnt, row_mapA, entriesA, From e6ca8f0ff07d093b79d73d099f06b59b38f9ddda Mon Sep 17 00:00:00 2001 From: jjwilke Date: Mon, 5 Oct 2020 14:01:33 -0700 Subject: [PATCH 036/106] Allow enabling/disabling certain components --- CMakeLists.txt | 9 ++ cmake/fake_tribits.cmake | 126 +++++++++++++-------- cmake/kokkoskernels_tribits.cmake | 161 ++++++++++++++++----------- perf_test/CMakeLists.txt | 9 +- perf_test/batched/CMakeLists.txt | 8 +- perf_test/blas/CMakeLists.txt | 2 + perf_test/blas/blas/CMakeLists.txt | 4 +- perf_test/blas/blas3/CMakeLists.txt | 4 +- perf_test/performance/CMakeLists.txt | 4 +- src/CMakeLists.txt | 141 +++++++++++++++-------- src/kokkoskernels_eti.cmake | 53 +++++---- unit_test/CMakeLists.txt | 57 ++++++---- 12 files changed, 370 insertions(+), 208 deletions(-) create mode 100644 perf_test/blas/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 1149a2101d..2313002481 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,15 @@ ELSE() BOOL "Whether to restrict testing to ETI types. 
Default: ON" ) + + KOKKOSKERNELS_ADD_OPTION( + ENABLED_COMPONENTS + "ALL" + STRING + "A list of components to enable in testing and building" + VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + ) + # ================================================================== # Enable Device Types for ETI (exec- + mem-space) # ================================================================== diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 8d623a67fe..c306891cff 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -24,6 +24,13 @@ ENDFOREACH() ENDFUNCTION() FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) + CMAKE_PARSE_ARGUMENTS(OPT + "" + "" + "VALID_ENTRIES" #if this is a list variable, the valid values in the list + ${ARGN} + ) + SET(CAMEL_NAME KokkosKernels_${SUFFIX}) STRING(TOUPPER ${CAMEL_NAME} UC_NAME) @@ -40,13 +47,28 @@ FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) ENDIF() ENDFOREACH() + #okay, great, we passed the validation test - use the default IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + IF (OPT_VALID_ENTRIES) + STRING(TOUPPER "${OPT_VALID_ENTRIES}" OPT_VALID_ENTRIES_UC) + FOREACH(entry ${${CAMEL_NAME}}) + STRING(TOUPPER ${entry} ENTRY_UC) + IF (NOT ${ENTRY_UC} IN_LIST OPT_VALID_ENTRIES_UC) + MESSAGE(FATAL_ERROR "Given entry ${entry} in list for option ${SUFFIX}. 
" + "Valid case-insensitive values are any of ${OPT_VALID_ENTRIES}") + ENDIF() + ENDFOREACH() + STRING(TOUPPER "${${CAMEL_NAME}}" GIVEN_ENTRIES_UC) + SET(${UC_NAME} ${GIVEN_ENTRIES_UC} PARENT_SCOPE) + ELSE() + SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + ENDIF() ELSE() SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) ENDIF() + ENDFUNCTION() MACRO(KOKKOSKERNELS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE ) @@ -161,52 +183,68 @@ IF(NOT TARGET check) ENDIF() FUNCTION(KOKKOSKERNELS_ADD_TEST) -IF (KOKKOSKERNELS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "" - "EXE;NAME" - "" - ${ARGN}) - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - ${ARGN} - COMM serial mpi - NUM_MPI_PROCS 1 - ${TEST_UNPARSED_ARGUMENTS} - ) -ELSE() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" - "CATEGORIES" - ${ARGN}) - IF(TEST_EXE) - SET(EXE ${TEST_EXE}) - ELSE() - SET(EXE ${TEST_NAME}) - ENDIF() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "COMPONENTS" + ${ARGN}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + CMAKE_PARSE_ARGUMENTS(TEST + "" + "EXE;NAME" + "" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE_ROOT ${TEST_EXE}) + ELSE() + SET(EXE_ROOT ${TEST_NAME}) + ENDIF() + + TRIBITS_ADD_TEST( + ${EXE_ROOT} + NAME ${TEST_NAME} + ${ARGN} + COMM serial mpi + NUM_MPI_PROCS 1 + ${TEST_UNPARSED_ARGUMENTS} + ) ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION 
${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + CMAKE_PARSE_ARGUMENTS(TEST + "WILL_FAIL" + "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" + "CATEGORIES" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE ${TEST_EXE}) + ELSE() + SET(EXE ${TEST_NAME}) + ENDIF() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping test ${TEST_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index 0bd8c04963..a0cc9d30d5 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -127,88 +127,121 @@ ENDIF() ENDFUNCTION() FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE EXE_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${EXE_NAME} ${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;COMPONENTS;TESTONLYLIBS" + ${ARGN}) +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} Kokkos::kokkoskernels) - IF 
(PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS ${TESTONLYLIBS}) + ELSE() + ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + ENDIF() ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping executable ${EXE_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( +FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST ROOT_NAME) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( ${ROOT_NAME} TESTONLYLIBS kokkoskernels_gtest ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi ) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES" - ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi - ) -ELSE() +FUNCTION(KOKKOSKERNELS_IS_ENABLED) CMAKE_PARSE_ARGUMENTS(PARSE "" - "" - "SOURCES;CATEGORIES" + "OUTPUT_VARIABLE" + "COMPONENTS" ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - 
KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() + + IF (KOKKOSKERNELS_ENABLED_COMPONENTS STREQUAL "ALL") + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ELSEIF(PARSE_COMPONENTS) + SET(ENABLED TRUE) + FOREACH(comp ${PARSE_COMPONENTS}) + STRING(TOUPPER ${comp} COMP_UC) + # make sure this is in the list of enabled components + IF(NOT "${COMP_UC}" IN_LIST KOKKOSKERNELS_ENABLED_COMPONENTS) + # if not in the list, one or more components is missing + SET(ENABLED FALSE) + ENDIF() + ENDFOREACH() + SET(${PARSE_OUTPUT_VARIABLE} ${ENABLED} PARENT_SCOPE) + ELSE() + # we did not enable all components and no components + # were given as part of this - we consider this enabled + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ENDIF() ENDFUNCTION() -MACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE EXE_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE +FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + +CMAKE_PARSE_ARGUMENTS(PARSE "" "" - "SOURCES" + "SOURCES;CATEGORIES;COMPONENTS;TESTONLYLIBS" ${ARGN}) -KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkoskernels_gtest - ${PARSE_UNPARSED_ARGUMENTS} +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED ) -IF (NOT KOKKOSKERNELS_HAS_TRILINOS) - TARGET_LINK_LIBRARIES(${EXE_NAME} kokkoskernels_gtest) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + CATEGORIES ${PARSE_CATEGORIES} + TESTONLYLIBS ${PARSE_TESTONLYLIBS} + NUM_MPI_PROCS 1 + COMM serial mpi + ) + ELSE() + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + ENDIF() + KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${EXE_NAME} + ) + ENDIF() +ELSE() + MESSAGE(STATUS 
"Skipping executable/test ${ROOT_NAME} because not all necessary components enabled") ENDIF() -ADD_DEPENDENCIES(check ${EXE_NAME}) -ENDMACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE) + +ENDFUNCTION() + +MACRO(ADD_COMPONENT_SUBDIRECTORY SUBDIR) + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${SUBDIR} + OUTPUT_VARIABLE COMP_SUBDIR_ENABLED + ) + IF (COMP_SUBDIR_ENABLED) + ADD_SUBDIRECTORY(${SUBDIR}) + ELSE() + MESSAGE(STATUS "Skipping subdirectory ${SUBDIR} because component is not enabled") + ENDIF() + UNSET(COMP_SUBDIR_ENABLED) +ENDMACRO() diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 2ec1ff57c8..fe3b3c51ba 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -10,11 +10,10 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. -ADD_SUBDIRECTORY(batched) -ADD_SUBDIRECTORY(graph) -ADD_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(batched) +ADD_COMPONENT_SUBDIRECTORY(graph) +ADD_COMPONENT_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(blas) ADD_SUBDIRECTORY(performance) -ADD_SUBDIRECTORY(blas/blas3) -ADD_SUBDIRECTORY(blas/blas) #ADD_SUBDIRECTORY(common) diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt index b9613c7802..36435ecfc1 100644 --- a/perf_test/batched/CMakeLists.txt +++ b/perf_test/batched/CMakeLists.txt @@ -1,5 +1,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag + SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp +) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi + SOURCES 
KokkosBatched_Test_BlockJacobi_Tutorial.cpp +) diff --git a/perf_test/blas/CMakeLists.txt b/perf_test/blas/CMakeLists.txt new file mode 100644 index 0000000000..2d93de0458 --- /dev/null +++ b/perf_test/blas/CMakeLists.txt @@ -0,0 +1,2 @@ +ADD_SUBDIRECTORY(blas) +ADD_SUBDIRECTORY(blas3) diff --git a/perf_test/blas/blas/CMakeLists.txt b/perf_test/blas/blas/CMakeLists.txt index 98e4ed0859..762c472e22 100644 --- a/perf_test/blas/blas/CMakeLists.txt +++ b/perf_test/blas/blas/CMakeLists.txt @@ -2,4 +2,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( - KokkosBlas_perf_test SOURCES KokkosBlas_perf_test.cpp) + KokkosBlas_perf_test + SOURCES KokkosBlas_perf_test.cpp + ) diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index a46d4a7712..c1e3a117fa 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -2,4 +2,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( - KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp) + KokkosBlas3_perf_test + SOURCES KokkosBlas3_perf_test.cpp +) diff --git a/perf_test/performance/CMakeLists.txt b/perf_test/performance/CMakeLists.txt index 09593b3128..93d377ba60 100644 --- a/perf_test/performance/CMakeLists.txt +++ b/perf_test/performance/CMakeLists.txt @@ -11,12 +11,12 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) IF(TPL_ENABLE_yaml-cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_validate SOURCES performance_validate.cpp ) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_example SOURCES performance_example.cpp ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df360c69de..22c17b5247 100644 --- a/src/CMakeLists.txt +++ 
b/src/CMakeLists.txt @@ -56,285 +56,332 @@ SET(ETI_HEADERS) #Generate @X@ variables in the template X.hpp.in and X.cpp.in #files containing the list of all needed macros KOKKOSKERNELS_GENERATE_ETI(Blas1_abs abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_abs_mv abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal_mv scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby_mv axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_update update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) 
KOKKOSKERNELS_GENERATE_ETI(Blas1_update_mv update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum_mv sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1 nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1_mv nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w_mv nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf_mv nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax_mv iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS 
DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2 nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2_mv nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult_mv mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal_mv reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas2_gemv gemv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trsm trsm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS 
LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_symbolic spgemm_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic + COMPONENTS 
sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_numeric spiluk_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_symbolic sptrsv_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_trsv trsv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_symbolic gauss_seidel_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_numeric gauss_seidel_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) diff --git a/src/kokkoskernels_eti.cmake b/src/kokkoskernels_eti.cmake index 1179ec9c41..04a6f412c9 100644 --- a/src/kokkoskernels_eti.cmake +++ b/src/kokkoskernels_eti.cmake @@ -126,7 +126,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) 
CMAKE_PARSE_ARGUMENTS(ETI "" "HEADER_LIST;SOURCE_LIST" - "TYPE_LISTS" + "TYPE_LISTS;COMPONENTS" ${ARGN}) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) @@ -134,26 +134,38 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") - KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) - FOREACH(ETI ${${FUNCTION_NAME}_eti}) - SET(MACRO_STRING "(") - FOREACH(TYPE_NAME ${${ETI}}) - STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + # if this is tied to particular components + # see whether those components are enabled + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${ETI_COMPONENTS} + OUTPUT_VARIABLE ETI_COMP_IS_ENABLED + ) + + IF (ETI_COMP_IS_ENABLED) + MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") + KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) + FOREACH(ETI ${${FUNCTION_NAME}_eti}) + SET(MACRO_STRING "(") + FOREACH(TYPE_NAME ${${ETI}}) + STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + ENDFOREACH() + STRING(APPEND MACRO_STRING ")") + STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) + #Make a single header file for all instances + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + #Make a different source file for each instance + SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") + SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") + SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) + LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) ENDFOREACH() - 
STRING(APPEND MACRO_STRING ")") - STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) - #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - #Make a different source file for each instance - SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") - SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") - SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - ENDFOREACH() + ELSE() + MESSAGE(STATUS "Skipping ETI files for ${FUNCTION_NAME} because not all components are enabled") + ENDIF() SET(AVAIL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") @@ -163,7 +175,6 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index e610ded3f9..b8060d3cb1 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -39,19 +39,21 @@ IF (KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/cuda) APPEND_GLOB(CUDA_BLAS_SOURCES 
${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_cuda SOURCES Test_Main.cpp ${CUDA_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(CUDA_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_cuda SOURCES Test_Main.cpp ${CUDA_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(CUDA_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse*.cpp) @@ -66,27 +68,29 @@ IF (KOKKOS_ENABLE_CUDA) "${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse_Utils_cusparse.cpp") ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_cuda SOURCES Test_Main.cpp ${CUDA_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(CUDA_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_cuda SOURCES Test_Main.cpp ${CUDA_GRAPH_SOURCES} + COMPONENTS graph ) #currently float 128 test is not working. So common tests are explicitly added. 
APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_cuda SOURCES Test_Main.cpp @@ -104,42 +108,46 @@ IF (KOKKOS_ENABLE_OPENMP) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-openmp.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_openmp SOURCES Test_Main.cpp ${OPENMP_BLAS_SOURCES} - ) + COMPONENTS blas + ) APPEND_GLOB(OPENMP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_openmp SOURCES Test_Main.cpp ${OPENMP_BATCHED_DLA_SOURCES} - ) + COMPONENTS batched + ) APPEND_GLOB(OPENMP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_openmp SOURCES Test_Main.cpp ${OPENMP_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(OPENMP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_openmp SOURCES Test_Main.cpp ${OPENMP_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_openmp SOURCES Test_Main.cpp @@ -157,43 +165,47 @@ IF (KOKKOS_ENABLE_SERIAL) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-serial.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_serial SOURCES Test_Main.cpp ${SERIAL_BLAS_SOURCES} # ARGS ${DISABLE_SLOW_DGEMM_DOUBLE_TEST} + COMPONENTS blas ) APPEND_GLOB(SERIAL_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + 
KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_serial SOURCES Test_Main.cpp ${SERIAL_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(SERIAL_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_serial SOURCES Test_Main.cpp ${SERIAL_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(SERIAL_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_serial SOURCES Test_Main.cpp ${SERIAL_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_serial SOURCES Test_Main.cpp @@ -207,35 +219,38 @@ IF (KOKKOS_ENABLE_PTHREAD) APPEND_GLOB(THREADS_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_threads SOURCES Test_Main.cpp ${THREADS_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(THREADS_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_threads SOURCES Test_Main.cpp ${THREADS_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(THREADS_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_threads SOURCES Test_Main.cpp ${THREADS_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_threads SOURCES Test_Main.cpp From 498ca69fbf56a9f652d8029e105a2b7e0c54c70f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 6 
Oct 2020 14:08:10 -0700 Subject: [PATCH 037/106] cm_test_all_sandia: add cuda/11 for kokkos-dev-2 more kokkos-dev and sems module updates --- scripts/cm_test_all_sandia | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index fa09af3c69..1343000fb4 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -413,7 +413,6 @@ if [ "$MACHINE" = "sems" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then @@ -421,7 +420,6 @@ if [ "$MACHINE" = "sems" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) else @@ -430,10 +428,8 @@ if [ "$MACHINE" = "sems" ]; then "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc 
$INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -476,13 +472,15 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/4.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -764,6 +762,7 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then GCC91_MODULE_LIST="sems-env,sems-cmake/3.12.2,/" NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" + NVCC11_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/9.2.0" CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" 
CLANG8_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/10.0" @@ -773,7 +772,6 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then BUILD_LIST_CLANG="Serial,Pthread,OpenMP" CLANG8_CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-Wno-pass-failed" - PGI_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-gcc/7.3.0,/" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) @@ -787,6 +785,7 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then "clang/9.0.0 $BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/9.2 $NVCC_SEMSMODULE_LIST "Cuda_Serial" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) @@ -799,11 +798,13 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" "clang/9.0.0 $BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/9.2 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ 
$CLANG8_CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" @@ -817,15 +818,10 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/10.0.0 $BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "pgi/19.4 $PGI_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" ) fi From 1c3a92144889920641361f3fa47bc369ba0d7332 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 7 Oct 2020 13:54:50 -0700 Subject: [PATCH 038/106] Remove pre-3.0 deprecated code Addresses cleanup for issue #793 --- src/common/KokkosKernels_Utils.hpp | 110 ------------------ .../KokkosGraph_Distance1ColorHandle.hpp | 10 +- src/graph/KokkosGraph_Distance2Color.hpp | 74 ------------ src/graph/KokkosGraph_GraphColorHandle.hpp | 9 -- src/graph/KokkosGraph_graph_color.hpp | 9 -- src/sparse/KokkosSparse_BlockCrsMatrix.hpp | 7 -- src/sparse/KokkosSparse_CrsMatrix.hpp | 7 -- .../KokkosSparse_gauss_seidel_handle.hpp | 9 -- .../KokkosSparse_spgemm_impl_compression.hpp | 8 -- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 8 -- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 8 -- .../KokkosSparse_spgemm_impl_symbolic.hpp | 24 ---- .../KokkosSparse_spgemm_impl_triangle.hpp 
| 8 -- ...se_spgemm_impl_triangle_no_compression.hpp | 8 -- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 28 +---- .../impl/KokkosSparse_spmv_struct_impl.hpp | 16 --- .../Test_Graph_graph_color_distance2.hpp | 54 --------- 17 files changed, 5 insertions(+), 392 deletions(-) diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index 6c34a64ae8..80d22ec4b0 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -90,78 +90,6 @@ void get_histogram( kk_get_histogram(in_elements, in_view, histogram); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void get_suggested_vector_team_size( - int max_allowed_team_size, - int &suggested_vector_size_, - int &suggested_team_size_, - idx nr, idx nnz){ - - - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - - suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; - } -#else - (void)max_allowed_team_size; - (void)nr; - (void)nnz; -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if 
(std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -} - -#else template void get_suggested_vector_size( int &suggested_vector_size_, @@ -237,8 +165,6 @@ int get_suggested_team_size(Functor& f, int vector_size) } } -#endif //ifdef KOKKOS_ENABLE_DEPRECATED_CODE ... else - template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { @@ -1132,21 +1058,12 @@ void symmetrize_and_get_lower_diagonal_edge_list( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(fse); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif //std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" << vector_size << " tsm:" << teamSizeMax<< std::endl; team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); @@ -1186,21 +1103,12 @@ void symmetrize_and_get_lower_diagonal_edge_list( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, FSCH); @@ -1261,21 +1169,12 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int 
max_allowed_team_size = team_policy::team_size_max(fse); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", @@ -1311,22 +1210,13 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 49e20d5395..268c8e6a68 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -510,20 +510,12 @@ class GraphColoringHandle int vector_size = 0; CountLowerTriangleTeam clt (nv, xadj, adj, lower_count); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy_t::team_size_max(clt); - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - nv, ne); -#else + KokkosKernels::Impl::get_suggested_vector_size( vector_size, nv, ne); teamSizeMax = KokkosKernels::Impl::get_suggested_team_size(clt, vector_size); -#endif Kokkos::parallel_for("KokkosGraph::CountLowerTriangleTeam", team_policy_t((nv + teamSizeMax 
- 1) / teamSizeMax, teamSizeMax, vector_size), diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp index dacf9c99db..53f2b4a26b 100644 --- a/src/graph/KokkosGraph_Distance2Color.hpp +++ b/src/graph/KokkosGraph_Distance2Color.hpp @@ -245,80 +245,6 @@ void bipartite_color_columns( gch_d2->set_coloring_time(timer.seconds()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -/** - * (DEPRECATED) Compute the left-side coloring of a bipartite matrix/graph. - * Equivalent to bipartite_color_rows(), except this interface requires the user - * to compute (col_map, col_entries) as the transpose of the graph (if nonsymmetric). - * - * This function is deprecated because it's not possible to support both undirected - * distance-2 coloring and bipartite one-sided coloring - * in a single interface. However, if the input graph has all diagonal entries present and - * is symmetric (which is generally the case for discretized PDE matrices), then this - * function is also equivalent to graph_color_distance2(). - * - * In any case, the graphs (row_map, row_entries) and (col_map, col_entries) must be transposes - * of each other. 
- * - * @param[in] handle The Kernel Handle - * @param[in] num_rows Number of rows in the matrix (number of vertices) - * @param[in] num_cols Number of columns in the matrix - * @param[in] row_map Row map - * @param[in] row_entries Row entries - * @param[in] col_map Column map - * @param[in] col_entries Column entries - */ -template -void graph_compute_distance2_color(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_cols, - lno_row_view_t_ row_map, - lno_nnz_view_t_ row_entries, - // If graph is symmetric, simply pass the same graph twice: - // row_map == col_map, row_entries == col_entries - lno_col_view_t_ col_map, - lno_colnnz_view_t_ col_entries) -{ - using lno_t = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using memory_space = typename KernelHandle::HandleTempMemorySpace; - static_assert(std::is_same::value, - "Row and col maps must have the same value type (size_type)."); - static_assert(std::is_same::value, - "Row and col entries must have the same value type (nnz_lno_t)."); - //Internally, coloring accesses the graph through unmanaged views - //These are explicitly nonconst so that copies of adj for edge-filtering - //(which must be mutable) can use the same type. - // - //The original input graphs will never be modified. - using InternalRowmap = Kokkos::View >; - using InternalColinds = Kokkos::View >; - if(row_entries.extent(0) != col_entries.extent(0)) - { - throw std::runtime_error("row_entries and col_entries must represent transposes of each other, but they have different lengths"); - } - Kokkos::Impl::Timer timer; - // Set our handle pointer to a GraphColoringHandleType. - auto *gch_d2 = handle->get_distance2_graph_coloring_handle(); - // Create a view to save the colors to. 
- using color_view_type = typename KernelHandle::GraphColorDistance2HandleType::color_view_type; - color_view_type colors_out("Graph Colors", num_rows); - InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); - InternalColinds rowentries_internal(row_entries.data(), row_entries.extent(0)); - InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); - InternalColinds colentries_internal(col_entries.data(), col_entries.extent(0)); - Impl::GraphColorDistance2 - gc(num_rows, num_cols, row_map, row_entries, col_map, col_entries, gch_d2); - gc.compute_distance2_color(); - gch_d2->add_to_overall_coloring_time(timer.seconds()); - gch_d2->set_coloring_time(timer.seconds()); -} -#endif - } // end namespace Experimental } // end namespace KokkosGraph diff --git a/src/graph/KokkosGraph_GraphColorHandle.hpp b/src/graph/KokkosGraph_GraphColorHandle.hpp index de9fd6d8f4..9526c34b0e 100644 --- a/src/graph/KokkosGraph_GraphColorHandle.hpp +++ b/src/graph/KokkosGraph_GraphColorHandle.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1ColorHandle.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_GraphColorHandle.hpp header is replaced by KokkosGraph_Distance1ColorHandle.hpp") - -#endif - - diff --git a/src/graph/KokkosGraph_graph_color.hpp b/src/graph/KokkosGraph_graph_color.hpp index 4494ecc509..9526c34b0e 100644 --- a/src/graph/KokkosGraph_graph_color.hpp +++ b/src/graph/KokkosGraph_graph_color.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. 
*/ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1Color.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_graph_color.hpp header will be replaced by KokkosGraph_Distance1Color.hpp") - -#endif - - diff --git a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp index 9cfd97afbb..61f3550275 100644 --- a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp +++ b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp @@ -415,17 +415,10 @@ class BlockCrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef BlockCrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index bba54c613c..938d6e91be 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -406,17 +406,10 @@ class CrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef CrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. 
- typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 7d137f4590..2def3a17f1 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -188,12 +188,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; @@ -445,13 +441,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 
5d98e28b98..6d240d11b3 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 5303a46c40..095cef74b5 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -221,19 +221,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 3ba3d4e443..415bd1ed3a 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -130,19 +130,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case 
KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 36afa46eef..9f4f7ec753 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -197,19 +197,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -780,19 +772,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -2587,19 
+2571,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index c53f8b461c..d8997fcc12 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index 119e6cddc6..e59b95e8ac 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -202,19 +202,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return 
Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3ea7d150b6..b14f781320 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -385,11 +385,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); const int rows_per_thread = RowsPerThread (NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy::team_size_recommended (op, vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -948,11 +944,7 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a // team_size is a hardware resource thing so it might legitimately // be int. 
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -973,11 +965,7 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1037,11 +1025,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph // team_size is a hardware resource thing so it might legitimately // be int. 
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1062,11 +1046,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 51d2189c5c..a9c62806fd 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1377,11 +1377,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware 
resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1402,11 +1398,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1466,11 +1458,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. 
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1491,11 +1479,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index cc3931083b..69fc77cca2 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -323,61 +323,7 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void test_old_d2(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth, lno_t row_size_variance) -{ - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using crsMat = 
KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename graph_type::row_map_type::non_const_type; - using entries_t = typename graph_type::entries_type::non_const_type; - using KernelHandle = KokkosKernelsHandle< - size_type, lno_t, double, - execution_space, memory_space, memory_space>; - //Generate graph - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); - auto G = A.graph; - rowmap_t t_rowmap("rowmap^T", numCols + 1); - entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosKernels::Impl::transpose_graph - - (numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); - auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); - auto t_rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); - auto t_entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); - std::vector algos = - {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; - for(auto algo : algos) - { - KernelHandle kh; - kh.create_distance2_graph_coloring_handle(algo); - // Compute the one-sided bipartite coloring. 
- graph_compute_distance2_color - (&kh, numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - execution_space().fence(); - auto coloring_handle = kh.get_distance2_graph_coloring_handle(); - auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); - EXPECT_LE(numColors, numRows); - bool success = Test::verifyBipartitePartialColoring - - (numRows, numCols, rowmapHost, entriesHost, t_rowmapHost, t_entriesHost, colorsHost); - EXPECT_TRUE(success) << "Old dist-2 coloring: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; - kh.destroy_distance2_graph_coloring_handle(); - } -} -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - test_old_d2(2000, 4000, 3000 * 20, 800, 10); \ - test_old_d2(4000, 2000, 3000 * 20, 800, 10); -#else #define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) -#endif #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ From 08bc3e89e8b9e63955b1e80accf6b6ba1c0175b4 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 7 Oct 2020 15:06:34 -0600 Subject: [PATCH 039/106] Remove the now empty deprecated D2 coloring test --- unit_test/graph/Test_Graph_graph_color_distance2.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 69fc77cca2..6f60fc9d62 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -323,18 +323,12 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth } } -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) - #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, 
graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_dist2_coloring(5000, 5000 * 20, 1000, 10); \ test_dist2_coloring(50, 50 * 10, 40, 10); \ } \ - TEST_F(TestCategory, graph##_##graph_color_deprecated_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ - { \ - DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - } \ TEST_F(TestCategory, graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_bipartite_symmetric(50, 50 * 5, 30, 1); \ From 3350f004d074d884ea72869012485b1149ca64ad Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 7 Oct 2020 14:09:48 -0700 Subject: [PATCH 040/106] Update integration scripts to remove deprecated code setting --- .../configure-atdm-cuda-depoff-dbg.sh | 53 ----------------- .../configure-atdm-cuda-depoff.sh | 58 ------------------- .../configure-atdm-cuda-depon-dbg.sh | 1 - .../configure-atdm-cuda-depon.sh | 1 - .../configure-atdm-cuda-ride-rdc-depoff.sh | 35 ----------- .../configure-atdm-cuda-ride-rdc-depon.sh | 1 - .../ATDM_configurations/configure-atdm-env.sh | 1 - 7 files changed, 150 deletions(-) delete mode 100755 scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh delete mode 100755 scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh delete mode 100755 scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh deleted file mode 100755 index 375b7f8712..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-dbg - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ - -DTpetra_ENABLE_DEBUG=ON \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepOffdbg -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh deleted file mode 100755 index 9f35eeed3f..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-opt - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepCodeOFF -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Or submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh index 41160c938c..c6af962034 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh @@ -33,7 +33,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -DTpetra_ENABLE_DEBUG=ON \ diff --git 
a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh index 955821005f..9403741586 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh deleted file mode 100755 index da9017e388..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-rdc-release-debug-pt - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestKokkos-DepCodeOn-rdcpt -W 07:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh index 01e2def015..d508d4c77a 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh @@ -16,7 +16,6 @@ cmake \ -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ -DTrilinos_ENABLE_TESTS=ON \ -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh index 76e0391912..7be71edc1c 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ 
-DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR From 516c2ebad49f9e99a6a25f0bec8451be5ca9a01e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 7 Oct 2020 18:19:53 -0700 Subject: [PATCH 041/106] cm_test_all_sandia: fix determination of CM_ALL_SCRIPT_PATH var Broken for absolute paths, causing update_libs.sh to not be called on some systems --- scripts/cm_test_all_sandia | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 1343000fb4..7f14255b7f 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -1099,7 +1099,7 @@ setup_env() { fi if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then - echo calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE + echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE" source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE fi @@ -1388,8 +1388,7 @@ wait_summarize_and_exit() { # CM_ALL_SCRIPT=$0 -CM_ALL_SCRIPT_PATH=`pwd` -CM_ALL_SCRIPT_PATH=${CM_ALL_SCRIPT_PATH}/`dirname $CM_ALL_SCRIPT` +CM_ALL_SCRIPT_PATH=$(cd `dirname $CM_ALL_SCRIPT` && pwd) ROOT_DIR=$(get_test_root_dir) mkdir -p $ROOT_DIR From 6eb72db51e0057c6bd5a0f6832954a0ce8a42253 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 8 Oct 2020 13:26:20 -0600 Subject: [PATCH 042/106] WIP: factor out explicit graph coarsening from CGS --- src/common/KokkosKernels_SparseUtils.hpp | 87 ++++- src/graph/KokkosGraph_ExplicitCoarsening.hpp | 117 +++++++ .../KokkosGraph_ExplicitCoarsening_impl.hpp | 303 ++++++++++++++++++ ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 251 +-------------- 4 files changed, 511 insertions(+), 247 deletions(-) create mode 100644 src/graph/KokkosGraph_ExplicitCoarsening.hpp create mode 100644 src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 6f0c7ed647..2547c2e1b9 100644 
--- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1168,14 +1168,14 @@ struct MergedRowmapFunctor }; template -struct MergedEntriesFunctor +struct MatrixMergedEntriesFunctor { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; using scalar_t = typename values_t::non_const_value_type; //Precondition: entries are sorted within each row - MergedEntriesFunctor( + MatrixMergedEntriesFunctor( const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) : rowmap(rowmap_), entries(entries_), values(values_), @@ -1225,6 +1225,52 @@ struct MergedEntriesFunctor values_t mergedValues; }; +template +struct GraphMergedEntriesFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + //Precondition: entries are sorted within each row + GraphMergedEntriesFunctor( + const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) + : rowmap(rowmap_), entries(entries_), + mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with, nothing to do + return; + } + //Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(accumCol != entries(j)) + { + //write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + //always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + 
entries_t mergedEntries; +}; + //Sort the rows of matrix, and merge duplicate entries. template crsMat_t sort_and_merge_matrix(const crsMat_t& A) @@ -1248,7 +1294,7 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) values_t mergedValues("SortedMerged values", numCompressedEntries); //Compute merged entries and values Kokkos::parallel_for(range_t(0, A.numRows()), - MergedEntriesFunctor + MatrixMergedEntriesFunctor (A.graph.row_map, A.graph.entries, A.values, mergedRowmap, mergedEntries, mergedValues)); //Finally, construct the new compressed matrix @@ -1256,6 +1302,41 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) mergedValues, mergedRowmap, mergedEntries); } +template +void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if(numRows <= 1) + { + //Matrix has zero rows + rowmap_out = rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + //Sort in place + sort_crs_graph(rowmap_in, entries_in); + //Count entries per row into a new rowmap, in terms of merges that can be done + rowmap_out = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + MergedRowmapFunctor(rowmap_out, rowmap_in, entries_in), numCompressedEntries); + //Prefix sum to get rowmap + kk_exclusive_parallel_prefix_sum(numRows + 1, rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + //Compute merged entries and values + Kokkos::parallel_for(range_t(0, numRows), + GraphMergedEntriesFunctor + (rowmap_in, entries_in, + rowmap_out, entries_out)); +} + template = numVerts are 
discarded. +//The labels should be in the range [0, numCoarseVerts), and the output graph wil have numCoarseVerts. +// +//If compress, sort and merge entries in each row. +//An uncompressed graph will still work as input to some things like D1 graph coloring. + +template +void graph_explicit_coarsen( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +//Same as above, but also produce the map from coarse vertices to fine vertices (inverse map of labels) +template +void graph_explicit_coarsen_with_inverse_map( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views 
have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + inverseOffsets = egc.clusterOffsets; + inverseLabels = egc.clusterVerts; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +}} + +#endif diff --git a/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp new file mode 100644 index 0000000000..65ed3a1415 --- /dev/null +++ b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -0,0 +1,303 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP +#define KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP + +namespace KokkosGraph { +namespace Impl { + +template +struct ExplicitGraphCoarsening +{ + using exec_space = typename device_t::execution_space; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_member_t = typename team_pol::member_type; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + + struct ClusterSizeFunctor + { + ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) + : counts(counts_), vertClusters(vertClusters_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + Kokkos::atomic_increment(&counts(vertClusters(i))); + } + ordinal_view_t counts; + labels_t vertClusters; + }; + + struct FillClusterVertsFunctor + { + FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const ordinal_view_t& insertCounts_) + : clusterOffsets(clusterOffsets_), 
clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + lno_t cluster = vertClusters(i); + lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + clusterVerts(offset) = i; + } + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + ordinal_view_t insertCounts; + }; + + struct BuildCrossClusterMaskFunctor + { + BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& mask_) + : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) + {} + + //Used a fixed-size hash set in shared memory + KOKKOS_INLINE_FUNCTION constexpr int tableSize() const + { + //Should always be a power-of-two, so that X % tableSize() reduces to a bitwise and. + return 512; + } + + //Given a cluster index, get the hash table index. + //This is the 32-bit xorshift RNG, but it works as a hash function. + KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(lno_t cluster) const + { + unsigned x = cluster; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + KOKKOS_INLINE_FUNCTION bool lookup(lno_t cluster, int* table) const + { + unsigned h = xorshiftHash(cluster); + for(unsigned i = h; i < h + 2; i++) + { + if(table[i % tableSize()] == cluster) + return true; + } + return false; + } + + //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) + //by inserting nei into the table. 
+ KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const + { + unsigned h = xorshiftHash(nei); + for(unsigned i = h; i < h + 2; i++) + { + if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) + return true; + } + return false; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const + { + lno_t cluster = t.league_rank(); + lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); + //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. + //If it fills up (very unlikely) then just count every remaining edge going to another cluster + //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. + //each thread handles a cluster + int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); + //mark every entry as cluster (self-loop) to represent free/empty + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), + [&](const lno_t i) + { + table[i] = cluster; + }); + t.team_barrier(); + //now, for each row belonging to the cluster, iterate through the neighbors + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), + [&] (const lno_t i) + { + lno_t row = clusterVerts(clusterOffsets(cluster) + i); + lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), + [&] (const lno_t j) + { + lno_t nei = colinds(rowmap(row) + j); + //Remote neighbors are not included + if(nei >= numRows) + return; + lno_t neiCluster = vertClusters(nei); + if(neiCluster != cluster) + { + //Have a neighbor. Try to find it in the table. + if(!lookup(neiCluster, table)) + { + //Not in the table. Try to insert it. 
+ insert(cluster, neiCluster, table); + //Whether or not insertion succeeded, + //this is a cross-cluster edge possibly not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); + } + + size_t team_shmem_size(int teamSize) const + { + return tableSize() * sizeof(int); + } + + lno_t numRows; + fine_rowmap_t rowmap; + fine_entries_t colinds; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + bitset_t mask; + }; + + struct FillClusterEntriesFunctor + { + FillClusterEntriesFunctor( + const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const coarse_rowmap_t& clusterRowmap_, const coarse_entries_t& clusterEntries_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& edgeMask_) + : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) + {} + //Run this scan over entries in clusterVerts (reordered point rows) + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, const bool& finalPass) const + { + lno_t numRows = rowmap.extent(0) - 1; + lno_t row = clusterVerts(i); + size_type rowStart = rowmap(row); + size_type rowEnd = rowmap(row + 1); + lno_t cluster = vertClusters(row); + lno_t clusterStart = clusterOffsets(cluster); + //Count the number of entries in this row. + //This is how much lcount will be increased by, + //yielding the offset corresponding to + //these point entries in the cluster entries. 
+ lno_t rowEntries = 0; + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + rowEntries++; + } + if(finalPass) + { + //if this is the last row in the cluster, update the upper bound in clusterRowmap + if(i == clusterStart) + { + clusterRowmap(cluster) = lcount; + } + lno_t clusterEdge = lcount; + //populate clusterEntries for these edges + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + { + clusterEntries(clusterEdge++) = vertClusters(colinds(j)); + } + } + } + //update the scan result at the end (exclusive) + lcount += rowEntries; + if(i == numRows - 1 && finalPass) + { + //on the very last row, set the last entry of the cluster rowmap + clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; + } + } + fine_rowmap_t rowmap; + fine_entries_t colinds; + coarse_rowmap_t clusterRowmap; + coarse_entries_t clusterEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + const_bitset_t edgeMask; + }; + + //Constructor just does the computation and outputs to coarseRowmap, coarseEntries. 
+ ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, lno_t numCoarseVerts) + { + lno_t numFineVerts = fineRowmap.extent(0); + if(numFineVerts <= 1) + { + coarseRowmap = coarse_rowmap_t(); + coarseEntries = coarse_entries_t(); + return; + } + numFineVerts--; + clusterOffsets = ordinal_view_t("Cluster offsets", numCoarseVerts + 1); + clusterVerts = ordinal_view_t(Kokkos::ViewAllocateWithoutInitializing("Cluster verts"), numFineVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); + { + coarse_entries_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); + } + //Determine the set of edges (in the point graph) that cross between two distinct clusters + int vectorSize = KokkosKernels::Impl::kk_get_suggested_vector_size(numFineVerts, fineEntries.extent(0), KokkosKernels::Impl::kk_get_exec_space_type()); + bitset_t crossClusterEdgeMask(fineEntries.extent(0)); + size_type numClusterEdges; + { + BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask); + int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. 
+ int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); + Kokkos::parallel_for(team_pol(numCoarseVerts, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); + numClusterEdges = crossClusterEdgeMask.count(); + } + coarseRowmap = coarse_rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph rowmap"), numCoarseVerts + 1); + coarseEntries = coarse_entries_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph colinds"), numClusterEdges); + Kokkos::parallel_scan(range_pol(0, numFineVerts), FillClusterEntriesFunctor + (fineRowmap, fineEntries, coarseRowmap, coarseEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask)); + } + + coarse_rowmap_t coarseRowmap; + coarse_entries_t coarseEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; +}; + +}} + +#endif diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index bca3bd725a..0596c6f3a5 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -57,6 +57,7 @@ #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosGraph_MIS2.hpp" +#include "KokkosGraph_ExplicitCoarsening.hpp" namespace KokkosSparse{ namespace Impl{ @@ -495,208 +496,6 @@ namespace KokkosSparse{ nnz_lno_t clusterSize; }; - template - struct ClusterSizeFunctor - { - ClusterSizeFunctor(nnz_view_t& counts_, nnz_view_t& vertClusters_) - : counts(counts_), vertClusters(vertClusters_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - Kokkos::atomic_increment(&counts(vertClusters(i))); - } - nnz_view_t counts; - nnz_view_t vertClusters; - }; - - template - struct FillClusterVertsFunctor - { - FillClusterVertsFunctor(nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, 
nnz_view_t& insertCounts_) - : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - nnz_lno_t cluster = vertClusters(i); - nnz_lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); - clusterVerts(offset) = i; - } - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - nnz_view_t insertCounts; - }; - - template - struct BuildCrossClusterMaskFunctor - { - BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_) - : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) - {} - - //Used a fixed-size hash set in shared memory - KOKKOS_INLINE_FUNCTION constexpr int tableSize() const - { - //Should always be a power-of-two, so that X % tableSize() reduces to a bitwise and. - return 512; - } - - //Given a cluster index, get the hash table index. - //This is the 32-bit xorshift RNG, but it works as a hash function. - KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(nnz_lno_t cluster) const - { - unsigned x = cluster; - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - KOKKOS_INLINE_FUNCTION bool lookup(nnz_lno_t cluster, int* table) const - { - unsigned h = xorshiftHash(cluster); - for(unsigned i = h; i < h + 2; i++) - { - if(table[i % tableSize()] == cluster) - return true; - } - return false; - } - - //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) - //by inserting nei into the table. 
- KOKKOS_INLINE_FUNCTION bool insert(nnz_lno_t cluster, nnz_lno_t nei, int* table) const - { - unsigned h = xorshiftHash(nei); - for(unsigned i = h; i < h + 2; i++) - { - if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) - return true; - } - return false; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const - { - nnz_lno_t cluster = t.league_rank(); - nnz_lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); - //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. - //If it fills up (very unlikely) then just count every remaining edge going to another cluster - //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. - //each thread handles a cluster - int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); - //mark every entry as cluster (self-loop) to represent free/empty - Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), - [&](const nnz_lno_t i) - { - table[i] = cluster; - }); - t.team_barrier(); - //now, for each row belonging to the cluster, iterate through the neighbors - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), - [&] (const nnz_lno_t i) - { - nnz_lno_t row = clusterVerts(clusterOffsets(cluster) + i); - nnz_lno_t rowDeg = rowmap(row + 1) - rowmap(row); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), - [&] (const nnz_lno_t j) - { - nnz_lno_t nei = colinds(rowmap(row) + j); - //Remote neighbors are not included - if(nei >= numRows) - return; - nnz_lno_t neiCluster = vertClusters(nei); - if(neiCluster != cluster) - { - //Have a neighbor. Try to find it in the table. - if(!lookup(neiCluster, table)) - { - //Not in the table. Try to insert it. 
- insert(cluster, neiCluster, table); - //Whether or not insertion succeeded, - //this is a cross-cluster edge possibly not seen before - mask.set(rowmap(row) + j); - } - } - }); - }); - } - - size_t team_shmem_size(int teamSize) const - { - return tableSize() * sizeof(int); - } - - nnz_lno_t numRows; - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - bitset_t mask; - }; - - template - struct FillClusterEntriesFunctor - { - FillClusterEntriesFunctor( - Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterRowmap_, nnz_view_t& clusterEntries_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& edgeMask_) - : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) - {} - //Run this scan over entries in clusterVerts (reordered point rows) - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i, nnz_lno_t& lcount, const bool& finalPass) const - { - nnz_lno_t numRows = rowmap.extent(0) - 1; - nnz_lno_t row = clusterVerts(i); - size_type rowStart = rowmap(row); - size_type rowEnd = rowmap(row + 1); - nnz_lno_t cluster = vertClusters(row); - nnz_lno_t clusterStart = clusterOffsets(cluster); - //Count the number of entries in this row. - //This is how much lcount will be increased by, - //yielding the offset corresponding to - //these point entries in the cluster entries. 
- nnz_lno_t rowEntries = 0; - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - rowEntries++; - } - if(finalPass) - { - //if this is the last row in the cluster, update the upper bound in clusterRowmap - if(i == clusterStart) - { - clusterRowmap(cluster) = lcount; - } - nnz_lno_t clusterEdge = lcount; - //populate clusterEntries for these edges - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - { - clusterEntries(clusterEdge++) = vertClusters(colinds(j)); - } - } - } - //update the scan result at the end (exclusive) - lcount += rowEntries; - if(i == numRows - 1 && finalPass) - { - //on the very last row, set the last entry of the cluster rowmap - clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; - } - } - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterRowmap; - nnz_view_t clusterEntries; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - const_bitset_t edgeMask; - }; - //Assign cluster labels to vertices, given that the vertices are naturally //ordered so that contiguous groups of vertices form decent clusters. template @@ -768,8 +567,6 @@ namespace KokkosSparse{ //Now that a symmetric graph is available, build the cluster graph (also symmetric) nnz_lno_t clusterSize = gsHandle->get_cluster_size(); nnz_lno_t numClusters = (num_rows + clusterSize - 1) / clusterSize; - nnz_view_t clusterOffsets("Cluster offsets", numClusters + 1); - nnz_view_t clusterVerts("Cluster -> vertices", num_rows); raw_rowmap_t raw_sym_xadj; raw_colinds_t raw_sym_adj; if(this->is_symmetric) @@ -811,46 +608,12 @@ namespace KokkosSparse{ std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); #endif - //Construct the cluster offset and vertex array. These allow fast iteration over all vertices in a given cluster. 
- Kokkos::parallel_for(my_exec_space(0, num_rows), ClusterSizeFunctor(clusterOffsets, vertClusters)); - KokkosKernels::Impl::exclusive_parallel_prefix_sum(numClusters + 1, clusterOffsets); - { - nnz_view_t tempInsertCounts("Temporary cluster insert counts", numClusters); - Kokkos::parallel_for(my_exec_space(0, num_rows), FillClusterVertsFunctor(clusterOffsets, clusterVerts, vertClusters, tempInsertCounts)); - } -#if KOKKOSSPARSE_IMPL_PRINTDEBUG - { - auto clusterOffsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterOffsets); - auto clusterVertsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterVerts); - puts("Clusters (cluster #, and vertex #s):"); - for(nnz_lno_t i = 0; i < numClusters; i++) - { - printf("%d: ", (int) i); - for(nnz_lno_t j = clusterOffsetsHost(i); j < clusterOffsetsHost(i + 1); j++) - { - printf("%d ", (int) clusterVerts(j)); - } - putchar('\n'); - } - printf("\n\n\n"); - } -#endif - //Determine the set of edges (in the point graph) that cross between two distinct clusters - int vectorSize = this->handle->get_suggested_vector_size(num_rows, raw_sym_adj.extent(0)); - bitset_t crossClusterEdgeMask(raw_sym_adj.extent(0)); - size_type numClusterEdges; - { - BuildCrossClusterMaskFunctor - buildEdgeMask(raw_sym_xadj, raw_sym_adj, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask); - int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. 
- int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); - Kokkos::parallel_for(team_policy_t(numClusters, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); - numClusterEdges = crossClusterEdgeMask.count(); - } - nnz_view_t clusterRowmap = nnz_view_t("Cluster graph rowmap", numClusters + 1); - nnz_view_t clusterEntries = nnz_view_t("Cluster graph colinds", numClusterEdges); - Kokkos::parallel_scan(my_exec_space(0, num_rows), FillClusterEntriesFunctor - (raw_sym_xadj, raw_sym_adj, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask)); + rowmap_t clusterRowmap; + colinds_t clusterEntries; + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + KokkosGraph::Experimental::graph_explicit_coarsen_with_inverse_map, raw_rowmap_t, raw_colinds_t, nnz_view_t, rowmap_t, colinds_t, nnz_view_t> + (raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, false); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; timer.reset(); From 6158a8b078f59cbd7bca3d705aa2f354e401dc46 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 8 Oct 2020 22:18:32 -0600 Subject: [PATCH 043/106] Fix incorrect type --- src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp index 65ed3a1415..51fa777c79 100644 --- a/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp +++ b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -272,7 +272,7 @@ struct ExplicitGraphCoarsening Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); { - 
coarse_entries_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); + ordinal_view_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); Kokkos::parallel_for(range_pol(0, numFineVerts), FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); } //Determine the set of edges (in the point graph) that cross between two distinct clusters From 96a4d9b504c0aae8e179ace21c0b6ca57bfc625e Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Oct 2020 10:23:57 -0600 Subject: [PATCH 044/106] Fixed types, added some static asserts (checking handle types against rowmap/colinds element types) --- src/graph/impl/KokkosGraph_Distance1Color_impl.hpp | 8 +++++++- .../impl/KokkosSparse_cluster_gauss_seidel_impl.hpp | 10 +++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 4e6f322bce..110756a364 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -124,7 +124,13 @@ class GraphColor const_lno_nnz_view_t entries, HandleType *coloring_handle): nv (nv_), ne(ne_),xadj(row_map), adj (entries), - kok_src(), kok_dst(), cp(coloring_handle){} + kok_src(), kok_dst(), cp(coloring_handle) + { + static_assert(std::is_same::value, + "Row map element type does not match handle's size_type."); + static_assert(std::is_same::value, + "Entries element type does not match handle's nnz_lno_t."); + } /** \brief GraphColor destructor. 
*/ diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 0596c6f3a5..6a4c6caf29 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -82,6 +82,10 @@ namespace KokkosSparse{ typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::nnz_scalar_t nnz_scalar_t; + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's size_type does not match input rowmap's element type."); + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's nnz_lno_t does not match input entries's element type."); typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; typedef typename in_lno_row_view_t::non_const_type non_const_lno_row_view_t; @@ -540,9 +544,9 @@ namespace KokkosSparse{ using nnz_view_t = nnz_lno_persistent_work_view_t; using in_rowmap_t = const_lno_row_view_t; using in_colinds_t = const_lno_nnz_view_t; - using rowmap_t = Kokkos::View; + using rowmap_t = Kokkos::View; using colinds_t = Kokkos::View; - using raw_rowmap_t = Kokkos::View>; + using raw_rowmap_t = Kokkos::View>; using raw_colinds_t = Kokkos::View>; auto gsHandle = get_gs_handle(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE @@ -648,7 +652,7 @@ namespace KokkosSparse{ Kokkos::deep_copy(colors, h_colors); #else //Create a handle that uses nnz_lno_t as the size_type, since the cluster graph should never be larger than 2^31 entries. 
- KokkosKernels::Experimental::KokkosKernelsHandle kh; + HandleType kh; kh.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); KokkosGraph::Experimental::graph_color_symbolic(&kh, numClusters, numClusters, clusterRowmap, clusterEntries); //retrieve colors From 00e1c83df74acce1a4cb4cad2e0151b4a8bf751d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 12 Oct 2020 13:21:20 -0600 Subject: [PATCH 045/106] Move VERIFY_EMPTY from fake_tribits to kokkoskernels_tribits Part of PR #823 --- cmake/fake_tribits.cmake | 6 ------ cmake/kokkoskernels_tribits.cmake | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index c306891cff..26737b8919 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -109,12 +109,6 @@ LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() -FUNCTION(VERIFY_EMPTY CONTEXT) -IF(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") -ENDIF() -ENDFUNCTION() - MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index a0cc9d30d5..4eebb97c7b 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -5,6 +5,12 @@ IF (KOKKOSKERNELS_HAS_TRILINOS) INCLUDE(TribitsETISupport) ENDIF() +FUNCTION(VERIFY_EMPTY CONTEXT) + IF(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + ENDIF() +ENDFUNCTION() + #MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") MACRO(KOKKOSKERNELS_PACKAGE_POSTPROCESS) From 627183c04680550cc6b8325962f233336ed90ca5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 13 Oct 2020 14:37:10 -0600 Subject: [PATCH 046/106] Test all 4 modes for spmv/spmv_mv (N, C, T, H). Before, only N was tested. 
--- unit_test/sparse/Test_Sparse_spmv.hpp | 89 ++++++++++++++++++++------- 1 file changed, 66 insertions(+), 23 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index a7b42fa697..1bd12ce4dc 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -51,7 +51,8 @@ struct fSPMV { template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta){ + typename y_vector_type::non_const_value_type beta, + char mode = 'N'){ using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; @@ -61,7 +62,9 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using size_type = typename size_type_view_t::non_const_value_type; using lno_t = typename lno_view_t::non_const_value_type; using scalar_t = typename scalar_view_t::non_const_value_type; + using KAT = Kokkos::ArithTraits; + mode = toupper(mode); typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values,input_mat.values); @@ -84,15 +87,24 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, lno_t nr = input_mat.numRows(); - for (lno_t i = 0; i < nr; ++i){ - scalar_t result = 0; - for (size_type j = h_rowmap(i); j < h_rowmap(i+1); ++j){ + //first, scale y by beta + for(size_t i = 0; i < h_y.extent(0); i++) + h_y(i) *= beta; + + //then go through the matrix and accumulate the matrix-vector product + for (lno_t row = 0; row < nr; ++row) { + for (size_type j = h_rowmap(row); j < h_rowmap(row+1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - scalar_t vector_val = h_x(col); - result += val * vector_val; + if(mode == 'N') + h_y(row) += alpha * val * h_x(col); + else if(mode == 'C') + h_y(row) += alpha * KAT::conj(val) * h_x(col); + else if(mode == 
'T') + h_y(col) += alpha * val * h_x(row); + else if(mode == 'H') + h_y(col) += alpha * KAT::conj(val) * h_x(row); } - h_y(i) = beta * h_y(i) + alpha * result; } KokkosKernels::Impl::safe_host_to_device_deep_copy (y.extent(0), h_y, y); Kokkos::fence(); @@ -102,7 +114,7 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { + typename y_vector_type::non_const_value_type beta, char mode) { //typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -115,13 +127,14 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // the appropriate tolerance precision. const y_value_mag_type eps = std::is_same::value ? 2*1e-3 : 1e-7; const size_t nr = input_mat.numRows(); - y_vector_type expected_y("expected", nr); + bool transposed = (mode == 'T') || (mode == 'H'); + y_vector_type expected_y("expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); - sequential_spmv(input_mat, x, expected_y, alpha, beta); + sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); //KokkosKernels::Impl::print_1Dview(expected_y); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); //KokkosKernels::Impl::print_1Dview(y); int num_errors = 0; Kokkos::parallel_reduce("KokkosSparse::Test::spmv", @@ -137,7 +150,7 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV) { + typename y_vector_type::non_const_value_type beta, int numMV, char mode) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -153,7 +166,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto Kokkos::fence(); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); for (int i = 0; i < numMV; ++i){ @@ -162,7 +175,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto auto y_i = Kokkos::subview (expected_y, Kokkos::ALL (), i); Kokkos::fence(); - sequential_spmv(input_mat, x_i, y_i, alpha, beta); + sequential_spmv(input_mat, x_i, y_i, alpha, beta, mode); auto y_spmv = Kokkos::subview (y, Kokkos::ALL (), i); int num_errors = 0; @@ -326,6 +339,8 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari x_vector_type input_x ("x", nc); y_vector_type output_y ("y", nr); + x_vector_type input_xt ("x", nr); + y_vector_type output_yt ("y", nc); Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -334,10 +349,23 @@ void 
test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari Kokkos::fill_random(input_x,rand_pool,ScalarX(10)); Kokkos::fill_random(output_y,rand_pool,ScalarY(10)); + Kokkos::fill_random(input_xt,rand_pool,ScalarX(10)); + Kokkos::fill_random(output_yt,rand_pool,ScalarY(10)); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0); - Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0); + std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0, mode); + } + for(auto mode : transModes) + { + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 1.0, mode); + } } template @@ -353,21 +381,36 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v ViewTypeY b_y("B",numCols,numMV); ViewTypeY b_y_copy("B",numCols,numMV); + ViewTypeX b_xt("A",numCols,numMV); + ViewTypeY b_yt("B",numRows,numMV); + ViewTypeY b_yt_copy("B",numRows,numMV); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_xt,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_yt,rand_pool,scalar_t(10)); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); Kokkos::deep_copy(b_y_copy, b_y); + Kokkos::deep_copy(b_yt_copy, b_yt); - - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV); - - 
+ std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV, mode); + } + for(auto mode : transModes) + { + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 1.0, numMV, mode); + } } template From 24c7634d204bd2d56c4535365e6a7420307e4c31 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 13 Oct 2020 13:43:42 -0700 Subject: [PATCH 047/106] Remove unused var --- unit_test/sparse/Test_Sparse_spmv.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 1bd12ce4dc..4a19137886 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -126,7 +126,6 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // so let us use y_value_type to determine // the appropriate tolerance precision. const y_value_mag_type eps = std::is_same::value ? 2*1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); bool transposed = (mode == 'T') || (mode == 'H'); y_vector_type expected_y("expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); From 342d543c85117ba7ebc648ec8d3b669606cb9ba1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 14 Oct 2020 11:29:00 -0600 Subject: [PATCH 048/106] update test scripts add timeout input option remove bowman update intel/17 old gcc module swaps --- scripts/cm_test_all_sandia | 42 +++++++++----------------------------- scripts/update_lib.sh | 23 ++++++++------------- 2 files changed, 18 insertions(+), 47 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 7f14255b7f..e31ff017d9 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -26,6 +26,7 @@ print_help() { echo "--num=N: Number of jobs to run in parallel" echo "--spot-check: Minimal test set to issue pull request" echo "--spot-check-tpls: Minimal test set enabling blas and lapack tpls" + echo "--timeout: Max time before ctest timeout (in seconds)" echo "--dry-run: Just print what would be executed" echo "--build-only: Just do builds, don't run anything" echo "--opt-flag=FLAG: Optimization flag (default: -O3)" @@ -129,11 +130,6 @@ if [[ "$HOSTNAME" =~ .*voltrino.* ]]; then module load git fi -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name MACHINE=blake module load git @@ -254,6 +250,8 @@ KOKKOSKERNELS_SCALARS="double,complex_double" KOKKOSKERNELS_ORDINALS="int" KOKKOSKERNELS_OFFSETS="int,size_t" KOKKOSKERNELS_LAYOUTS="LayoutLeft" + +CTESTTIMEOUT=2500 # # Handle arguments. 
# @@ -305,6 +303,9 @@ do --spot-check*) SPOT_CHECK=True ;; + --timeout*) + CTESTTIMEOUT="${key#*=}" + ;; --arch*) ARCH_FLAG="--arch=${key#*=}" ;; @@ -610,26 +611,6 @@ elif [ "$MACHINE" = "voltrino" ]; then if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=KNL" fi -elif [ "$MACHINE" = "bowman" ]; then - MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" - eval "$MODULE_ENVIRONMENT" - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="cmake/3.12.3,/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.2.199 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - SPACK_HOST_ARCH="+knl" - SPACK_CUDA_HOST_COMPILER="%gcc@7.2.0" elif [ "$MACHINE" = "mayer" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=96 @@ -675,7 +656,8 @@ elif [ "$MACHINE" = "blake" ]; then "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) else - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -1094,10 +1076,6 @@ setup_env() { done - if [[ "$MACHINE" = bowman* ]]; then - module swap gcc/6.2.0 - fi - if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE" source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE @@ -1235,7 +1213,7 @@ single_build_and_test() { run_cmd 
${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 - if [[ "$MACHINE" = bowman* ]] || [[ "$MACHINE" = white* ]]; then + if [[ "$MACHINE" = white* ]]; then make_par_lvl=48 fi local -i build_start_time=$(date +%s) @@ -1244,7 +1222,7 @@ single_build_and_test() { comment="build_time=$(($build_end_time-$build_start_time))" if [[ "$BUILD_ONLY" == False ]]; then - run_cmd ctest --timeout 2500 -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } + run_cmd ctest --timeout ${CTESTTIMEOUT} -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } local -i run_end_time=$(date +%s) comment="$comment run_time=$(($run_end_time-$build_end_time))" fi diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 5c4ec5d66a..822efa28b8 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,19 +1,5 @@ #!/bin/bash -echo "CALLING UPDATE_LIB" - -if [ "$1" = bowman ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.0.* ]]; then - module swap gcc/4.7.4 gcc/6.2.0 - module list - elif [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.* ]]; then - module swap gcc/4.9.3 gcc/6.2.0 - module list - fi - export 
LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib/gcc/x86_64-pc-linux-gnu/6.2.0:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LIBRARY_PATH - export LD_LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib64:/home/projects/x86-64-knl/gcc/6.2.0/lib:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LD_LIBRARY_PATH -fi if [ "$1" = blake ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then @@ -24,7 +10,7 @@ fi if [ "$1" = kokkos-dev ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list fi fi @@ -35,3 +21,10 @@ if [ "$1" = kokkos-dev-2 ]; then module list fi fi +if [ "$1" = sems ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module list + fi +fi From c8fdb1108072ba1b7bf7eeeeb579a770dc22973f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 15 Oct 2020 11:49:26 -0600 Subject: [PATCH 049/106] Add fallback condition to use spmv_native when cuSPARSE won't work Improve SpMV unit test: - generate random complex values with nonzero imaginary component - catch exceptions in spmv --- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 46 +++++++++++-- src/sparse/KokkosSparse_spmv.hpp | 2 +- .../KokkosSparse_spgemm_cuSPARSE_impl.hpp | 16 ++--- 
src/sparse/impl/KokkosSparse_spmv_spec.hpp | 2 - .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 +- unit_test/sparse/Test_Sparse_spmv.hpp | 64 +++++++++++++++---- 6 files changed, 101 insertions(+), 33 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e9596fb772..ced3476539 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,6 +50,7 @@ #include "cusparse.h" #include "KokkosKernels_SparseUtils_cusparse.hpp" #include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_impl.hpp" namespace KokkosSparse { namespace Impl { @@ -64,8 +65,18 @@ namespace Impl { const YVector& y) { using KAT = Kokkos::Details::ArithTraits; - std::cout << "It is currently not possible to use the native SpMV implementation" - " when cuSPARSE is enabled" << std::endl; + if (beta == KAT::zero ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } } template @@ -84,9 +95,24 @@ namespace Impl { cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); /* Set the operation mode */ - cusparseOperation_t myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; - if(mode[0] == Transpose[0]) {myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE;} - else if(mode[0] == ConjugateTranspose[0]) {myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;} + cusparseOperation_t myCusparseOperation; + switch(toupper(mode[0])) + { + case 'N': + myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; + break; + case 'T': + myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; + break; + case 'H': + myCusparseOperation = 
CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + break; + default: + { + std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; + throw std::invalid_argument("Invalid mode"); + } + } #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) @@ -239,8 +265,9 @@ namespace Impl { const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - if(controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + bool fallback = *mode == 'C' || ((*mode == 'T' || *mode == 'H') && 9000 <= CUDA_VERSION && CUDA_VERSION < 10000); \ + if((controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") || fallback) { \ + std::string label = "KokkosSparse::spmv[NATIVE," + Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_native(controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ @@ -252,6 +279,11 @@ namespace Impl { } \ } \ }; + +//BMK: cuSPARSE that comes with CUDA 9 does not support transpose or conjugate transpose modes. +//No version of cuSPARSE supports mode C (conjugate, non-transpose). +//In those cases, fall back to KokkosKernels native spmv. 
+ #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, Kokkos::CudaSpace, true) diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index e18bc4690f..4c26f5cd6e 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -151,7 +151,7 @@ spmv (KokkosKernels::Experimental::Controls controls, KokkosBlas::scal(y_i, beta, y_i); return; } - return Impl::SPMV< + Impl::SPMV< typename AMatrix_Internal::value_type, typename AMatrix_Internal::ordinal_type, typename AMatrix_Internal::device_type, diff --git a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp index f523bfe5f1..420e622c8f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp @@ -79,9 +79,10 @@ namespace Impl{ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - using device1 = typename ain_row_index_view_type::device_type; - using device2 = typename ain_nonzero_index_view_type::device_type; - using idx = typename KernelHandle::nnz_lno_t; + using device1 = typename ain_row_index_view_type::device_type; + using device2 = typename ain_nonzero_index_view_type::device_type; + using idx = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; //TODO this is not correct, check memory space. 
@@ -98,11 +99,10 @@ namespace Impl{ throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); #else - if (std::is_same::value){ - - const idx *a_xadj = (int *)row_mapA.data(); - const idx *b_xadj = (int *)row_mapB.data(); - idx *c_xadj = (int *)row_mapC.data(); + if (std::is_same::value && std::is_same::value){ + const idx *a_xadj = (const idx*) row_mapA.data(); + const idx *b_xadj = (const idx*) row_mapB.data(); + idx *c_xadj = (idx*) row_mapC.data(); const idx *a_adj = entriesA.data(); const idx *b_adj = entriesB.data(); diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp index 9d1f44bd2a..b678142dbe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -257,8 +257,6 @@ struct SPMV < AT, AO, AD, AM, AS, { typedef Kokkos::Details::ArithTraits KAT; - typedef Kokkos::Details::ArithTraits KAT; - if (alpha == KAT::zero ()) { if (beta != KAT::one ()) { KokkosBlas::scal (y, beta, y); diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 25e9844940..623df284ea 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -95,7 +95,7 @@ namespace Impl{ if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); const int* rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); if (std::is_same::value) { @@ -297,7 +297,7 @@ namespace Impl{ int nnz = entries.extent_int(0); const int* rm = !std::is_same::value ? 
sptrsv_handle->get_int_rowmap_ptr() : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); const scalar_type* bv = rhs.data(); scalar_type* xv = lhs.data(); diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 4a19137886..e27012991a 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -132,9 +132,19 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, Kokkos::fence(); sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); - //KokkosKernels::Impl::print_1Dview(expected_y); - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); - //KokkosKernels::Impl::print_1Dview(y); + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 1D, mode " << mode << ": threw exception:\n" << msg << '\n'; int num_errors = 0; Kokkos::parallel_reduce("KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), @@ -165,8 +175,19 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto Kokkos::fence(); - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); - + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 2D, mode " << mode << ": threw exception:\n" << msg << '\n'; for (int i = 0; i < numMV; ++i){ auto x_i = Kokkos::subview (x, Kokkos::ALL (), i); @@ -318,6 +339,23 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls, } // namespace Test +template +scalar_t randomUpperBound(int mag) +{ + return (scalar_t) mag; +} + +template <> 
+Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} + +template <> +Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} template void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance){ @@ -346,10 +384,10 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari typedef typename x_vector_type::value_type ScalarX; typedef typename y_vector_type::value_type ScalarY; - Kokkos::fill_random(input_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(output_y,rand_pool,ScalarY(10)); - Kokkos::fill_random(input_xt,rand_pool,ScalarX(10)); - Kokkos::fill_random(output_yt,rand_pool,ScalarY(10)); + Kokkos::fill_random(input_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(input_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_yt,rand_pool,randomUpperBound(10)); std::vector nonTransModes = {'N', 'C'}; std::vector transModes = {'T', 'H'}; @@ -385,10 +423,10 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v ViewTypeY b_yt_copy("B",numRows,numMV); Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); - Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); - Kokkos::fill_random(b_xt,rand_pool,scalar_t(10)); - Kokkos::fill_random(b_yt,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_yt,rand_pool,randomUpperBound(10)); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); From b88cb33346e82e705090830fd25e491ae527d058 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 18 Aug 2020 17:04:46 -0600 Subject: [PATCH 050/106] Initial support for half precision. 
cmake: Various updates for half precision support - Added checks for FP16 support in host toolchains and cuda toolchains - Added HAVE_KOKKOSKERNELS_HALFMATH, HAVE_KOKKOSKERNELS_FP16, and HAVE_KOKKOSKERNELS_CUDA_FP16 defines src: Various updates for half precision support - Added Kokkos_HalfPrecision to typedef 'half' based on cmake checks - Added 'device_fp16_t' and 'host_fp16_t' types. - Updated Kokkos_ArithTraits to support 'half' unit_test/batched: - Add half precision tests for TeamGemm, TeamVectorGemm, and SerialGemm - Fixes #813 test_common: - Add half precision ArithTraits tests - Fixes #814 - Add vanilla gemm implementation for unit tests --- CMakeLists.txt | 11 +- cmake/HalfPrecisionSupport.cmake | 29 ++ cmake/KokkosKernels_config.h.in | 7 + src/Kokkos_ArithTraits.hpp | 174 +++++++++ src/Kokkos_HalfPrecision.hpp | 173 +++++++++ .../KokkosBatched_Gemm_Serial_Internal.hpp | 2 +- .../KokkosBatched_Gemm_TeamVector_Impl.hpp | 2 +- .../KokkosBatched_Gemm_Team_Internal.hpp | 2 +- src/batched/KokkosBatched_Util.hpp | 2 +- test_common/KokkosKernels_TestUtils.hpp | 90 ++++- test_common/Test_Common_ArithTraits.hpp | 294 +++++++++------ unit_test/batched/Test_Batched_SerialGemm.hpp | 156 +++++++- .../batched/Test_Batched_SerialGemm_Real.hpp | 27 ++ unit_test/batched/Test_Batched_TeamGemm.hpp | 181 ++++++++- .../batched/Test_Batched_TeamGemm_Complex.hpp | 24 +- .../batched/Test_Batched_TeamGemm_Real.hpp | 42 ++- .../batched/Test_Batched_TeamVectorGemm.hpp | 346 ++++++++++++++++++ .../Test_Batched_TeamVectorGemm_Complex.hpp | 53 +++ .../Test_Batched_TeamVectorGemm_Real.hpp | 80 ++++ ...st_Cuda_Batched_TeamVectorGemm_Complex.cpp | 3 + .../Test_Cuda_Batched_TeamVectorGemm_Real.cpp | 3 + ..._OpenMP_Batched_TeamVectorGemm_Complex.cpp | 3 + ...est_OpenMP_Batched_TeamVectorGemm_Real.cpp | 3 + ..._Serial_Batched_TeamVectorGemm_Complex.cpp | 3 + ...est_Serial_Batched_TeamVectorGemm_Real.cpp | 3 + ...Threads_Batched_TeamVectorGemm_Complex.cpp | 3 + 
...st_Threads_Batched_TeamVectorGemm_Real.cpp | 3 + 27 files changed, 1556 insertions(+), 163 deletions(-) create mode 100644 cmake/HalfPrecisionSupport.cmake create mode 100644 src/Kokkos_HalfPrecision.hpp create mode 100644 unit_test/batched/Test_Batched_TeamVectorGemm.hpp create mode 100644 unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp create mode 100644 unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp create mode 100644 unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp create mode 100644 unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1149a2101d..35a72a489c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,7 +160,10 @@ ELSE() # Enable Layout Types for ETI # ================================================================== INCLUDE(cmake/kokkoskernels_eti_layouts.cmake) - + # ================================================================== + # Determine half precision support + # ================================================================== + INCLUDE(cmake/HalfPrecisionSupport.cmake) # ================================================================== # Enable Third Party Libraries # ================================================================== @@ -179,6 +182,12 @@ ELSE() MESSAGE(" Offsets: ${OFFSET_LIST}") MESSAGE(" Layouts: ${LAYOUT_LIST}") MESSAGE("") + IF(HAVE_KOKKOSKERNELS_HALFMATH) + MESSAGE("KokkosKernels Half Precision Types") + MESSAGE(" HAVE_FP16: ${HAVE_FP16}") + 
MESSAGE(" HAVE_CUDA_FP16: ${HAVE_CUDA_FP16}") + MESSAGE("") + ENDIF() MESSAGE("KokkosKernels TPLs") FOREACH(TPL ${KOKKOSKERNELS_TPL_LIST}) PAD_STRING("${TPL}:" TPL_PADDED 12) diff --git a/cmake/HalfPrecisionSupport.cmake b/cmake/HalfPrecisionSupport.cmake new file mode 100644 index 0000000000..68971b1279 --- /dev/null +++ b/cmake/HalfPrecisionSupport.cmake @@ -0,0 +1,29 @@ +# Check whether the compiler defined the _Float16 type +# HAVE_KOKKOSKERNELS_FP16 is passed to C++ via KokkosKernels_config.h.in +INCLUDE(CheckTypeSize) +CHECK_TYPE_SIZE(_Float16 FP16 LANGUAGE CXX) +IF(HAVE_FP16) + SET(HAVE_KOKKOSKERNELS_FP16 ${HAVE_FP16}) +ENDIF() + +# Check whether the cuda_fp16.h header exists to infer that the __half type exists +# HAVE_KOKKOSKERNELS_CUDA_FP16 is passed to C++ via KokkosKernels_config.h.in +INCLUDE(CheckIncludeFileCXX) +CHECK_INCLUDE_FILE_CXX(cuda_fp16.h HAVE_CUDA_FP16) +IF(HAVE_CUDA_FP16) + SET(HAVE_CUDA_FP16 TRUE) + SET(HAVE_KOKKOSKERNELS_CUDA_FP16 TRUE) +ELSE() + SET(HAVE_CUDA_FP16 FALSE) +ENDIF() + +IF(HAVE_KOKKOSKERNELS_FP16 AND HAVE_KOKKOSKERNELS_CUDA_FP16) + MESSAGE(WARNING "'half' is set to 'device_fp16_t'. 
To use half precision on host, use 'host_fp16_t'.") + MESSAGE(WARNING "Use 'float' and 'host_fp16_t' to cast on host.") + MESSAGE(WARNING "Use '__half2float' and '__float2half' functions to cast on device.") +ENDIF() + +# HAVE_KOKKOSKERNELS_HALFMATH is passed to C++ via KokkosKernels_config.h.in +IF(HAVE_KOKKOSKERNELS_FP16 OR HAVE_KOKKOSKERNELS_CUDA_FP16) + SET(HAVE_KOKKOSKERNELS_HALFMATH TRUE) +ENDIF() diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b5611c20ca..30b141ba38 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -16,6 +16,9 @@ /* Define this macro if the quadmath TPL is enabled */ #cmakedefine HAVE_KOKKOSKERNELS_QUADMATH +/* Define this macro if half precision is supported by toolchain */ +#cmakedefine HAVE_KOKKOSKERNELS_HALFMATH + /* Define this macro if the MKL TPL is enabled. This is different than just linking against the MKL to get the BLAS and LAPACK; it requires (a) header file(s) as well, and may use functions other @@ -60,6 +63,10 @@ #define KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_ #endif +/* Whether the _Float16 type is defined by the toolchain */ +#cmakedefine HAVE_KOKKOSKERNELS_FP16 +/* Whether the __half type is defined by the toolchain */ +#cmakedefine HAVE_KOKKOSKERNELS_CUDA_FP16 /* Whether to build kernels for multivectors of LayoutLeft */ #cmakedefine KOKKOSKERNELS_INST_LAYOUTLEFT /* Whether to build kernels for multivectors of LayoutRight */ diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3a6ea1cca5..9040103bfe 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -50,6 +50,7 @@ #include #include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH # include @@ -674,6 +675,179 @@ class ArithTraits { //@} }; +/** + * Currently, all the ArithTraits member will only work on either the host + * due toolchain support of half precision types. 
+ */ +#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +template<> +class ArithTraits { +public: + typedef KokkosKernels::Experimental::half val_type; + typedef val_type mag_type; + + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool is_complex = false; + + static constexpr bool has_infinity = true; + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return KokkosKernels::Experimental::__cast2half(HUGE_VALF); } + + static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isinf; + #endif + return isinf (KokkosKernels::Experimental::__cast2float(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isnan; + #endif + return isnan(KokkosKernels::Experimental::__cast2float(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { + return KokkosKernels::Experimental::__cast2float(fabs(KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero () { + return KokkosKernels::Experimental::__cast2float(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one () { + return KokkosKernels::Experimental::__cast2float(1.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min () { + return KokkosKernels::Experimental::__cast2float(-FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max () { + return KokkosKernels::Experimental::__cast2float(FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag (const val_type) { + return KokkosKernels::Experimental::__cast2float(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conj (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const val_type y) { + return 
KokkosKernels::Experimental::__cast2float(::pow (KokkosKernels::Experimental::__cast2float(x), KokkosKernels::Experimental::__cast2float(y))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::sqrt (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::cbrt (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::exp (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::log (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::log10 (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::sin (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::cos (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::tan (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::sinh (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::cosh (KokkosKernels::Experimental::__cast2float(x))); + } + static 
KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::tanh (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::asin (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::acos (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { + return KokkosKernels::Experimental::__cast2float(::atan (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { + //return ::pow(2, -FP16_SIGNIFICAND_BITS); + return KokkosKernels::Experimental::__cast2half(FP16_EPSILON); + } + // Backwards compatibility with Teuchos::ScalarTraits. + typedef mag_type magnitudeType; + // C++ doesn't have a standard "half-float" type. 
+ typedef val_type halfPrecision; + typedef double doublePrecision; + + static const bool isComplex = false; + static const bool isOrdinal = false; + static const bool isComparable = true; + static const bool hasMachineParameters = true; + static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf (const val_type x) { + return isNan (x) || isInf (x); + } + static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude (const val_type x) { + return KokkosKernels::Experimental::__cast2float(abs (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate (const val_type x) { + return KokkosKernels::Experimental::__cast2float(conj (KokkosKernels::Experimental::__cast2float(x))); + } + static std::string name () { + return "half"; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { + return KokkosKernels::Experimental::__cast2float(sqrt (KokkosKernels::Experimental::__cast2float(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { +#ifdef __CUDA_ARCH__ + return KokkosKernels::Experimental::__cast2half(CUDART_NAN_F); +#else + return KokkosKernels::Experimental::__cast2half(std::numeric_limits::quiet_NaN()); +#endif // __CUDA_ARCH__ + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type eps () { + return epsilon (); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin () { + return FP16_MIN; + } + static KOKKOS_FORCEINLINE_FUNCTION int base () { + return FP16_RADIX; + } + // Use float to allow running on both host and device + static KOKKOS_FORCEINLINE_FUNCTION float prec () { + float e = FP16_EPSILON; + float b = (float) base(); + float r = e * b; + return r; + } + static KOKKOS_FORCEINLINE_FUNCTION int t () { + return FP16_MANT_DIG; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd () { + return 1.0; + } + static KOKKOS_FORCEINLINE_FUNCTION int emin () { + return FP16_MIN_EXP; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin () { + return FP16_MIN; + } + static 
KOKKOS_FORCEINLINE_FUNCTION int emax () { + return FP16_MAX_EXP; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax () { + return FP16_MAX; + } +}; +#endif // HAVE_KOKKOSKERNELS_HALFMATH template<> class ArithTraits { diff --git a/src/Kokkos_HalfPrecision.hpp b/src/Kokkos_HalfPrecision.hpp new file mode 100644 index 0000000000..89318270f5 --- /dev/null +++ b/src/Kokkos_HalfPrecision.hpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HALFPRECISION_HPP +#define KOKKOS_HALFPRECISION_HPP + +#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) +#include +#endif + +namespace KokkosKernels { + namespace Experimental { + /** + * Below we check whether the given toolchain has support for portable IEEE-754 + * FP16 (binary16) precision types. The checks are done via CMake which passes the + * results via a KOKKOSKERNELS_HAVE define to KokkosKernels_config.h + * + * First we check for cuda half precision support (HAVE_KOKKOSKERNELS_CUDA_FP16). + * Second we check for host half precision support (HAVE_KOKKOSKERNELS_FP16). + * Lastly, we fall back to single precision support. + * + * NOTE: If both cuda and host support half precision, the half type will + * default to device_fp16_t. 
+ */ + #if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) + using device_fp16_t = __half; + using half = device_fp16_t; + #if defined(HAVE_KOKKOSKERNELS_FP16) + using host_fp16_t = _Float16; + #else + using host_fp16_t = float; + #endif // defined(HAVE_KOKKOSKERNELS_FP16) + static KOKKOS_FORCEINLINE_FUNCTION float __cast2float(device_fp16_t x) { return __half2float(x); } + static KOKKOS_FORCEINLINE_FUNCTION device_fp16_t __cast2half(float x) { return __float2half(x); } + + #else // defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) + #if defined(HAVE_KOKKOSKERNELS_FP16) + using host_fp16_t = _Float16; + using half = host_fp16_t; + #else // defined(HAVE_KOKKOSKERNELS_FP16) + using host_fp16_t = float; + using device_fp16_t = host_fp16_t; + using half = host_fp16_t; + #endif // _Float16 + static inline float __cast2float(host_fp16_t x) { return (float) x; } + static inline host_fp16_t __cast2half(float x) { return (host_fp16_t) x; } + #endif + ////////////// BEGIN half2float and float2half overloads ////////////// + /** + * Since kokkos does not have support for half precision types yet, we + * must cast to/from float in some kokkos-kernels routines. Except for + * the overloads below that actually cast to/from half precision types, + * the others should be optimized away by the compiler. 
+ */ + // host_fp16_t +#if defined(HAVE_KOKKOSKERNELS_FP16) + static inline + float half2float(host_fp16_t x, float &ret) { + ret = __cast2float(x); + return ret; + } + static inline + host_fp16_t float2half(float x, host_fp16_t &ret) { + ret = __cast2half(x); + return ret; + } +#endif + // device_fp16_t +#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) + static inline __device__ + float half2float(device_fp16_t x, float &ret) { + ret = __cast2float(x); + return ret; + } + static inline __device__ + device_fp16_t float2half(float x, device_fp16_t &ret) { + ret = __cast2half(x); + return ret; + } +#endif + // float + static KOKKOS_FORCEINLINE_FUNCTION float half2float(float x, float &ret) { + ret = x; + return ret; + } + static KOKKOS_FORCEINLINE_FUNCTION float float2half(float x, float &ret) { + ret = x; + return ret; + } + // complex float + static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex half2float(Kokkos::complex x, Kokkos::complex &ret) { + ret = x; + return ret; + } + static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex float2half(Kokkos::complex x, Kokkos::complex &ret) { + ret = x; + return ret; + } + // double + static KOKKOS_FORCEINLINE_FUNCTION double half2float(double x, double &ret) { + ret = x; + return ret; + } + static KOKKOS_FORCEINLINE_FUNCTION double float2half(double x, double &ret) { + ret = x; + return ret; + } + // complex double + static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex half2float(Kokkos::complex x, Kokkos::complex &ret) { + ret = x; + return ret; + } + static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex float2half(Kokkos::complex x, Kokkos::complex &ret) { + ret = x; + return ret; + } + ////////////// END half2float and float2half overloads ////////////// + + ////////////// BEGIN FP16/binary16 limits ////////////// + #define FP16_MAX 65504.0F // Maximum normalized number + #define FP16_MIN 0.000000059604645F // Smallest positive (subnormal) half precision number, 2^-24; the smallest normalized value is 2^-14 + #define FP16_RADIX 2 // Value of the base of 
the exponent representation. TODO: Confirm this + #define FP16_MANT_DIG 15 // Number of digits in the mantissa that can be represented without losing precision. TODO: Confirm this + #define FP16_MIN_EXP -14 // This is the smallest possible exponent value + #define FP16_MAX_EXP 15 // This is the largest possible exponent value + #define FP16_SIGNIFICAND_BITS 10 + #define FP16_EPSILON 0.0009765625F + #define HUGE_VALH 0x7c00 // bits [10,14] set. + ////////////// END FP16/binary16 limits ////////////// + } // Experimental +} // KokkosKernels +#endif // KOKKOS_HALFPRECISION_HPP diff --git a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp index afad371334..5875029dd1 100644 --- a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp @@ -45,7 +45,7 @@ namespace KokkosBatched { /**/ ValueType *__restrict__ C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - + const ScalarType one(1.0), zero(0.0); if (beta == zero) SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp index 4e1c4d9579..0b68727f0e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -138,7 +138,7 @@ namespace KokkosBatched { const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal:: + return TeamVectorGemmInternal:: invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, diff --git a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp index f4f682cb91..9758836af1 100644 --- a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp @@ -49,7 +49,7 @@ namespace KokkosBatched { // C = beta C + alpha A B // C (m x n), A(m x k), 
B(k x n) - + const ScalarType one(1.0), zero(0.0); if (beta == zero) TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2347c63e87..f58cfc10ca 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -305,7 +305,7 @@ namespace KokkosBatched { using Gemm = Level3; using Trsm = Level3; using Trmm = Level3; - using Trtri = Level3; // TODO: Need new level for Trtri? + using Trtri = Level3; using LU = Level3; using InverseLU = Level3; using SolveLU = Level3; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8a9306325f..f63cdb0495 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -105,6 +105,94 @@ namespace Test { EXPECT_NEAR_KK(h_v1(i), h_v2(i), tol); } } -} + #if defined(KOKKOS_ENABLE_CUDA) + using halfScalarType = typename std::conditional::value, KokkosKernels::Experimental::device_fp16_t, KokkosKernels::Experimental::host_fp16_t>::type; + #else + using halfScalarType = KokkosKernels::Experimental::host_fp16_t; + #endif // KOKKOS_ENABLE_CUDA + + template + struct SharedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + int C_rows, C_cols, A_cols; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + typedef typename ViewTypeA::value_type ScalarA; + typedef typename ViewTypeB::value_type ScalarB; + typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::Details::ArithTraits APT; + typedef typename APT::mag_type mag_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { + // Give each kokkos thread a vector of A + auto a_vec = A_t ? 
Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + + // Have all vector lanes perform the dot product + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { + auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + ScalarC ab = ScalarC(0); + for (int k = 0; k < A_cols; k++) { + auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); + auto b = B_c ? APT::conj(b_vec(k)) : b_vec(k); + ab += a * b; + } + C(i,j) = beta * C(i,j) + alpha * ab; + }); + }); + } + }; + // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) + template + struct Functor_BatchedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using ScalarC = typename ViewTypeC::value_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + int i = team.league_rank(); + + auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + using SubviewTypeA = decltype(_A); + using SubviewTypeB = decltype(_B); + using SubviewTypeC = decltype(_C); + struct SharedVanillaGEMM vgemm; + vgemm.A_t = A_t; vgemm.B_t = B_t; + vgemm.A_c = A_c; vgemm.B_c = B_c; + vgemm.C_rows = C.extent(1); + vgemm.C_cols = C.extent(2); + vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.A = _A; + vgemm.B = _B; + vgemm.C = _C; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm(team); + } + + inline + void run() { + Kokkos::parallel_for( + "Test::VanillaGEMM", + Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + *this); + } + }; +} #endif diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index ff2abd0acc..5c93a39445 100644 --- 
a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -63,6 +63,13 @@ #include // typeid (T) #include +#define FAILURE() {printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0;} + +#if 0 +#define TRACE() printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#else +#define TRACE() +#endif namespace { // Whether Kokkos::Details::ArithTraits implements @@ -183,6 +190,7 @@ class ArithTraitsTesterBase { KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // not using this argument int success = 1; @@ -203,7 +211,7 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { printf ("! AT::is_specialized\n"); - success = 0; + FAILURE(); } // It's OK to refer to std::numeric_limits constants in a device @@ -211,11 +219,11 @@ class ArithTraitsTesterBase { // as device functions). if (AT::is_integer != std::numeric_limits::is_integer) { printf ("AT::is_integer not same as numeric_limits\n"); - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { printf ("AT::is_exact not same as numeric_limits\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -224,34 +232,34 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. if (zero + zero != zero) { printf ("0 + 0 != 0\n"); - success = 0; + FAILURE(); } if (zero + one != one) { printf ("0 + 1 != 1\n"); - success = 0; + FAILURE(); } if (one - one != zero) { printf ("1 - 1 != 0\n"); - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). 
if ((one + one) - one != one) { printf ("(1 + 1) - 1 != 1\n"); - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { printf ("AT::abs(0) != 0\n"); - success = 0; + FAILURE(); } if (AT::abs (one) != one) { printf ("AT::abs(1) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_signed && AT::abs (-one) != one) { printf ("AT::is_signed and AT::abs(-1) != 1\n"); - success = 0; + FAILURE(); } // Need enable_if to test whether T can be compared using <=. // However, mag_type should always be comparable using <=. @@ -260,7 +268,7 @@ class ArithTraitsTesterBase { // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { printf ("AT::abs(0) > AT::abs (AT::max ())\n"); - success = 0; + FAILURE(); } dst = dst && success; @@ -312,17 +320,17 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { out << "ArithTraits is not specialized for T" << endl; - success = 0; + FAILURE(); } if (AT::is_integer != std::numeric_limits::is_integer) { out << "AT::is_integer != std::numeric_limits::is_integer" << endl; - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { out << "AT::is_exact != std::numeric_limits::is_exact" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -331,35 +339,35 @@ class ArithTraitsTesterBase { if (zero + zero != zero) { out << "zero + zero != zero" << endl; - success = 0; + FAILURE(); } if (zero + one != one) { out << "zero + one != one" << endl; - success = 0; + FAILURE(); } if (one - one != zero) { out << "one - one != zero" << endl; - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). 
if ((one + one) - one != one) { out << "(one + one) - one != one" << endl; - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { out << "AT::abs (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::abs (one) != one) { out << "AT::abs (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::is_signed) { if (AT::abs (-one) != one) { out << "AT::abs (-one) != one" << endl; - success = 0; + FAILURE(); } } // Need enable_if to test whether T can be compared using <=. @@ -369,19 +377,19 @@ class ArithTraitsTesterBase { // // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { out << "AT::abs (zero) > AT::abs (AT::max ())" << endl; - success = 0; + FAILURE(); } if (AT::has_infinity) { if (! AT::isInf (AT::infinity())) { out << "AT::isInf (inf) != true" << endl; - success = 0; + FAILURE(); } } if ( ! std::is_same< ScalarType, decltype(AT::infinity()) >::value ) { std::cout << "AT::infinity() return value has wrong type" << endl; - success = 0; + FAILURE(); } // Run the parent class' remaining tests, if any. @@ -462,12 +470,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); //typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (HasTranscendentals::value) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -488,7 +497,7 @@ class ArithTraitsTesterTranscendentalBase : if (HasTranscendentals::value) { out << "HasTranscendentals::value is true" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' @@ -542,12 +551,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! HasTranscendentals::value) { - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -576,20 +586,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (!equal(result,eight)) { printf ("AT::pow(2,3) != 8\n"); - success = 0; + FAILURE(); } } if (!equal(AT::pow (three, zero) , one)) { printf ("AT::pow(3,0) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, one) , three)) { printf ("AT::pow(3,1) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, two) , nine)) { printf ("AT::pow(3,2) != 9\n"); - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. @@ -597,7 +607,7 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (three, three); if (!equal(result , twentySeven)) { printf ("AT::pow(3,3) != 27\n"); - success = 0; + FAILURE(); } } @@ -606,93 +616,93 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (-three, one); if (!equal(result , -three)) { printf ("AT::pow(-3,1) != -3\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (!equal(result , nine)) { printf ("AT::pow(-3,2) != 9\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (!equal(result , -twentySeven)) { printf ("AT::pow(-3,3) != 27\n"); - success = 0; + FAILURE(); } } if (!equal(AT::sqrt (zero) , zero)) { printf ("AT::sqrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (one) , one)) { printf ("AT::sqrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (thirtySix) , six)) { printf ("AT::sqrt(36) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (sixtyFour) , eight)) { printf ("AT::sqrt(64) != 
8\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt (fortyTwo) , six)) { printf ("AT:sqrt(42) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (oneTwentySeven) , eleven)) { printf ("AT::sqrt(127) != 11\n"); - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); - success = 0; + FAILURE(); } } if (!equal(AT::log (one) , zero)) { printf ("AT::log(1) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::log10 (one) , zero)) { printf ("AT::log10(1) != 0\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -701,11 +711,11 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { ScalarType val = three; @@ -713,25 +723,25 @@ class ArithTraitsTesterTranscendentalBase : const 
auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (one)), one)) { printf ("AT::asin(sin(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (one)), one)) { printf ("AT::acos(cos(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (one)), one)) { printf ("AT::atan(tan(1)) != 1\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -752,7 +762,7 @@ class ArithTraitsTesterTranscendentalBase : if (! HasTranscendentals::value) { out << "HasTranscendentals::value is false" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -781,20 +791,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (result != eight) { out << "AT::pow (two, three) != eight" << endl; - success = 0; + FAILURE(); } } if (AT::pow (three, zero) != one) { out << "AT::pow (three, zero) != one" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, one) != three) { out << "AT::pow (three, one) != three" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, two) != nine) { out << "AT::pow (three, two) != nine" << endl; - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. 
@@ -803,7 +813,7 @@ class ArithTraitsTesterTranscendentalBase : if (result != twentySeven) { out << "AT::pow (three, three) = " << result << " != twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } @@ -813,95 +823,95 @@ class ArithTraitsTesterTranscendentalBase : if (result != -three) { out << "AT::pow (-three, one) = " << result << " != -three = " << -three << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (result != nine) { out << "AT::pow (-three, two) = " << result << " != nine = " << nine << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (result != -twentySeven) { out << "AT::pow (-three, three) = " << result << " != -twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } if (AT::sqrt (zero) != zero) { out << "AT::sqrt (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (one) != one) { out << "AT::sqrt (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (thirtySix) != six) { out << "AT::sqrt (thirtySix) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (sixtyFour) != eight) { out << "AT::sqrt (sixtyFour) != eight" << endl; - success = 0; + FAILURE(); } if (AT::is_integer) { if (AT::sqrt (fortyTwo) != six) { out << "AT::sqrt (fortyTwo) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (oneTwentySeven) != eleven) { out << "AT::sqrt (oneTwentySeven) != eleven" << endl; - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if 
(!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); - success = 0; + FAILURE(); } } if (AT::log (one) != zero) { out << "AT::log (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::log10 (one) != zero) { out << "AT::log10 (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -910,11 +920,11 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { const ScalarType val = three; @@ -922,25 +932,25 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (three)), three)) { printf ("AT::asin(sin(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (three)), three)) { printf ("AT::acos(cos(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (three)), three)) { printf ("AT::atan(tan(3)) != 3\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' @@ -1020,17 +1030,31 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; - - // Apparently, std::numeric_limits::is_signed is 1 - // only for real numbers. - if (AT::is_signed != std::numeric_limits::is_signed) { - success = 0; + +#if defined(HAVE_KOKKOSKERNELS_CUDA_FP16) &&\ + defined(__CUDA_ARCH__) + if(std::is_same::value) { + if (AT::is_signed != 0x1) + FAILURE(); + } else +#endif // HAVE_KOKKOSKERNELS_CUDA_FP16 + { + // Apparently, std::numeric_limits::is_signed is 1 + // only for real numbers. + if (AT::is_signed != std::numeric_limits::is_signed) { + printf("AT::is_signed = 0x%x, std::numeric_limits::is_signed = 0x%x\n", + AT::is_signed, + std::numeric_limits::is_signed); + FAILURE(); + } } + if (AT::is_complex) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1052,11 +1076,11 @@ class ArithTraitsTesterComplexBase : // Apparently, std::numeric_limits::is_signed is 1 only for real numbers. if (AT::is_signed != std::numeric_limits::is_signed) { out << "ArithTraits::is_signed != std::numeric_limits::is_signed" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1090,12 +1114,13 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! 
AT::is_complex) { - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1108,7 +1133,7 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne || AT::conj (onePlusOne) != oneMinusOne) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1129,7 +1154,7 @@ class ArithTraitsTesterComplexBase : if (! AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1142,11 +1167,11 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne) { out << "AT::conj ((1, -1)) != (1, 1)" << endl; - success = 0; + FAILURE(); } if (AT::conj (onePlusOne) != oneMinusOne) { out << "AT::conj ((1, 1)) != (1, -1)" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1232,17 +1257,19 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (AT::is_exact) { printf ("AT::is_exact is 1\n"); - success = 0; + FAILURE(); } + if (! 
AT::isNan (AT::nan ())) { printf ("NaN is not NaN\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -1250,19 +1277,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { printf ("0 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isInf (one)) { printf ("1 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isNan (zero)) { printf ("0 is NaN\n"); - success = 0; + FAILURE(); } if (AT::isNan (one)) { printf ("1 is NaN\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1283,14 +1310,14 @@ class ArithTraitsTesterFloatingPointBase : if (AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } //if (std::numeric_limits::is_iec559) { //success = success && AT::isInf (AT::inf ()); if (! AT::isNan (AT::nan ())) { out << "isNan or nan failed" << endl; - success = 0; + FAILURE(); } //} @@ -1299,19 +1326,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { out << "isInf(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isInf (one)) { out << "isInf(one) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (zero)) { out << "isNan(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (one)) { out << "isNan(one) is 1" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1351,13 +1378,14 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! AT::is_exact) { printf ("! AT:is_exact\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1378,7 +1406,7 @@ class ArithTraitsTesterFloatingPointBase : if (! AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1532,6 +1560,24 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) // Built-in real floating-point types // +#if defined(HAVE_KOKKOSKERNELS_HALFMATH) &&\ + defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) + if (std::is_same::value) { + TRACE(); + success = success && curSuccess; + curSuccess = + testArithTraitsOnDevice(out, verbose); + } else { +#if defined(HAVE_KOKKOSKERNELS_FP16) + TRACE(); + success = success && curSuccess; + curSuccess = + testArithTraitsOnDevice(out, verbose); +#endif // HAVE_KOKKOSKERNELS_FP16 + } +#endif // HAVE_KOKKOSKERNELS_HALFMATH success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); @@ -1542,7 +1588,7 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); - return success; + return success && curSuccess; } @@ -1609,11 +1655,17 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) // Kokkos' complex floating-point types // +#if defined(HAVE_KOKKOSKERNELS_HALFMATH) && defined(HAVE_KOKKOSKERNELS_FP16) + success = success && curSuccess; + TRACE(); + curSuccess = testArithTraitsOnHost(out, verbose); +#endif // HAVE_KOKKOSKERNELS_HALFMATH success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); //success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); - return success; + return success && curSuccess; } template @@ -1627,8 +1679,8 @@ void test_ArithTraits () int overflow(int c) { return c; } }; 
NullBuffer null_buffer; - //std::ostream &out = std::cout; - std::ostream out(&null_buffer); + std::ostream &out = std::cerr; + //std::ostream out(&null_buffer); bool success = true; diff --git a/unit_test/batched/Test_Batched_SerialGemm.hpp b/unit_test/batched/Test_Batched_SerialGemm.hpp index 791c22d054..c38bfcda11 100644 --- a/unit_test/batched/Test_Batched_SerialGemm.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm.hpp @@ -66,6 +66,96 @@ namespace Test { Kokkos::Profiling::popRegion(); } }; + +template + void impl_test_batched_gemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + 
Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + Functor_TestBatchedSerialGemm(alpha, a1, b1, beta, c1).run(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + Kokkos::fence(); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * FP16_EPSILON; + + for (int k=0;k ViewType; Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); @@ -187,3 +277,65 @@ int test_batched_gemm() { return 0; } + +template +int test_batched_gemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=1;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if 
defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp index 24222cba2f..6d478923d8 100644 --- a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp @@ -1,3 +1,30 @@ +#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + 
test_batched_gemm_half(); +} +#endif // HAVE_KOKKOSKERNELS_HALFMATH + #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; diff --git a/unit_test/batched/Test_Batched_TeamGemm.hpp b/unit_test/batched/Test_Batched_TeamGemm.hpp index 7418361809..5ae23c4d12 100644 --- a/unit_test/batched/Test_Batched_TeamGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm.hpp @@ -78,7 +78,7 @@ namespace Test { typename ScalarType, typename ParamTagType, typename AlgoTagType> - void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + void impl_test_batched_teamgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { typedef typename ViewType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -130,63 +130,155 @@ namespace Test { } EXPECT_NEAR_KK( diff/sum, 0, eps); } + + template + void impl_test_batched_teamgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and 
copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamGemm(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * FP16_EPSILON; + + for (int k=0;k -int test_batched_gemm() { +int test_batched_teamgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if 
((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif @@ -194,3 +286,64 @@ int test_batched_gemm() { return 0; } +template +int test_batched_teamgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int 
i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp index abf7983966..2f66860ff4 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp @@ 
-6,32 +6,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -39,32 +39,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { TEST_F( TestCategory, 
batched_scalar_team_gemm_nt_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp index 065fb68c97..6dde7dcced 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp @@ -1,24 +1,50 @@ +#if 
defined(HAVE_KOKKOSKERNELS_HALFMATH) +TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +#endif #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif @@ -26,22 +52,22 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_double_double ) { typedef 
::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp new file mode 100644 index 0000000000..10c4a34fe9 --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp @@ -0,0 +1,346 @@ +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { + + template + struct ParamTag { + typedef TA transA; + typedef TB transB; + }; + + template + struct Functor_TestBatchedTeamVector { + ViewType _a, _b, _c; + + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVector(const ScalarType alpha, + const ViewType &a, + const ViewType &b, + const ScalarType beta, + const ViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int k = member.league_rank(); + + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamVectorGemm:: + invoke(member, _alpha, aa, bb, _beta, cc); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string 
name_region("KokkosBatched::Test::TeamVector"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _c.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a0("a0", N, matAdim1,matAdim2), a1("a1", N, matAdim1,matAdim2), + b0("b0", N, matBdim1,matBdim2), b1("b1", N, matBdim1,matBdim2), + c0("c0", N, matCdim1,matCdim2), c1("c1", N, matCdim1,matCdim2); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(a0, random, value_type(1.0)); + Kokkos::fill_random(b0, random, value_type(1.0)); + Kokkos::fill_random(c0, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a1, a0); + Kokkos::deep_copy(b1, b0); + Kokkos::deep_copy(c1, c0); + + /// test body + Functor_TestBatchedTeamVector(alpha, a0, b0, beta, c0).run(); + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror c0_host = Kokkos::create_mirror_view(c0); + typename ViewType::HostMirror c1_host = Kokkos::create_mirror_view(c1); + + Kokkos::deep_copy(c0_host, c0); + Kokkos::deep_copy(c1_host, c1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + 
const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int k=0;k + void impl_test_batched_teamvectorgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + 
Kokkos::deep_copy(c1, c_expected); + + //Functor_TestBatchedTeamVector(alpha, a_expected, b_expected, beta, c_expected).run(); + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. + // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * FP16_EPSILON; + + for (int k=0;k +int test_batched_teamvectorgemm() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int 
dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} + +template +int test_batched_teamvectorgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + 
for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp new file mode 100644 index 0000000000..4926d20670 --- /dev/null +++ 
b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp @@ -0,0 +1,53 @@ +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + 
test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp new file mode 100644 index 0000000000..bb71c062ab --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp @@ -0,0 +1,80 @@ +#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + 
//test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..8ac5c834bc --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp 
b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..27e7b3b565 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..9adfd61517 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..e841dea6a5 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..991031d817 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..cc2041cefc --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git 
a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..02b4d3681f --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..5c17d8df16 --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" From 9e4d23ed1f4d2f35707da5c0129ab952ff8e757a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 16 Oct 2020 08:44:22 -0600 Subject: [PATCH 051/106] Remove KokkosKernels half precision support - Replace half with half_t from Kokkos - Remove KokkosKernels half precision support on AMD --- CMakeLists.txt | 9 +- cmake/HalfPrecisionSupport.cmake | 31 +--- cmake/KokkosKernels_config.h.in | 8 +- src/KokkosKernels_Half.hpp | 67 +++++++ src/Kokkos_ArithTraits.hpp | 80 ++++---- src/Kokkos_HalfPrecision.hpp | 173 ------------------ test_common/KokkosKernels_TestUtils.hpp | 8 +- test_common/Test_Common_ArithTraits.hpp | 63 +++---- .../batched/Test_Batched_SerialGemm_Real.hpp | 4 +- .../batched/Test_Batched_TeamGemm_Real.hpp | 4 +- .../Test_Batched_TeamVectorGemm_Real.hpp | 4 +- 11 files changed, 147 insertions(+), 304 deletions(-) create mode 100644 src/KokkosKernels_Half.hpp delete mode 100644 src/Kokkos_HalfPrecision.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 35a72a489c..93f5058f57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,12 +182,9 @@ ELSE() MESSAGE(" Offsets: ${OFFSET_LIST}") MESSAGE(" Layouts: ${LAYOUT_LIST}") MESSAGE("") - IF(HAVE_KOKKOSKERNELS_HALFMATH) - 
MESSAGE("KokkosKernels Half Precision Types") - MESSAGE(" HAVE_FP16: ${HAVE_FP16}") - MESSAGE(" HAVE_CUDA_FP16: ${HAVE_CUDA_FP16}") - MESSAGE("") - ENDIF() + MESSAGE("KokkosKernels Half Precision Types") + MESSAGE(" HAVE_KOKKOS_HALFMATH: ${HAVE_KOKKOS_HALFMATH}") + MESSAGE("") MESSAGE("KokkosKernels TPLs") FOREACH(TPL ${KOKKOSKERNELS_TPL_LIST}) PAD_STRING("${TPL}:" TPL_PADDED 12) diff --git a/cmake/HalfPrecisionSupport.cmake b/cmake/HalfPrecisionSupport.cmake index 68971b1279..0e2b0fc0ce 100644 --- a/cmake/HalfPrecisionSupport.cmake +++ b/cmake/HalfPrecisionSupport.cmake @@ -1,29 +1,6 @@ -# Check whether the compiler defined the _Float16 type -# HAVE_KOKKOSKERNELS_FP16 is passed to C++ via KokkosKernels_config.h.in -INCLUDE(CheckTypeSize) -CHECK_TYPE_SIZE(_Float16 FP16 LANGUAGE CXX) -IF(HAVE_FP16) - SET(HAVE_KOKKOSKERNELS_FP16 ${HAVE_FP16}) -ENDIF() - -# Check whether the cuda_fp16.h header exists to infer that the __half type exists -# HAVE_KOKKOSKERNELS_CUDA_FP16 is passed to C++ via KokkosKernels_config.h.in -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(cuda_fp16.h HAVE_CUDA_FP16) -IF(HAVE_CUDA_FP16) - SET(HAVE_CUDA_FP16 TRUE) - SET(HAVE_KOKKOSKERNELS_CUDA_FP16 TRUE) +# Check whether Kokkos has half precision headers +IF(EXISTS ${Kokkos_DIR}/../../../include/Kokkos_Half.hpp) + SET(HAVE_KOKKOS_HALFMATH TRUE) ELSE() - SET(HAVE_CUDA_FP16 FALSE) -ENDIF() - -IF(HAVE_KOKKOSKERNELS_FP16 AND HAVE_KOKKOSKERNELS_CUDA_FP16) - MESSAGE(WARNING "'half' is set to 'device_fp16_t'. 
To use half precision on host, use 'host_fp16_t'.") - MESSAGE(WARNING "Use 'float' and 'host_fp16_t' to cast on host.") - MESSAGE(WARNING "Use '__half2float' and '__float2half' functions to cast on device.") -ENDIF() - -# HAVE_KOKKOSKERNELS_HALFMATH is passed to C++ via KokkosKernels_config.h.in -IF(HAVE_KOKKOSKERNELS_FP16 OR HAVE_KOKKOSKERNELS_CUDA_FP16) - SET(HAVE_KOKKOSKERNELS_HALFMATH TRUE) + SET(HAVE_KOKKOS_HALFMATH FALSE) ENDIF() diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 30b141ba38..a79c30427b 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -16,8 +16,8 @@ /* Define this macro if the quadmath TPL is enabled */ #cmakedefine HAVE_KOKKOSKERNELS_QUADMATH -/* Define this macro if half precision is supported by toolchain */ -#cmakedefine HAVE_KOKKOSKERNELS_HALFMATH +/* Define this macro if half precision is supported by kokkos */ +#cmakedefine HAVE_KOKKOS_HALFMATH /* Define this macro if the MKL TPL is enabled. This is different than just linking against the MKL to get the BLAS and LAPACK; it @@ -63,10 +63,6 @@ #define KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_ #endif -/* Whether the _Float16 type is defined by the toolchain */ -#cmakedefine HAVE_KOKKOSKERNELS_FP16 -/* Whether the __half type is defined by the toolchain */ -#cmakedefine HAVE_KOKKOSKERNELS_CUDA_FP16 /* Whether to build kernels for multivectors of LayoutLeft */ #cmakedefine KOKKOSKERNELS_INST_LAYOUTLEFT /* Whether to build kernels for multivectors of LayoutRight */ diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp new file mode 100644 index 0000000000..05d4053c50 --- /dev/null +++ b/src/KokkosKernels_Half.hpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_HALF_HPP +#define KOKKOSKERNELS_HALF_HPP + +#if defined(HAVE_KOKKOS_HALFMATH) +#include "Kokkos_Half.hpp" +#endif // HAVE_KOKKOS_HALFMATH + +namespace KokkosKernels { + namespace Experimental { + ////////////// BEGIN FP16/binary16 limits ////////////// + #define FP16_MAX 65504.0F // Maximum normalized number + #define FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number + #define FP16_RADIX 2 // Value of the base of the exponent representation. TODO: Confirm this + #define FP16_MANT_DIG 15 // Number of digits in the matissa that can be represented without losing precision. TODO: Confirm this + #define FP16_MIN_EXP -14 // This is the smallest possible exponent value + #define FP16_MAX_EXP 15 // This is the largest possible exponent value + #define FP16_SIGNIFICAND_BITS 10 + #define FP16_EPSILON 0.0009765625F + #define HUGE_VALH 0x7c00 // bits [10,14] set. + ////////////// END FP16/binary16 limits ////////////// + } // Experimental +} // KokkosKernels +#endif // KOKKOSKERNELS_HALF_HPP diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 9040103bfe..c001b5bdce 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -50,7 +50,7 @@ #include #include -#include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH # include @@ -675,15 +675,15 @@ class ArithTraits { //@} }; -/** - * Currently, all the ArithTraits member will only work on either the host - * due toolchain support of half precision types. 
- */ -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +// Since Kokkos::Experimental::half_t falls back to float, only define +// ArithTraits on supported execution spaces +#if defined(HAVE_KOKKOS_HALFMATH) &&\ + defined(KOKKOS_ENABLE_CUDA) +// defined(Kokkos::Experimental::HALF_IMPL_TYPE) template<> -class ArithTraits { +class ArithTraits { public: - typedef KokkosKernels::Experimental::half val_type; + typedef Kokkos::Experimental::half_t val_type; typedef val_type mag_type; static const bool is_specialized = true; @@ -693,92 +693,93 @@ class ArithTraits { static const bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return KokkosKernels::Experimental::__cast2half(HUGE_VALF); } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifndef __CUDA_ARCH__ using std::isinf; #endif - return isinf (KokkosKernels::Experimental::__cast2float(x)); + return isinf (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifndef __CUDA_ARCH__ using std::isnan; #endif - return isnan(KokkosKernels::Experimental::__cast2float(x)); + return isnan(Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { - return KokkosKernels::Experimental::__cast2float(fabs(KokkosKernels::Experimental::__cast2float(x))); + return fabs(Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type zero () { - return KokkosKernels::Experimental::__cast2float(0.0F); + return 0.0F; } static KOKKOS_FORCEINLINE_FUNCTION val_type one () { - return KokkosKernels::Experimental::__cast2float(1.0F); + return 1.0F; } static KOKKOS_FORCEINLINE_FUNCTION val_type min () { - return KokkosKernels::Experimental::__cast2float(-FP16_MAX); + return -FP16_MAX; } static KOKKOS_FORCEINLINE_FUNCTION val_type max () { - return 
KokkosKernels::Experimental::__cast2float(FP16_MAX); + return FP16_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION mag_type imag (const val_type) { - return KokkosKernels::Experimental::__cast2float(0.0F); + return 0.0F; } static KOKKOS_FORCEINLINE_FUNCTION val_type conj (const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const val_type y) { - return KokkosKernels::Experimental::__cast2float(::pow (KokkosKernels::Experimental::__cast2float(x), KokkosKernels::Experimental::__cast2float(y))); + return ::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y)); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::sqrt (KokkosKernels::Experimental::__cast2float(x))); + return ::sqrt (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::cbrt (KokkosKernels::Experimental::__cast2float(x))); + return ::cbrt (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::exp (KokkosKernels::Experimental::__cast2float(x))); + return ::exp (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::log (KokkosKernels::Experimental::__cast2float(x))); + return ::log (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::log10 (KokkosKernels::Experimental::__cast2float(x))); + return ::log10 (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - return 
KokkosKernels::Experimental::__cast2float(::sin (KokkosKernels::Experimental::__cast2float(x))); + return ::sin (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::cos (KokkosKernels::Experimental::__cast2float(x))); + return ::cos (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::tan (KokkosKernels::Experimental::__cast2float(x))); + return ::tan (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::sinh (KokkosKernels::Experimental::__cast2float(x))); + return ::sinh (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::cosh (KokkosKernels::Experimental::__cast2float(x))); + return ::cosh (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::tanh (KokkosKernels::Experimental::__cast2float(x))); + return ::tanh (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::asin (KokkosKernels::Experimental::__cast2float(x))); + return ::asin (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::acos (KokkosKernels::Experimental::__cast2float(x))); + return ::acos (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return KokkosKernels::Experimental::__cast2float(::atan (KokkosKernels::Experimental::__cast2float(x))); + 
return ::atan (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { //return ::pow(2, -FP16_SIGNIFICAND_BITS); - return KokkosKernels::Experimental::__cast2half(FP16_EPSILON); + return FP16_EPSILON; } // Backwards compatibility with Teuchos::ScalarTraits. typedef mag_type magnitudeType; @@ -794,22 +795,22 @@ class ArithTraits { return isNan (x) || isInf (x); } static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude (const val_type x) { - return KokkosKernels::Experimental::__cast2float(abs (KokkosKernels::Experimental::__cast2float(x))); + return abs (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate (const val_type x) { - return KokkosKernels::Experimental::__cast2float(conj (KokkosKernels::Experimental::__cast2float(x))); + return conj (Kokkos::Experimental::cast_from_half(x)); } static std::string name () { return "half"; } static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - return KokkosKernels::Experimental::__cast2float(sqrt (KokkosKernels::Experimental::__cast2float(x))); + return sqrt (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { #ifdef __CUDA_ARCH__ - return KokkosKernels::Experimental::__cast2half(CUDART_NAN_F); + return CUDART_NAN_F; #else - return KokkosKernels::Experimental::__cast2half(std::numeric_limits::quiet_NaN()); + return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps () { @@ -847,7 +848,7 @@ class ArithTraits { return FP16_MAX; } }; -#endif // HAVE_KOKKOSKERNELS_HALFMATH +#endif // HAVE_KOKKOS_HALFMATH && KOKKOS_ENABLE_CUDA template<> class ArithTraits { @@ -1016,7 +1017,6 @@ class ArithTraits { } }; - /// \brief Partial specialization for std::complex. 
/// /// The C++ Standard Library (with C++03 at least) only allows diff --git a/src/Kokkos_HalfPrecision.hpp b/src/Kokkos_HalfPrecision.hpp deleted file mode 100644 index 89318270f5..0000000000 --- a/src/Kokkos_HalfPrecision.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_HALFPRECISION_HPP -#define KOKKOS_HALFPRECISION_HPP - -#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) -#include -#endif - -namespace KokkosKernels { - namespace Experimental { - /** - * Below we check whether the given toolchain has support for portable IEEE-754 - * FP16 (binary16) precision types. The checks are done via CMake which passes the - * results via a KOKKOSKERNELS_HAVE define to KokkosKernels_config.h - * - * First we check for cuda half precision support (HAVE_KOKKOSKERNELS_CUDA_FP16). - * Second we check for host half precision support (HAVE_KOKKOSKERNELS_FP16). - * Lastly, we fall back to single precision support. - * - * NOTE: If both cuda and host support half precision, the half type will - * default to device_fp16_t. 
- */ - #if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) - using device_fp16_t = __half; - using half = device_fp16_t; - #if defined(HAVE_KOKKOSKERNELS_FP16) - using host_fp16_t = _Float16; - #else - using host_fp16_t = float; - #endif // defined(HAVE_KOKKOSKERNELS_FP16) - static KOKKOS_FORCEINLINE_FUNCTION float __cast2float(device_fp16_t x) { return __half2float(x); } - static KOKKOS_FORCEINLINE_FUNCTION device_fp16_t __cast2half(float x) { return __float2half(x); } - - #else // defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) - #if defined(HAVE_KOKKOSKERNELS_FP16) - using host_fp16_t = _Float16; - using half = host_fp16_t; - #else // defined(HAVE_KOKKOSKERNELS_FP16) - using host_fp16_t = float; - using device_fp16_t = host_fp16_t; - using half = host_fp16_t; - #endif // _Float16 - static inline float __cast2float(host_fp16_t x) { return (float) x; } - static inline host_fp16_t __cast2half(float x) { return (host_fp16_t) x; } - #endif - ////////////// BEGIN half2float and float2half overloads ////////////// - /** - * Since kokkos does not have support for half precision types yet, we - * must cast to/from float in some kokkos-kernels routines. Except for - * the overloads below that actually cast to/from half precision types, - * the others should be optimized away by the compiler. 
- */ - // host_fp16_t -#if defined(HAVE_KOKKOSKERNELS_FP16) - static inline - float half2float(host_fp16_t x, float &ret) { - ret = __cast2float(x); - return ret; - } - static inline - host_fp16_t float2half(float x, host_fp16_t &ret) { - ret = __cast2half(x); - return ret; - } -#endif - // device_fp16_t -#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) - static inline __device__ - float half2float(device_fp16_t x, float &ret) { - ret = __cast2float(x); - return ret; - } - static inline __device__ - device_fp16_t float2half(float x, device_fp16_t &ret) { - ret = __cast2half(x); - return ret; - } -#endif - // float - static KOKKOS_FORCEINLINE_FUNCTION float half2float(float x, float &ret) { - ret = x; - return ret; - } - static KOKKOS_FORCEINLINE_FUNCTION float float2half(float x, float &ret) { - ret = x; - return ret; - } - // complex float - static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex half2float(Kokkos::complex x, Kokkos::complex &ret) { - ret = x; - return ret; - } - static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex float2half(Kokkos::complex x, Kokkos::complex &ret) { - ret = x; - return ret; - } - // double - static KOKKOS_FORCEINLINE_FUNCTION double half2float(double x, double &ret) { - ret = x; - return ret; - } - static KOKKOS_FORCEINLINE_FUNCTION double float2half(double x, double &ret) { - ret = x; - return ret; - } - // complex double - static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex half2float(Kokkos::complex x, Kokkos::complex &ret) { - ret = x; - return ret; - } - static KOKKOS_FORCEINLINE_FUNCTION Kokkos::complex float2half(Kokkos::complex x, Kokkos::complex &ret) { - ret = x; - return ret; - } - ////////////// END half2float and float2half overloads ////////////// - - ////////////// BEGIN FP16/binary16 limits ////////////// - #define FP16_MAX 65504.0F // Maximum normalized number - #define FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number - #define FP16_RADIX 2 // Value of the base of 
the exponent representation. TODO: Confirm this - #define FP16_MANT_DIG 15 // Number of digits in the matissa that can be represented without losing precision. TODO: Confirm this - #define FP16_MIN_EXP -14 // This is the smallest possible exponent value - #define FP16_MAX_EXP 15 // This is the largest possible exponent value - #define FP16_SIGNIFICAND_BITS 10 - #define FP16_EPSILON 0.0009765625F - #define HUGE_VALH 0x7c00 // bits [10,14] set. - ////////////// END FP16/binary16 limits ////////////// - } // Experimental -} // KokkosKernels -#endif // KOKKOS_HALFPRECISION_HPP diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index f63cdb0495..23b8030342 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -106,11 +106,9 @@ namespace Test { } } - #if defined(KOKKOS_ENABLE_CUDA) - using halfScalarType = typename std::conditional::value, KokkosKernels::Experimental::device_fp16_t, KokkosKernels::Experimental::host_fp16_t>::type; - #else - using halfScalarType = KokkosKernels::Experimental::host_fp16_t; - #endif // KOKKOS_ENABLE_CUDA + #if defined(HAVE_KOKKOS_HALFMATH) + using halfScalarType = Kokkos::Experimental::half_t; + #endif // HAVE_KOKKOS_HALFMATH template struct SharedVanillaGEMM { diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 5c93a39445..32e36215ab 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -65,7 +65,7 @@ #define FAILURE() {printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0;} -#if 0 +#if 1 #define TRACE() printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); #else #define TRACE() @@ -1034,23 +1034,15 @@ class ArithTraitsTesterComplexBase : typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; - -#if defined(HAVE_KOKKOSKERNELS_CUDA_FP16) &&\ - defined(__CUDA_ARCH__) 
- if(std::is_same::value) { - if (AT::is_signed != 0x1) - FAILURE(); - } else -#endif // HAVE_KOKKOSKERNELS_CUDA_FP16 - { - // Apparently, std::numeric_limits::is_signed is 1 - // only for real numbers. - if (AT::is_signed != std::numeric_limits::is_signed) { - printf("AT::is_signed = 0x%x, std::numeric_limits::is_signed = 0x%x\n", - AT::is_signed, - std::numeric_limits::is_signed); - FAILURE(); - } + + // Apparently, std::numeric_limits::is_signed is 1 + // only for real numbers. + if (AT::is_signed != std::numeric_limits::is_signed) { + printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); + FAILURE(); } if (AT::is_complex) { @@ -1560,24 +1552,13 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) // Built-in real floating-point types // -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) &&\ - defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_KOKKOSKERNELS_CUDA_FP16) - if (std::is_same::value) { - TRACE(); - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); - } else { -#if defined(HAVE_KOKKOSKERNELS_FP16) - TRACE(); - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); -#endif // HAVE_KOKKOSKERNELS_FP16 - } -#endif // HAVE_KOKKOSKERNELS_HALFMATH +#if defined(HAVE_KOKKOS_HALFMATH) + TRACE(); + success = success && curSuccess; + curSuccess = + testArithTraitsOnDevice( + out, verbose); +#endif // HAVE_KOKKOS_HALFMATH success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); @@ -1655,12 +1636,12 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) // Kokkos' complex floating-point types // -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) && defined(HAVE_KOKKOSKERNELS_FP16) - success = success && curSuccess; +#if defined(HAVE_KOKKOS_HALFMATH) + success = success && curSuccess; TRACE(); 
- curSuccess = testArithTraitsOnHost(out, verbose); -#endif // HAVE_KOKKOSKERNELS_HALFMATH + curSuccess = testArithTraitsOnHost( + out, verbose); +#endif // HAVE_KOKKOS_HALFMATH success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); //success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); diff --git a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp index 6d478923d8..5ef1df82bf 100644 --- a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +#if defined(HAVE_KOKKOS_HALFMATH) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_serial_gemm_t_t_half_half ) { test_batched_gemm_half(); test_batched_gemm_half(); } -#endif // HAVE_KOKKOSKERNELS_HALFMATH +#endif // HAVE_KOKKOS_HALFMATH #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_float_float ) { diff --git a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp index 6dde7dcced..e8fe47b202 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +#if defined(HAVE_KOKKOS_HALFMATH) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_half_half ) { test_batched_teamgemm_half(); test_batched_teamgemm_half(); } -#endif +#endif // HAVE_KOKKOS_HALFMATH #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, 
batched_scalar_team_gemm_nt_nt_float_float ) { diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp index bb71c062ab..747c483b97 100644 --- a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOSKERNELS_HALFMATH) +#if defined(HAVE_KOKKOS_HALFMATH) TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_half_half ) { //test_batched_teamvectorgemm_half(); test_batched_teamvectorgemm_half(); } -#endif +#endif // HAVE_KOKKOS_HALFMATH #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float ) { From 2f19ee98b10e3c7c8a32e068ed82ed399283d611 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 16 Oct 2020 15:15:14 -0700 Subject: [PATCH 052/106] Add ArithTraits signed work around back --- test_common/Test_Common_ArithTraits.hpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 32e36215ab..d447a1fa92 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -65,7 +65,7 @@ #define FAILURE() {printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0;} -#if 1 +#if 0 #define TRACE() printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); #else #define TRACE() @@ -1037,13 +1037,22 @@ class ArithTraitsTesterComplexBase : // Apparently, std::numeric_limits::is_signed is 1 // only for real numbers. 
- if (AT::is_signed != std::numeric_limits::is_signed) { - printf( - "AT::is_signed = 0x%x, std::numeric_limits::is_signed " - "= 0x%x\n", - AT::is_signed, std::numeric_limits::is_signed); - FAILURE(); +#if defined(HAVE_KOKKOS_HALFMATH) + if (std::is_same::value) { + if (AT::is_signed != 0x1) + FAILURE(); + } else +#else + { + if (AT::is_signed != std::numeric_limits::is_signed) { + printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); + FAILURE(); + } } +#endif // HAVE_KOKKOS_HALFMATH if (AT::is_complex) { FAILURE(); From 88c1608896307b6c51dbe744b600010a176e1cbb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 20 Oct 2020 10:13:00 -0700 Subject: [PATCH 053/106] Fix clang-8 openmp spot-check --- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 42529155fa..e73837e3a4 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -1122,8 +1122,6 @@ void sptrsv_supernodal_symbolic( using range_type = Kokkos::pair; using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; - using Side = KokkosBatched::Side; - using Trans = KokkosBatched::Trans; int s = supernode_ids(i); int j1 = nb[s]; From f0eafbd96bdcc94c1999a37ee63f98c43b7bfd9a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 21 Oct 2020 09:32:11 -0700 Subject: [PATCH 054/106] Ensure half_t is device type in ArithTraits --- src/Kokkos_ArithTraits.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index c001b5bdce..3738a3799f 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -676,11 +676,9 @@ class ArithTraits { }; // Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits on supported execution spaces 
-#if defined(HAVE_KOKKOS_HALFMATH) &&\ - defined(KOKKOS_ENABLE_CUDA) -// defined(Kokkos::Experimental::HALF_IMPL_TYPE) -template<> +// ArithTraits if an IMPL type exists +#if defined(HAVE_KOKKOS_HALFMATH) && defined(HALF_IMPL_TYPE) +template <> class ArithTraits { public: typedef Kokkos::Experimental::half_t val_type; From 281e21cbc29bdb06372c8241866595163a6b9a98 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 21 Oct 2020 12:34:09 -0600 Subject: [PATCH 055/106] Fixes -Werror for gcc with c++20 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves messages of form: error: implicit capture of ‘this’ via ‘[=]’ is deprecated in C++20 [-Werror=deprecated] --- src/common/KokkosKernels_Sorting.hpp | 6 +++--- src/graph/KokkosGraph_Distance1ColorHandle.hpp | 2 +- .../impl/KokkosSparse_cluster_gauss_seidel_impl.hpp | 2 +- src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 8 ++++---- src/sparse/impl/KokkosSparse_partitioning_impl.hpp | 8 ++++---- src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 +- .../impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index d9346aba61..be37765594 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -431,7 +431,7 @@ struct BitonicPhase1Functor Ordinal workStart = work * (t.league_rank() % teamsPerBox); Ordinal workReflect = boxSize - workStart - 1; Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [=](const Ordinal i) + [&](const Ordinal i) { Ordinal elem1 = boxStart + workStart + i; Ordinal elem2 = boxStart + workReflect - i; @@ -471,7 +471,7 @@ struct BitonicPhase2Functor Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); Ordinal jump = boxSize / 2; Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [=](const Ordinal i) + [&](const Ordinal i) { Ordinal elem1 
= workStart + i; Ordinal elem2 = workStart + jump + i; @@ -495,7 +495,7 @@ struct BitonicPhase2Functor Ordinal logSubBoxSize = logBoxSize - subLevel; Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [=](const Ordinal i) + [&](const Ordinal i) { Ordinal globalThread = i + t.league_rank() * work; Ordinal subBox = globalThread >> (logSubBoxSize - 1); diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 268c8e6a68..ca50a4f891 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -357,7 +357,7 @@ class GraphColoringHandle } }, new_edge_count); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { lower_xadj_counts(ii + 1) = new_edge_count; }); } diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 6a4c6caf29..bb1f96c4e3 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -312,7 +312,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _inverse_diagonal(row); for(int i = 0; i < N; i++) diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 5c50815f34..03eef00e4d 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -276,7 +276,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + 
Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(row); for(int i = 0; i < N; i++) @@ -420,7 +420,7 @@ namespace KokkosSparse{ product += product2; //update the new vector entries. - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); _Xvector(block_row_index, vec) += omega * (_Yvector(block_row_index, vec) - product) * invDiagonalVal; @@ -484,7 +484,7 @@ namespace KokkosSparse{ Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&] (const nnz_lno_t& ii) { #if KOKKOSSPARSE_IMPL_PRINTDEBUG - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { for(nnz_lno_t i = 0; i < block_size; diagonal_positions[i++] = -1); }); #endif @@ -542,7 +542,7 @@ namespace KokkosSparse{ valueToUpdate += all_shared_memory[colind] * _adj_vals(current_row_begin + colind); }, product); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index 86c34a1eee..0ef887d80e 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -253,7 +253,7 @@ struct RCM int next = 1; nnz_lno_t visitCounter = 0; Kokkos::single(Kokkos::PerTeam(mem), - [=]() + [&]() { workQueue(active, 0) = start; visit(start) = QUEUED; @@ -337,7 +337,7 @@ struct RCM if(visitCounter < numRows && activeQSize == 0) { Kokkos::single(Kokkos::PerTeam(mem), - [=]() + [&]() { //Some nodes are unreachable from start (graph not connected) //Find an 
unvisited node to resume BFS @@ -356,7 +356,7 @@ struct RCM level++; } Kokkos::single(Kokkos::PerTeam(mem), - [=] + [&] { numLevels() = level - 1; }); @@ -447,7 +447,7 @@ struct RCM } mem.team_barrier(); Kokkos::single(Kokkos::PerTeam(mem), - [=]() + [&]() { radixSortKeysAndValues (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 095cef74b5..3f29c39e4e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -654,7 +654,7 @@ struct KokkosSPGEMM if (c_row_size > max_first_level_hash_size){ { while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 70b1d05391..a32d6689b9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -816,7 +816,7 @@ namespace KokkosSparse{ // Initialize hashmaps if (c_row_size > max_first_level_hash_size){ while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } From 6c00ae3354b9be7bc22c3442b4cfdeaef0df3a82 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 13:10:33 -0600 Subject: [PATCH 056/106] Throw an exception if BLAS GESV is not enabled We don't have fallback implementation for GESV. If no TPL provides it, throw an exception. 
--- src/blas/impl/KokkosBlas_gesv_spec.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blas/impl/KokkosBlas_gesv_spec.hpp b/src/blas/impl/KokkosBlas_gesv_spec.hpp index e1e0b77f67..8f2d171436 100644 --- a/src/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/src/blas/impl/KokkosBlas_gesv_spec.hpp @@ -118,6 +118,7 @@ struct GESV{ const IPIVV& IPIV) { //NOTE: Might add the implementation of KokkosBlas::gesv later + throw std::runtime_error("No fallback implementation of GESV (general LU factorization & solve) exists. Enable BLAS and/or MAGMA TPL."); } }; From 57a0c83d78cbbdbf571bacef2040bcbdfe178d56 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 26 Oct 2020 12:26:35 -0700 Subject: [PATCH 057/106] Implement PR feedback --- src/KokkosKernels_Half.hpp | 18 +++++++------- src/Kokkos_ArithTraits.hpp | 24 +++++++++---------- unit_test/batched/Test_Batched_SerialGemm.hpp | 2 +- unit_test/batched/Test_Batched_TeamGemm.hpp | 2 +- .../batched/Test_Batched_TeamVectorGemm.hpp | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp index 05d4053c50..5db55cf514 100644 --- a/src/KokkosKernels_Half.hpp +++ b/src/KokkosKernels_Half.hpp @@ -52,15 +52,15 @@ namespace KokkosKernels { namespace Experimental { ////////////// BEGIN FP16/binary16 limits ////////////// - #define FP16_MAX 65504.0F // Maximum normalized number - #define FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number - #define FP16_RADIX 2 // Value of the base of the exponent representation. TODO: Confirm this - #define FP16_MANT_DIG 15 // Number of digits in the matissa that can be represented without losing precision. TODO: Confirm this - #define FP16_MIN_EXP -14 // This is the smallest possible exponent value - #define FP16_MAX_EXP 15 // This is the largest possible exponent value - #define FP16_SIGNIFICAND_BITS 10 - #define FP16_EPSILON 0.0009765625F - #define HUGE_VALH 0x7c00 // bits [10,14] set. 
+ #define KOKKOSKERNELS_IMPL_FP16_MAX 65504.0F // Maximum normalized number + #define KOKKOSKERNELS_IMPL_FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number + #define KOKKOSKERNELS_IMPL_FP16_RADIX 2 // Value of the base of the exponent representation. TODO: Confirm this + #define KOKKOSKERNELS_IMPL_FP16_MANT_DIG 15 // Number of digits in the matissa that can be represented without losing precision. TODO: Confirm this + #define KOKKOSKERNELS_IMPL_FP16_MIN_EXP -14 // This is the smallest possible exponent value + #define KOKKOSKERNELS_IMPL_FP16_MAX_EXP 15 // This is the largest possible exponent value + #define KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS 10 + #define KOKKOSKERNELS_IMPL_FP16_EPSILON 0.0009765625F + #define KOKKOSKERNELS_IMPL_HUGE_VALH 0x7c00 // bits [10,14] set. ////////////// END FP16/binary16 limits ////////////// } // Experimental } // KokkosKernels diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3738a3799f..d0e36b443c 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -715,10 +715,10 @@ class ArithTraits { return 1.0F; } static KOKKOS_FORCEINLINE_FUNCTION val_type min () { - return -FP16_MAX; + return -KOKKOSKERNELS_IMPL_FP16_MAX; } static KOKKOS_FORCEINLINE_FUNCTION val_type max () { - return FP16_MAX; + return KOKKOSKERNELS_IMPL_FP16_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { return x; @@ -776,8 +776,8 @@ class ArithTraits { return ::atan (Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { - //return ::pow(2, -FP16_SIGNIFICAND_BITS); - return FP16_EPSILON; + //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); + return KOKKOSKERNELS_IMPL_FP16_EPSILON; } // Backwards compatibility with Teuchos::ScalarTraits. 
typedef mag_type magnitudeType; @@ -815,35 +815,35 @@ class ArithTraits { return epsilon (); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin () { - return FP16_MIN; + return KOKKOSKERNELS_IMPL_FP16_MIN; } static KOKKOS_FORCEINLINE_FUNCTION int base () { - return FP16_RADIX; + return KOKKOSKERNELS_IMPL_FP16_RADIX; } // Use float to allow running on both host and device static KOKKOS_FORCEINLINE_FUNCTION float prec () { - float e = FP16_EPSILON; + float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; float b = (float) base(); float r = e * b; return r; } static KOKKOS_FORCEINLINE_FUNCTION int t () { - return FP16_MANT_DIG; + return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd () { return 1.0; } static KOKKOS_FORCEINLINE_FUNCTION int emin () { - return FP16_MIN_EXP; + return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin () { - return FP16_MIN; + return KOKKOSKERNELS_IMPL_FP16_MIN; } static KOKKOS_FORCEINLINE_FUNCTION int emax () { - return FP16_MAX_EXP; + return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax () { - return FP16_MAX; + return KOKKOSKERNELS_IMPL_FP16_MAX; } }; #endif // HAVE_KOKKOS_HALFMATH && KOKKOS_ENABLE_CUDA diff --git a/unit_test/batched/Test_Batched_SerialGemm.hpp b/unit_test/batched/Test_Batched_SerialGemm.hpp index c38bfcda11..3d0ae712cb 100644 --- a/unit_test/batched/Test_Batched_SerialGemm.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm.hpp @@ -146,7 +146,7 @@ template Date: Tue, 6 Oct 2020 11:49:39 -0600 Subject: [PATCH 058/106] WIP: adding HIP codepaths in preparation for tests/ETI --- perf_test/graph/KokkosGraph_color.cpp | 9 + perf_test/graph/KokkosGraph_color_d2.cpp | 23 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 18 +- perf_test/graph/KokkosGraph_triangle.cpp | 19 +- perf_test/sparse/KokkosSparse_pcg.cpp | 291 ++--- perf_test/sparse/KokkosSparse_spadd.cpp | 2 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 17 +- 
src/Kokkos_ArithTraits.hpp | 11 +- .../KokkosBatched_Gemm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 2 +- .../KokkosBatched_Trsv_Team_Internal.hpp | 2 +- src/batched/KokkosBatched_Util.hpp | 2 +- src/batched/KokkosBatched_Vector.hpp | 38 + src/batched/KokkosBatched_Vector_SIMD.hpp | 10 +- .../KokkosBatched_Vector_SIMD_Arith.hpp | 8 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 4 +- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 7 + src/blas/impl/KokkosBlas3_gemm_spec.hpp | 4 + src/common/KokkosKernels_BitUtils.hpp | 1 + src/common/KokkosKernels_ExecSpaceUtils.hpp | 86 +- src/common/KokkosKernels_Handle.hpp | 2 +- src/common/KokkosKernels_Macros.hpp | 4 +- src/common/KokkosKernels_SparseUtils.hpp | 168 +-- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 3 +- src/common/KokkosKernels_Utils.hpp | 67 +- src/common/KokkosKernels_default_types.hpp | 2 + .../KokkosGraph_Distance1ColorHandle.hpp | 70 +- .../KokkosGraph_Distance2ColorHandle.hpp | 62 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 2 +- src/sparse/KokkosSparse_CrsMatrix.hpp | 6 + .../KokkosSparse_gauss_seidel_handle.hpp | 74 +- src/sparse/KokkosSparse_spadd.hpp | 61 - src/sparse/KokkosSparse_spgemm_handle.hpp | 76 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 47 +- .../impl/KokkosSparse_partitioning_impl.hpp | 529 --------- .../KokkosSparse_spgemm_impl_compression.hpp | 53 +- .../impl/KokkosSparse_spgemm_impl_def.hpp | 5 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 26 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 8 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 86 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 48 +- ...se_spgemm_impl_triangle_no_compression.hpp | 46 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 26 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 451 +++----- .../impl/KokkosSparse_spmv_struct_impl.hpp | 1005 ++++++++--------- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 17 + 
test_common/KokkosKernels_TestParameters.hpp | 2 + 48 files changed, 1312 insertions(+), 2228 deletions(-) diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cbc3697517..f7d8a93e80 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -579,6 +579,15 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_experiment + ( + params + ); + } +#endif + #if defined( KOKKOS_ENABLE_SERIAL ) if (params.use_serial) { #ifdef KOKKOSKERNELS_MULTI_MEM diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 970bafa380..04d977527d 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -81,6 +81,7 @@ struct D2Parameters int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; const char* mtx_file; ColoringMode d2_color_type; @@ -93,6 +94,7 @@ struct D2Parameters use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; mtx_file = NULL; d2_color_type = MODE_D2_SYMMETRIC; @@ -147,6 +149,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use given CUDA device" << std::endl +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use given HIP device" << std::endl #endif << std::endl << spaces << " Coloring modes:" << std::endl @@ -199,6 +204,10 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -273,7 +282,7 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } 
- if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -603,6 +612,8 @@ int main(int argc, char *argv[]) int device_id = 0; if(params.use_cuda) device_id = params.use_cuda - 1; + else if(params.use_hip) + device_id = params.use_hip - 1; Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); // Print out verbose information about the configuration of the run. @@ -645,6 +656,16 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + if(!use_multi_mem) + { + KokkosKernels::Experiment::experiment_driver(params); + } + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index da9fb549d6..32ff5f5fbd 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -75,6 +75,7 @@ struct MIS2Parameters int use_threads = 0; int use_openmp = 0; int use_cuda = 0; + int use_hip = 0; int use_serial = 0; const char* mtx_file = NULL; MIS2_Algorithm algo = MIS2_FAST; @@ -163,6 +164,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use CUDA.\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use HIP.\n" #endif << std::endl << spaces << " Optional Parameters:" << std::endl @@ -205,6 +209,10 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1; } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1; + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -252,7 +260,7 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } - 
if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -362,6 +370,14 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + run_mis2(params); + run = true; + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 6f0b6c73df..63a52dbaea 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -54,7 +54,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda" << std::endl; + std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda | --hip" << std::endl; std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; std::cerr << "\tInput Matrix format can be multiple formats. If it ends with:" << std::endl; std::cerr << "\t\t.mtx: it will read matrix market format." 
<< std::endl; @@ -96,6 +96,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi( argv[++i] ); } @@ -292,7 +295,6 @@ int main (int argc, char ** argv){ const int device_id = 0; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); -#if !defined (KOKKOS_ENABLE_CUDA) #if defined( KOKKOS_ENABLE_OPENMP ) if (params.use_openmp) { @@ -311,10 +313,9 @@ int main (int argc, char ** argv){ } #endif -#endif -#if defined( KOKKOS_ENABLE_CUDA1 ) +#if defined( KOKKOS_ENABLE_CUDA ) if (params.use_cuda) { Kokkos::Cuda::print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM @@ -332,6 +333,16 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + Kokkos::Experimental::HIP::print_configuration(std::cout); + KokkosKernels::Experiment::run_multi_mem_triangle + ( + params + ); + } +#endif + Kokkos::finalize(); return 0; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 681327dfaf..0f6351189b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -43,32 +43,24 @@ */ #include -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_default_types.hpp" +#include #define MAXVAL 1 -#define SIZE_TYPE size_t -#define INDEX_TYPE int -#define SCALAR_TYPE double - - - template -scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ +scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0){ scalar_view_t kok_x ("X", nv); 
typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view (kok_x); - for (INDEX_TYPE i = 0; i < nv; ++i){ - SCALAR_TYPE r = static_cast (rand()) / static_cast (RAND_MAX / max_value); + for (default_lno_t i = 0; i < nv; ++i){ + default_scalar r = static_cast (rand()) / static_cast (RAND_MAX / max_value); h_x(i) = r; } Kokkos::deep_copy (kok_x, h_x); @@ -98,7 +90,7 @@ void run_experiment( typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - INDEX_TYPE nv = crsmat.numRows(); + default_lno_t nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); @@ -255,25 +247,70 @@ void run_experiment( */ } - - - enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_HIP , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV + , CMD_DEVICE , CMD_BIN_MTX , CMD_CLUSTER_SIZE , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; +template +void run_pcg(int* cmdline, const char* mtx_file) +{ + default_lno_t nv = 0, ne = 0; + default_lno_t *xadj, *adj; + default_scalar *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_file); + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); + + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (default_lno_t i = 0; i <= nv; 
++i){ + hr(i) = xadj[i]; + } + + for (default_lno_t i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + + delete [] xadj; + delete [] adj; + delete [] ew; + + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); +} + int main (int argc, char ** argv){ int cmdline[ CMD_COUNT ] ; - char *mtx_bin_file = NULL; + char *mtx_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; for ( int i = 1 ; i < argc ; ++i ) { @@ -283,17 +320,22 @@ int main (int argc, char ** argv){ else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); } + /* else if ( 0 == strcasecmp( argv[i] , "--cores" ) ) { + //Note BMK: specifying #NUMA regions isn't supported by initialize sscanf( argv[++i] , "%dx%d" , cmdline + CMD_USE_NUMA , cmdline + CMD_USE_CORE_PER_NUMA ); } + */ else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { cmdline[ CMD_USE_CUDA ] = 1 ; } - else if ( 0 == strcasecmp( argv[i] , "--cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + cmdline[ CMD_USE_HIP ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--device-id" ) ) { + cmdline[ CMD_DEVICE ] = atoi( argv[++i] ) ; } else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); @@ -303,12 +345,12 @@ int main (int argc, char ** argv){ } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { - mtx_bin_file = argv[++i]; + mtx_file = argv[++i]; } else { cmdline[ CMD_ERROR ] = 1 ; std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp 
[numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; return 0; } @@ -317,190 +359,43 @@ int main (int argc, char ** argv){ if(cmdline[CMD_CLUSTER_SIZE] == 0) cmdline[CMD_CLUSTER_SIZE] = 1; - if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + if (mtx_file == NULL){ + std::cerr << "Provide a matrix file" << std::endl ; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[matrix]" << std::endl; return 0; } + Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space -#if defined( KOKKOS_ENABLE_THREADS ) - - if ( cmdline[ CMD_USE_THREADS ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; // How to get this to initialize() without using impl_initialize()? 
- } - else { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } + init_args.device_id = cmdline[ CMD_DEVICE ]; + if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { + init_args.num_threads = std::max(cmdline[ CMD_USE_THREADS ], cmdline [ CMD_USE_OPENMP ]); + init_args.num_numa = cmdline[ CMD_USE_NUMA ]; + } + else { + init_args.num_threads = cmdline[ CMD_USE_THREADS ]; + } - Kokkos::finalize(); - } + Kokkos::initialize( init_args ); + { +#if defined( KOKKOS_ENABLE_THREADS ) + if(cmdline[CMD_USE_THREADS]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ 
CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; - } - else { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_OPENMP]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - // Use the last device: - init_args.device_id = cmdline[ CMD_USE_CUDA_DEV ]; - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE 
*xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - // typedef typename KokkosSparse::CrsMatrix crsMat_t; - // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_CUDA]) + run_pcg(cmdline, mtx_file); #endif - +#if defined( KOKKOS_ENABLE_HIP ) + if(cmdline[CMD_USE_HIP]) + run_pcg(cmdline, mtx_file); +#endif + } + Kokkos::finalize(); return 0; } -#else -int main() { -} -#endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f90c6179f7..959e9d973c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ 
b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -60,7 +60,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 80e4ab7c34..0f1c9f6210 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -52,7 +52,7 @@ void print_options(){ std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; + std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; std::cerr << "\t[Optional] '--algorithm [DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use KKSPGEMM instead." 
<< std::endl; std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB" << std::endl; std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> to write output C=AxB" << std::endl; @@ -84,6 +84,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -297,7 +300,7 @@ int main (int argc, char ** argv){ } const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = params.use_cuda - 1; + const int device_id = params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -336,6 +339,16 @@ int main (int argc, char ** argv){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_spgemm + ( + params + ); + + } +#endif + #if defined( KOKKOS_ENABLE_THREADS ) //If only serial is enabled (or no other device was specified), run with serial if (params.use_threads) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3a6ea1cca5..6e4af2c7b3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -50,6 +50,7 @@ #include #include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH # include @@ -63,16 +64,6 @@ #ifdef __CUDACC__ # include #endif -// -// mfh 24 Dec 2013: Temporary measure for testing; will go away. 
-// -#ifndef KOKKOS_FORCEINLINE_FUNCTION -# ifdef __CUDA_ARCH__ -# define KOKKOS_FORCEINLINE_FUNCTION inline __host__ __device__ -# else -# define KOKKOS_FORCEINLINE_FUNCTION -# endif // __CUDA_ARCH__ -#endif // KOKKOS_FORCEINLINE_FUNCTION namespace { // anonymous diff --git a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp index f4f682cb91..c7e7613769 100644 --- a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -111,7 +112,7 @@ namespace KokkosBatched { member.team_barrier(); /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// GPU case: team size is large and blocksize (mb,nb) is small InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); auto gemm = [&](const int ib, const int jb, @@ -128,13 +129,16 @@ namespace KokkosBatched { Kokkos::parallel_for (Kokkos::TeamThreadRange(member, mq*nq ), [&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%mq*mb, j = ij/mq*nb; -#else - const int i = ij/nq*mb, j = ij%nq*nb; -#endif + int i, j; + //note: the condition is constexpr + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%mq*mb; + j = ij/mq*nb; + } + else { + i = ij/nq*mb; + j = ij%nq*nb; + } inner.serial_invoke(alpha, AA+i*as0, BB+j*bs1, (i+mb) > ib ? 
mp : mb, diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index 085bd9e293..64d8368f16 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -114,7 +115,7 @@ namespace KokkosBatched { /// case host: team size is small and blocksize (mb,nb) is large /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); @@ -195,7 +196,6 @@ namespace KokkosBatched { const ScalarType alpha, const ValueType *__restrict__ A, const int as0, const int as1, /**/ ValueType *__restrict__ B, const int bs0, const int bs1) { - const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); @@ -223,13 +223,15 @@ namespace KokkosBatched { } Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,iend*jend),[&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%iend, j = ij/iend; -#else - const int i = ij/jend, j = ij%jend; -#endif + int i, j; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%iend; + j = ij/iend; + } + else { + i = ij/jend; + j = ij%jend; + } B0[i*bs0+j*bs1] -= a01[i*as0] * b1t[j*bs1]; }); } diff --git a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp index 618f8dc614..5bf26f0865 100644 --- a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp @@ -99,7 +99,7 @@ namespace KokkosBatched { if (alpha != one) 
SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp index 20ee624006..7d72f01e15 100644 --- a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp @@ -115,7 +115,7 @@ namespace KokkosBatched { if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2347c63e87..6d6fe4edbd 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -270,7 +270,7 @@ namespace KokkosBatched { // regieter blocking (not about team parallelism). // this mb should vary according to // - team policy (smaller) or range policy (bigger) - // - space (cuda vs host) + // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. 
#if defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION static constexpr diff --git a/src/batched/KokkosBatched_Vector.hpp b/src/batched/KokkosBatched_Vector.hpp index 8737d72850..28a537f885 100644 --- a/src/batched/KokkosBatched_Vector.hpp +++ b/src/batched/KokkosBatched_Vector.hpp @@ -104,6 +104,25 @@ namespace KokkosBatched { }; #endif +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; +#endif + template struct DefaultInternalVectorLength { enum : int { value = 1 }; @@ -147,6 +166,25 @@ namespace KokkosBatched { enum : int { value = 1 }; }; #endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultInternalVectorLength { + enum : int { value = 8 }; + }; + template<> + struct DefaultInternalVectorLength { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 2 }; + }; +#endif template struct MagnitudeScalarType; diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index d59f0f9be4..e8fe83b7e2 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) namespace KokkosBatched { template<> @@ -143,7 +143,7 @@ namespace KokkosBatched { typedef float2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { 
return "CudaFloat2"; } + static const char* label() { return "GpuFloat2"; } template friend class Vector; @@ -224,7 +224,7 @@ namespace KokkosBatched { typedef double2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble2"; } + static const char* label() { return "GpuDouble2"; } template friend class Vector; @@ -305,7 +305,7 @@ namespace KokkosBatched { typedef float4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat4"; } + static const char* label() { return "GpuFloat4"; } template friend class Vector; @@ -400,7 +400,7 @@ namespace KokkosBatched { typedef double4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble4"; } + static const char* label() { return "GpuDouble4"; } template friend class Vector; diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 95ab97d882..43ddbb101b 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; 
} -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 74d15af1c3..db5bc9fbca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -139,8 +139,8 @@ struct SingleLevelNontransposeGEMV { // matrix A and the input vector x. The output vector y is the // reduction result. // -// WARNING: NOT RECOMMENDED FOR CUDA. Reduction result may have -// arbitrary length. This is bad on CUDA because the CUDA +// WARNING: NOT RECOMMENDED FOR GPU. Reduction result may have +// arbitrary length. This is bad on GPU because the GPU // implementation of Kokkos::parallel_reduce may use shared memory for // intermediate results. template { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template +struct impl_gemm_choose_copy_layout { + typedef LayoutA type; +}; +#endif + // DeepCopy matrix block into scratch template struct impl_deep_copy_matrix_block; diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index 877d73c5fa..2a63c3736f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -157,6 +157,10 @@ struct GEMM { if(std::is_same::value) team_size = blockA0; #endif + #if defined(KOKKOS_ENABLE_HIP) + if(std::is_same::value) + team_size = blockA0; + #endif #if defined(KOKKOS_ENABLE_ROCM) if(std::is_same::value) team_size = blockA0; diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index b22d86a8bb..28b2a01389 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,6 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits +// Note BMK: HIP also defines 
__CUDA_ARCH__, and provides the same intrinsics. #if defined( __CUDA_ARCH__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index c0ae6ce5eb..22930c82e1 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,9 +53,9 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA}; +enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -inline ExecSpaceType kk_get_exec_space_type(){ +constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -81,6 +81,12 @@ inline ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + exec_space = Exec_HIP; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -90,6 +96,48 @@ inline ExecSpaceType kk_get_exec_space_type(){ } +template +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + auto exec = kk_get_exec_space_type(); + //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces + return exec == Exec_CUDA || exec == Exec_HIP; +} + +//Host function to determine free and total device memory. +//Will throw if execution space doesn't support this. 
+template +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + hipMemGetInfo(&free_mem, &total_mem); +} +#endif inline int kk_get_suggested_vector_size( const size_t nr, const size_t nnz, const ExecSpaceType exec_space){ @@ -103,7 +151,7 @@ inline int kk_get_suggested_vector_size( case Exec_QTHREADS: break; case Exec_CUDA: - + case Exec_HIP: if (nr > 0) suggested_vector_size_ = nnz / double (nr) + 0.5; if (suggested_vector_size_ < 3){ @@ -119,7 +167,14 @@ inline int kk_get_suggested_vector_size( suggested_vector_size_ = 16; } else { - suggested_vector_size_ = 32; + if(exec_space == Exec_CUDA || suggested_vector_size_ <= 48) { + //use full CUDA warp, or half a HIP wavefront + suggested_vector_size_ = 32; + } + else { + //use full HIP wavefront + suggested_vector_size_ = 64; + } } break; } @@ -129,7 +184,9 @@ inline int kk_get_suggested_vector_size( inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space){ - if (exec_space == Exec_CUDA){ + if (exec_space == Exec_CUDA || exec_space == Exec_HIP) { + //TODO: where this is used, tune the target value for + //threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; } else { @@ -171,6 +228,25 @@ struct 
SpaceInstance { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct SpaceInstance { + static Kokkos::Experimental::HIP create() { + hipStream_t stream; + hipStreamCreate(&stream); + return Kokkos::Experimental::HIP(stream); + } + static void destroy(Kokkos::Experimental::HIP& space) { + hipStream_t stream = space.hip_stream(); + hipStreamDestroy(stream); + } + static bool overlap() { + //TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? + return true; + } +}; +#endif + } } diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 9d43ba670c..2e335d4f04 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -371,7 +371,7 @@ class KokkosKernelsHandle return this->team_work_size; } else { - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space == KokkosKernels::Impl::Exec_CUDA || my_exec_space == KokkosKernels::Impl::Exec_HIP) { return team_size; } else { diff --git a/src/common/KokkosKernels_Macros.hpp b/src/common/KokkosKernels_Macros.hpp index 84de9048c9..ced946fe4f 100644 --- a/src/common/KokkosKernels_Macros.hpp +++ b/src/common/KokkosKernels_Macros.hpp @@ -46,10 +46,10 @@ #define _KOKKOSKERNELS_MACROUTILS_HPP_ // If KOKKOSKERNELS_ENABLE_OMP_SIMD is defined, it's legal to place -// "#pragma omp simd" before a for loop. It's never defined if CUDA is enabled, +// "#pragma omp simd" before a for loop. It's never defined if a GPU-type device is enabled, // since in that case, Kokkos::ThreadVectorRange should be used instead for SIMD parallel loops. 
-#if !defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) #if defined(KOKKOS_COMPILER_GNU) // GCC 4.8.5 and older do not support #pragma omp simd #if (KOKKOS_COMPILER_GNU > 485 ) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 2547c2e1b9..7628e6de31 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1041,12 +1041,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1094,12 +1089,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsGraphFunctor funct(useRadix, rowmap, entries); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; @@ -1353,74 +1343,45 @@ void kk_sort_graph( out_scalar_view_t out_vals){ ExecSpaceType exec = kk_get_exec_space_type(); - if (exec == Exec_CUDA){ - typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); + // If possible, sort on host and avoid a deep copy + // TODO BMK: can this function be deprecated? + typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); + Kokkos::deep_copy (hr, in_xadj); + typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); + Kokkos::deep_copy (he, in_adj); + typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); + Kokkos::deep_copy (hv, in_vals); + MyExecSpace().fence(); + typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); + typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; + typedef typename lno_view_t::non_const_value_type size_type; + typedef typename lno_nnz_view_t::non_const_value_type lno_t; + typedef typename scalar_view_t::non_const_value_type scalar_t; - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); + lno_t nrows = in_xadj.extent(0) - 1; + std::vector > edges(in_adj.extent(0)); - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - 
edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; + size_type row_size = 0; + for (lno_t i = 0; i < nrows; ++i){ + for (size_type j = hr(i); j < hr(i + 1); ++j){ + edges[row_size].src = i; + edges[row_size].dst = he(j); + edges[row_size++].ew = hv(j); } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); } - else { - - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = in_xadj(i); j < in_xadj(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = in_adj(j); - edges[row_size++].ew = in_vals(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - out_adj(i) = edges[i].dst; - out_vals(i) = edges[i].ew; - } - + std::sort (edges.begin(), edges.begin() + row_size); + size_type ne = in_adj.extent(0); + for(size_type i = 0; i < ne; ++i){ + heo(i) = edges[i].dst; + hvo(i) = edges[i].ew; + } - } + Kokkos::deep_copy (out_adj, heo); + Kokkos::deep_copy (out_vals, hvo); + MyExecSpace().fence(); } /* @@ -1714,47 +1675,46 @@ struct LowerTriangularMatrix{ const size_type write_end = t_xadj[row_index + 1]; const lno_t write_left_work = write_end - write_begin; - switch (exec_space){ - case Exec_CUDA: - //TODO: Write cuda version here. 
- /* + //TODO: Write GPU (vector-level) version here: + /* + if(kk_is_gpu_exec_space()) + { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), [&] (lno_t i) { const size_type adjind = i + col_begin; const lno_t colIndex = adj[adjind]; - }); - */ + } + else + ... + */ - default: - for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ - const size_type adjind = r + col_begin; - const lno_t colIndex = adj[adjind]; - lno_t colperm = colIndex; - if (permutation != NULL){ - colperm = permutation[colIndex]; - } - if (is_lower){ - if (row_perm > colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ + const size_type adjind = r + col_begin; + const lno_t colIndex = adj[adjind]; + lno_t colperm = colIndex; + if (permutation != NULL){ + colperm = permutation[colIndex]; + } + if (is_lower){ + if (row_perm > colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } - else { - if (row_perm < colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + } + else { + if (row_perm < colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } + } - } - break; } }); } @@ -2340,7 +2300,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( bool use_dynamic_scheduling = false, bool chunksize = 4){ -#ifndef KOKKOS_ENABLE_CUDA //typedef typename row_map_view_t::const_type const_row_map_view_t; //typedef typename cols_view_t::const_type const_cols_view_t; @@ -2381,7 +2340,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( } }); -#endif } template void get_suggested_vector_size( int &suggested_vector_size_, - idx nr, idx nnz){ - - suggested_vector_size_ = 1; - -#if defined( 
KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - + idx nr, idx nnz) { + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } //Get the best team size for the given functor. @@ -152,34 +103,28 @@ void get_suggested_vector_size( template int get_suggested_team_size(Functor& f, int vector_size) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp = team_policy_t(1, 1, vector_size). 
set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template nnz_lno_persistent_work_view_t; typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - typedef Kokkos::TeamPolicy team_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; typedef typename Kokkos::View non_const_1d_size_type_view_t; @@ -229,54 +229,17 @@ class GraphColoringHandle } - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. + /** \brief Chooses best algorithm based on the execution space. COLORING_SERIAL if serial, otherwise COLORING_VBBIT. + * VBBIT is the fastest parallel algorithm (unless on GPU and the graph's maximum degree is very large, but + * we don't have information about the graph here) */ void choose_default_algorithm() { -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ + auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); + if(exec == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_SERIAL; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if 
(std::is_same::value){ - this->coloring_algorithm_type = COLORING_EB; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif + else + this->coloring_algorithm_type = COLORING_VBBIT; } template @@ -463,7 +426,7 @@ class GraphColoringHandle row_index_view_type xadj, nonzero_view_type adj){ KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list - + ( nv, xadj, @@ -496,13 +459,8 @@ class GraphColoringHandle size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type new_num_edge = 0; - typedef Kokkos::RangePolicy my_exec_space; - - if ( false -#if defined( KOKKOS_ENABLE_CUDA ) - || std::is_same::value -#endif - ) + typedef Kokkos::RangePolicy my_exec_space; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -522,10 +480,10 @@ class GraphColoringHandle clt//, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - HandleExecSpace().fence(); + ExecutionSpace().fence(); auto lower_total_count = Kokkos::subview(lower_count, nv); auto hlower = Kokkos::create_mirror_view (lower_total_count); Kokkos::deep_copy (hlower, lower_total_count); @@ -551,7 +509,7 @@ class GraphColoringHandle //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); nnz_lno_persistent_work_view_t half_src (Kokkos::ViewAllocateWithoutInitializing("HALF SRC"),new_num_edge); nnz_lno_persistent_work_view_t half_dst 
(Kokkos::ViewAllocateWithoutInitializing("HALF DST"),new_num_edge); diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index f4624f545b..4c392051fb 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -198,71 +198,17 @@ class GraphColorDistance2Handle * Chooses best algorithm based on the execution space. * * This chooses the best algorithm based on the execution space: - * - COLORING_D2_SERIAL if the execution space is SERIAL - * - COLORING_D2_NB_BIT otherwise + * - COLORING_D2_SERIAL if the execution space is SERIAL (more work efficient than NB_BIT) + * - COLORING_D2_NB_BIT otherwise (fastest parallel algorithm) * */ void choose_default_algorithm() { - bool found = false; -#if defined(KOKKOS_ENABLE_SERIAL) - if(std::is_same::value) - { + if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_D2_SERIAL; - found = true; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_QTHREAD) - 
if(std::is_same::value) - { + else this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - //Since this logic is based on checking every exec space, detect when a new one needs to be supported - if(!found) - throw std::logic_error("D2 coloring: default algorithm hasn't been chosen for the current execution space"); } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 0a5493df7d..866ad54daf 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = (execSpaceEnum == KokkosKernels::Impl::Exec_CUDA) && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index 938d6e91be..c618d3add6 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -104,6 +104,12 @@ inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif +#ifdef KOKKOS_ENABLE_HIP +template<> +inline int RowsPerThread(const int NNZPerRow) { + return 1; +} +#endif // A simple struct for storing a kernel launch configuration. 
// This is currently used by CrsMatrix to allow the user to have some control diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 2def3a17f1..fd4a9b58d9 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -274,53 +274,11 @@ namespace KokkosSparse{ void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. - */ void choose_default_algorithm(){ -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) this->algorithm_type = GS_TEAM; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: GS_TEAM" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif } ~PointGaussSeidelHandle() = 
default; @@ -559,33 +517,7 @@ namespace KokkosSparse{ bool use_teams() const { - bool return_value = false; -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value) { - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - return_value = true; - } -#endif -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - return_value = false; - } -#endif - return return_value; + return KokkosKernels::Impl::kk_is_gpu_exec_space(); } ~ClusterGaussSeidelHandle() = default; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 820afbbaa3..9ed66ce2ad 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -202,67 +202,6 @@ struct UnmergedSumFunctor { CcolindsT ABperm; }; -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - CcolindsAux("C colind aux", Ccolinds_.extent(0)), - ABperm(ABperm_), - ABpermAux("AB perm aux", ABperm_.extent(0)) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - ordinal_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - using lno_t = typename CcolindsT::non_const_value_type; - using unsigned_lno_t 
= typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*)Ccolinds.data() + rowStart, - (unsigned_lno_t*)CcolindsAux.data() + rowStart, - ABperm.data() + rowStart, ABpermAux.data() + rowStart, rowNum); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT CcolindsAux; - CcolindsT ABperm; - CcolindsT ABpermAux; -}; - -#ifdef KOKKOS_ENABLE_CUDA -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, CcolindsT& Ccolinds_, - CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), Ccolinds(Ccolinds_), ABperm(ABperm_) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - size_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2< - size_type, typename CcolindsT::non_const_value_type, - typename CcolindsT::non_const_value_type, TeamMember>( - Ccolinds.data() + rowStart, ABperm.data() + rowStart, rowNum, t); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; -#endif - template struct MergeEntriesFunctor { diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index b34d349457..f517682d5e 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -504,8 +504,6 @@ class SPGEMMHandle{ return this->cuSPARSEHandle; } #endif - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -543,6 +541,15 @@ class SPGEMMHandle{ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + this->algorithm_type = SPGEMM_KK; +#ifdef VERBOSE + std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" << std::endl; +#endif + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ this->algorithm_type = SPGEMM_SERIAL; @@ -604,67 +611,20 @@ class SPGEMMHandle{ //suggested_vector_size_=this->suggested_vector_size = 1; //return; if (this->suggested_team_size && this->suggested_vector_size) { + //already set in the handle suggested_vector_size_ = this->suggested_vector_size; suggested_team_size_ = this->suggested_team_size; return; } -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - this->suggested_vector_size = nnz / double (nr) + 0.5; - - if (this->suggested_vector_size <= 3){ - this->suggested_vector_size = 2; - } - else if (this->suggested_vector_size <= 6){ - this->suggested_vector_size = 4; - } - else if (this->suggested_vector_size <= 12){ - this->suggested_vector_size = 8; - 
} - else if (this->suggested_vector_size <= 24){ - this->suggested_vector_size = 16; - } - else { - this->suggested_vector_size = 32; - } - - suggested_vector_size_ = this->suggested_vector_size; - this->suggested_team_size= suggested_team_size_ = max_allowed_team_size / this->suggested_vector_size; - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - + //otherwise, recompute team_size/vector_size based on heuristic and save them in the handle + suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size(nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; + else + suggested_team_size = max_allowed_team_size; + this->suggested_vector_size = suggested_vector_size_; + this->suggested_team_size = suggested_vector_size_; } void set_compression_steps(bool isCompressionSingleStep){ diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 03eef00e4d..d956ed8d4d 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -554,9 +554,8 @@ namespace KokkosSparse{ } }); -#if !defined(__CUDA_ARCH__) #if KOKKOSSPARSE_IMPL_PRINTDEBUG - if (/*i == 0 && ii == 1*/ ii == 0 || (block_size == 1 && ii < 2) ){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space() && (ii == 0 || (block_size == 1 && ii < 2))){ std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; for (nnz_lno_t z = 0; z < block_row_size; ++z){ @@ -573,7 +572,6 @@ namespace KokkosSparse{ std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i, vec) << 
std::endl << std::endl<< std::endl; } -#endif #endif //row_begin += row_size * block_size; } @@ -737,31 +735,16 @@ namespace KokkosSparse{ timer.reset(); #endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - for (nnz_lno_t i = 0; i < numColors; ++i){ - nnz_lno_t color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - - if (color_index_begin + 1 >= color_index_end ) continue; - auto colorsubset = - subview(color_adj, Kokkos::pair (color_index_begin, color_index_end)); - MyExecSpace().fence(); - Kokkos::sort (colorsubset); - //TODO: MD 08/2017: If I remove the below fence, code fails on cuda. - //I do not see any reason yet it to fail. - MyExecSpace().fence(); - } - } -#endif - - MyExecSpace().fence(); + // TODO BMK: Why are the vertices in each color set only being sorted on GPU? + // Wouldn't it have a locality benefit on CPU too? + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + KokkosKernels::Impl::sort_crs_graph(color_xadj, color_adj); + MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - std::cout << "SORT_TIME:" << timer.seconds() << std::endl; - timer.reset(); - //std::cout << "sort" << std::endl; + std::cout << "SORT_TIME:" << timer.seconds() << std::endl; + timer.reset(); #endif + } row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1); nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows ); @@ -844,7 +827,7 @@ namespace KokkosSparse{ nnz_lno_t num_big_rows = 0; KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); - if (ex_sp != KokkosKernels::Impl::Exec_CUDA){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. 
size_t l1mem = 1; while(l1mem < level_1_mem){ @@ -882,12 +865,11 @@ namespace KokkosSparse{ num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size)); //std::cout << "num_big_rows:" << num_big_rows << std::endl; -#if defined( KOKKOS_ENABLE_CUDA ) - if (ex_sp == KokkosKernels::Impl::Exec_CUDA) { + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //check if we have enough memory for this. lower the concurrency if we do not have enugh memory. size_t free_byte ; size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte){ num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem; @@ -900,7 +882,6 @@ namespace KokkosSparse{ num_big_rows = min_chunk_size; } } -#endif } } @@ -1165,7 +1146,7 @@ namespace KokkosSparse{ // change fill_matrix_numeric so that they store the internal matrix as above. // the rest will wok fine. 
- if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), fill_matrix_numeric( @@ -1209,7 +1190,7 @@ namespace KokkosSparse{ block_size, block_matrix_size); - if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA || block_size > 1){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1){ Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd ); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index 0ef887d80e..af10787c46 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -74,535 +74,6 @@ struct IotaFunctor View v; }; -template -struct RCM -{ - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef typename HandleType::size_type size_type; - typedef typename HandleType::nnz_lno_t nnz_lno_t; - - typedef typename lno_row_view_t::const_type const_lno_row_view_t; - typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; - typedef typename non_const_lno_row_view_t::value_type offset_t; - - typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; - typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; - typedef typename 
HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - - typedef nnz_lno_persistent_work_view_t nnz_view_t; - typedef Kokkos::View> single_view_t; - typedef Kokkos::View> single_view_host_t; - - typedef Kokkos::RangePolicy my_exec_space; - - typedef Kokkos::Device device_t; - - typedef Kokkos::RangePolicy range_policy_t ; - typedef Kokkos::TeamPolicy team_policy_t ; - typedef typename team_policy_t::member_type team_member_t ; - - typedef nnz_lno_t LO; - - RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) - : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) - {} - - nnz_lno_t numRows; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - - //radix sort keys according to their corresponding values ascending. - //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting - template - KOKKOS_INLINE_FUNCTION static void - radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) - { - if(n <= 1) - return; - //sort 4 bits at a time - KeyType mask = 0xF; - bool inAux = false; - //maskPos counts the low bit index of mask (0, 4, 8, ...) 
- IndexType maskPos = 0; - IndexType sortBits = 0; - KeyType minKey = Kokkos::ArithTraits::max(); - KeyType maxKey = 0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lminkey) - { - if(keys[i] < lminkey) - lminkey = keys[i]; - }, Kokkos::Min(minKey)); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lmaxkey) - { - if(keys[i] > lmaxkey) - lmaxkey = keys[i]; - }, Kokkos::Max(maxKey)); - //apply a bias so that key range always starts at 0 - //also invert key values here for a descending sort - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - keys[i] -= minKey; - }); - KeyType upperBound = maxKey - minKey; - while(upperBound) - { - upperBound >>= 1; - sortBits++; - } - for(IndexType s = 0; s < (sortBits + 3) / 4; s++) - { - //Count the number of elements in each bucket - IndexType count[16] = {0}; - IndexType offset[17]; - if(!inAux) - { - for(IndexType i = 0; i < n; i++) - { - count[(keys[i] & mask) >> maskPos]++; - } - } - else - { - for(IndexType i = 0; i < n; i++) - { - count[(keysAux[i] & mask) >> maskPos]++; - } - } - offset[0] = 0; - //get offset as the prefix sum for count - for(IndexType i = 0; i < 16; i++) - { - offset[i + 1] = offset[i] + count[i]; - } - //now for each element in [lo, hi), move it to its offset in the other buffer - //this branch should be ok because whichBuf is the same on all threads - if(!inAux) - { - //copy from *Over to *Aux - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keys[i] & mask) >> maskPos; - keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; - valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; - count[bucket]--; - } - } - else - { - //copy from *Aux to *Over - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keysAux[i] & mask) >> maskPos; - keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; - values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; - count[bucket]--; - } - } 
- inAux = !inAux; - mask = mask << 4; - maskPos += 4; - } - //move keys/values back from aux if they are currently in aux, - //and remove bias - if(inAux) - { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - //TODO: when everything works, is safe to remove next line - //since keys (BFS visit scores) will never be needed again - keys[i] = keysAux[i]; - values[i] = valuesAux[i]; - }); - } - } - - //Functor that does breadth-first search on a sparse graph. - struct BfsFunctor - { - typedef Kokkos::View> WorkView; - - BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) - : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - const nnz_lno_t QUEUED = NOT_VISITED - 1; - int nthreads = mem.team_size(); - nnz_lno_t tid = mem.team_rank(); - auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); - //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively - //active, next and visitCounter are thread-local, but always kept consistent across threads - int active = 0; - int next = 1; - nnz_lno_t visitCounter = 0; - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - workQueue(active, 0) = start; - visit(start) = QUEUED; - }); - nnz_lno_t activeQSize = 1; - nnz_lno_t nextQSize = 0; - //KK create_reverse_map() expects incoming values to start at 1 - nnz_lno_t level = 1; - //do this until all nodes have been visited and added to a level - 
while(visitCounter < numRows) - { - mem.team_barrier(); - //each thread works on a contiguous block of nodes in queue (for locality) - //compute in size_t to avoid possible 32-bit overflow - nnz_lno_t workStart = tid * activeQSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; - //the maximum work batch size (among all threads) - //the following loop contains barriers so all threads must iterate same # of times - nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; - for(nnz_lno_t loop = 0; loop < maxBatch; loop++) - { - //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) - bool busy = loop < workEnd - workStart; - nnz_lno_t neiCount = 0; - nnz_lno_t process = LNO_MAX; - if(busy) - { - process = workQueue(active, workStart + loop); - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - //build a list of all non-visited neighbors - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t col = colinds(j); - //use atomic here to guarantee neighbors are added to neighborList exactly once - if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) - { - //this thread is the first to see that col needs to be queued - neighborList(neiCount) = col; - neiCount++; - } - } - } - threadNeighborCounts(tid) = neiCount; - mem.team_barrier(); - size_type queueUpdateOffset = 0; - for(nnz_lno_t i = 0; i < tid; i++) - { - queueUpdateOffset += threadNeighborCounts(i); - } - //write out all updates to next queue in parallel - if(busy) - { - nnz_lno_t nextQueueIter = 0; - for(nnz_lno_t i = 0; i < neiCount; i++) - { - nnz_lno_t toQueue = neighborList(i); - visit(toQueue) = QUEUED; - workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; - nextQueueIter++; - } - //assign level to to process - visit(process) = level; - } - nnz_lno_t totalAdded = 0; - for(nnz_lno_t i = 0; i < nthreads; i++) - { - totalAdded += threadNeighborCounts(i); - 
} - nextQSize += totalAdded; - mem.team_barrier(); - } - //swap queue buffers - active = next; - next = 1 - next; - //all threads have a consistent value of qSize here. - //update visitCounter in preparation for next frontier - visitCounter += activeQSize; - activeQSize = nextQSize; - nextQSize = 0; - if(visitCounter < numRows && activeQSize == 0) - { - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - //Some nodes are unreachable from start (graph not connected) - //Find an unvisited node to resume BFS - for(nnz_lno_t search = numRows - 1; search >= 0; search--) - { - if(visit(search) == NOT_VISITED) - { - workQueue(active, 0) = search; - visit(search) = QUEUED; - break; - } - } - }); - activeQSize = 1; - } - level++; - } - Kokkos::single(Kokkos::PerTeam(mem), - [&] - { - numLevels() = level - 1; - }); - } - - WorkView workQueue; - WorkView scratch; - nnz_view_t visit; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - single_view_t numLevels; - nnz_view_t threadNeighborCounts; - nnz_lno_t start; - nnz_lno_t numRows; - }; - - //Parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins. - //Returns the total number of levels, and sets xadj, adj and maxDeg. 
- nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) - { - //need to know maximum degree to allocate scratch space for threads - maxDeg = KokkosKernels::Impl::graph_max_degree(rowmap); - //view for storing the visit timestamps - nnz_view_t visit("BFS visited nodes", numRows); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - KokkosBlas::fill(visit, NOT_VISITED); - //the visit queue - //one of q1,q2 is active at a time and holds the nodes to process in next BFS level - //elements which are LNO_MAX are just placeholders (nothing to process) - Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); - nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); - single_view_t numLevels("# of BFS levels"); - single_view_host_t numLevelsHost("# of BFS levels"); - Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); - Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); - Kokkos::deep_copy(numLevelsHost, numLevels); - //now that level structure has been computed, construct xadj/adj - KokkosKernels::Impl::create_reverse_map - (numRows, numLevelsHost(), visit, xadj, adj); - return numLevelsHost(); - } - - struct CuthillMcKeeFunctor - { - typedef Kokkos::View> ScoreView; - - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) - : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - { - numRows = rowmap.extent(0) - 1; - } - - 
KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - int tid = mem.team_rank(); - int nthreads = mem.team_size(); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - nnz_lno_t visitCounter = 0; - for(nnz_lno_t level = 0; level < numLevels; level++) - { - //iterate over vertices in this level and compute - //min predecessors (minimum-labeled vertices from previous level) - nnz_lno_t levelOffset = xadj(level); - nnz_lno_t levelSize = xadj(level + 1) - levelOffset; - //compute as offset_t to avoid overflow, but the upper bound on - //the scores is approx. numRows * maxDegree, which should be representable - nnz_lno_t workStart = tid * levelSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - nnz_lno_t minNeighbor = LNO_MAX; - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t neighbor = colinds(j); - if(neighbor < numRows) - { - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; - } - } - scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); - } - mem.team_barrier(); - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - radixSortKeysAndValues - (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); - }); - mem.team_barrier(); - //label all vertices (which are now in label order within their level) - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - //visit counter increases with levels, so flip the range for the "reverse" in RCM - visit(process) = visitCounter + i; - } - visitCounter += levelSize; - } - } - - nnz_lno_t numRows; - nnz_lno_t numLevels; - nnz_lno_t maxDegree; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - ScoreView scores; 
- ScoreView scoresAux; - nnz_view_t visit; - //The levels, stored in CRS format. - //xadj stores offsets for each level, and adj stores the rows in each level. - nnz_view_t xadj; - nnz_view_t adj; - nnz_view_t adjAux; - }; - - //Does the reversing in "reverse Cuthill-McKee") - struct OrderReverseFunctor - { - OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) - : visit(visit_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - visit(i) = numRows - visit(i) - 1; - } - nnz_view_t visit; - nnz_lno_t numRows; - }; - - //breadth-first search, producing a reverse Cuthill-McKee ordering - nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) - { - size_type nthreads = MyExecSpace::concurrency(); - if(nthreads > 64) - nthreads = 64; - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - { - nthreads = 256; - } - #endif - nnz_view_t xadj, adj; - nnz_lno_t maxDegree = 0; - //parallel_bfs will compute maxDegree - auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - //xadj determines where each level set starts and begins, - //so its max 'degree' gives the size of the largest level - nnz_lno_t maxLevelSize = KokkosKernels::Impl::graph_max_degree(xadj); - std::cout << "Maximum size of a level set: " << maxLevelSize << '\n'; - //visit (to be returned) contains the RCM numberings of each row - nnz_view_t visit("RCM labels", numRows); - //Populate visit wth LNO_MAX so that the "min-labeled neighbor" - //is always a node in the previous level - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - KokkosBlas::fill(visit, LNO_MAX); - //the "score" of a node is a single value that provides an ordering equivalent - //to sorting by min predecessor and then by min degree - //reduce nthreads to be a power of 2 - Kokkos::View> scores("RCM scores for sorting", maxLevelSize); - Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - nnz_view_t adjAux("RCM scores for sorting (radix 
sort aux)", maxLevelSize); - Kokkos::parallel_for(team_policy_t(1, nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(visit, numRows)); - return visit; - } - - template - struct MinDegreeRowFunctor - { - typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const - { - size_type ideg = rowmap(i + 1) - rowmap(i); - if(ideg < lval.val) - { - lval.val = ideg; - lval.loc = i; - } - } - const_lno_row_view_t rowmap; - }; - - //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) - struct OrderToClusterFunctor - { - OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) - : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - vertClusters(i) = ordering(i) / clusterSize; - } - - const nnz_view_t ordering; - nnz_view_t vertClusters; - nnz_lno_t clusterSize; - }; - - //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS - nnz_lno_t find_peripheral() - { - typedef Kokkos::MinLoc MinLocReducer; - typedef typename MinLocReducer::value_type MinLocVal; - MinLocVal v; - Kokkos::parallel_reduce(range_policy_t(0, numRows), - MinDegreeRowFunctor(rowmap), MinLocReducer(v)); - return v.loc; - } - - nnz_view_t cuthill_mckee() - { - nnz_lno_t periph = find_peripheral(); - //run Cuthill-McKee BFS from periph - auto ordering = parallel_cuthill_mckee(periph); - return ordering; - } - - nnz_view_t rcm() - { - nnz_view_t cm = cuthill_mckee(); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), 
OrderReverseFunctor(cm, numRows)); - return cm; - } - - nnz_view_t cm_cluster(nnz_lno_t clusterSize) - { - nnz_view_t cm = cuthill_mckee(); - nnz_view_t vertClusters("Vert to cluster", numRows); - OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); - Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); - return vertClusters; - } -}; - template struct BalloonClustering { diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6d240d11b3..c881c98ed4 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } @@ -761,6 +765,7 @@ bool KokkosSPGEMM { //get the execution space type. KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. @@ -791,7 +796,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -804,8 +809,9 @@ bool KokkosSPGEMM } //if compressing in single step, allocate the memory as upperbound. - //TODO: two step is not there for cuda. 
- if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + //TODO: two step is not there for GPU. + + if (compress_in_single_step || exec_gpu) { out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -834,7 +840,8 @@ bool KokkosSPGEMM timer1.reset(); //bool compression_applied = false; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -856,27 +863,23 @@ bool KokkosSPGEMM size_t num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks*sizeof(int) > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + size_t 
min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } + } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index aa73c1e55b..4924e11b0c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -124,10 +124,9 @@ void KokkosSPGEMM KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); - //compress in single step if it is cuda execution space. - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { + //compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; - } //compressed B fields. row_lno_temp_work_view_t new_row_mapB(Kokkos::ViewAllocateWithoutInitializing("new row map"), n+1); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 3f29c39e4e..38fce91b1b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -234,6 +234,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1244,7 +1248,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //then chose the best method and parameters. 
size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1374,7 +1378,7 @@ void //required memory for L2 - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1419,12 +1423,9 @@ void } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1439,7 +1440,6 @@ void num_chunks = min_chunk_size; } } -#endif // END SIZE CALCULATIONS FOR MEMORYPOOL @@ -1455,7 +1455,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1505,7 +1505,7 @@ void } timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; @@ -1617,7 +1617,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + 
if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1667,7 +1667,7 @@ void } timer1.reset(); - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 415bd1ed3a..e3a4f492a6 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -143,6 +143,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -481,7 +485,7 @@ struct KokkosSPGEMM // // Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp // -// if Cuda enabled : +// if GPU: // "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. 
GPUTag // // else : @@ -519,7 +523,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 9f4f7ec753..29dbb5c477 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -210,6 +210,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -785,6 +789,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1493,13 +1501,14 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; @@ -1511,7 +1520,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1522,7 +1531,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu){ //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1635,31 +1644,28 @@ void KokkosSPGEMM //initizalize value for the mem pool nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte 
<< " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1705,8 +1711,8 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); + if (exec_gpu) { + Kokkos::parallel_for("StructureC_NC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { if (current_spgemm_algorithm == SPGEMM_KK_DENSE){ @@ -1791,8 +1797,9 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1800,7 +1807,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1810,7 +1817,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is 
less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1821,7 +1828,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1951,7 +1958,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && !exec_gpu) { nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1966,16 +1973,14 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1990,7 +1995,6 @@ void KokkosSPGEMM num_chunks = min_chunk_size; } } 
-#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -2035,7 +2039,7 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2584,6 +2588,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index d8997fcc12..27c0f4c7d9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1322,17 +1326,17 @@ void KokkosSPGEMM ){ bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; - int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -1414,29 +1418,27 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + if(exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool 
Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << @@ -1486,8 +1488,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1682,6 +1683,7 @@ void KokkosSPGEMM b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>:: KokkosSPGEMM_symbolic_triangle_setup(){ + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); @@ -1733,7 +1735,7 @@ void KokkosSPGEMM } size_type bnnz = set_index_entries.extent(0); - if (this->MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kkp_reduce_diff_view (this->b_row_cnt, p_rowmapB_begins, p_rowmapB_ends, bnnz); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index e59b95e8ac..ae913f864a 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -215,6 +215,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -892,12 +896,13 @@ void KokkosSPGEMM const int num_left_side_nnz_per_row = 2; const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); typedef 
KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -960,29 +965,24 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - } - - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / 
accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << @@ -1032,9 +1032,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index a32d6689b9..2e12457822 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -219,6 +219,10 @@ namespace KokkosSparse{ #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1181,6 +1185,8 @@ namespace KokkosSparse{ dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + using pool_memory_space = KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t>; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPARSE ACC MODE" << std::endl; } @@ -1238,7 +1244,7 @@ namespace KokkosSparse{ // Choose the SpGEMM algorithm and corresponding parameters if (this->spgemm_algorithm == SPGEMM_KK || this->spgemm_algorithm == SPGEMM_KK_LP){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = 
original_overall_flops / this->a_row_cnt; @@ -1310,7 +1316,7 @@ namespace KokkosSparse{ } } } - // If CUDA is not enabled, we decide whether we want to use a sparse or a dense acumulator + // If non-GPU, we decide whether we want to use a sparse or a dense acumulator else { bool run_dense = false; @@ -1364,7 +1370,7 @@ namespace KokkosSparse{ // Compute the memory pool size - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; } @@ -1397,11 +1403,9 @@ namespace KokkosSparse{ } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1414,7 +1418,6 @@ namespace KokkosSparse{ } num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz @@ -1428,11 +1431,10 @@ namespace KokkosSparse{ // Allocate the memory pool KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; Kokkos::Impl::Timer timer; pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); MyExecSpace().fence(); @@ -1470,7 +1472,7 @@ namespace KokkosSparse{ } timer.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if 
(algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_jacobi_sparseacc SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index b14f781320..3389577497 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -48,6 +48,7 @@ #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" @@ -113,37 +114,30 @@ struct SPMV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; - -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? 
ATV::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -234,11 +228,9 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && nnz > 5000000 ) { rows_per_thread = 256; @@ -247,14 +239,12 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 256/vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -469,12 +459,14 @@ struct SPMV_MV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -482,15 +474,8 @@ struct SPMV_MV_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -514,8 +499,8 @@ struct SPMV_MV_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; @@ -527,7 +512,7 @@ template struct SPMV_MV_LayoutLeft_Functor { typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -542,21 +527,23 @@ struct SPMV_MV_LayoutLeft_Functor { //! The number of columns in the input and output MultiVectors. 
ordinal_type n; ordinal_type rows_per_thread; + int vector_length; SPMV_MV_LayoutLeft_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const ordinal_type rows_per_thread_, + int vector_length_) : alpha (alpha_), m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + rows_per_thread (rows_per_thread_), vector_length(vector_length_) {} template KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& /* dev */, const ordinal_type& iRow, const ordinal_type& kk) const + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const { y_value_type sum[UNROLL]; @@ -586,133 +573,80 @@ struct SPMV_MV_LayoutLeft_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT #pragma loop count (15) #endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { sum[k] += val * m_x(ind, kk + k); } - } + }); if (doalpha == -1) { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } } else { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) + sum[ii] = sumt; + else + sum[ii] = sumt * alpha; } } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * 
m_y(iRow, kk + k) + sum[k]; + }); } } KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& /* dev */, const ordinal_type& iRow) const + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - const auto row = m_A.rowConst (iRow); // The correct type of iEntry is ordinal_type, the type of the @@ -720,48 +654,17 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + lsum += val * m_x(row.colidx(iEntry),0); + }); + Kokkos::single(Kokkos::PerThread(dev), + [&]() { if (doalpha == -1) { sum = -sum; @@ -778,7 +681,7 @@ struct SPMV_MV_LayoutLeft_Functor { } else { m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } - } + }); } @@ -800,99 +703,17 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. 
ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE +//#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } + //BMK: HERE } - }; + } +}; template OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -957,7 +778,7 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread 
(NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1115,7 +936,91 @@ spmv_alpha_mv (const char mode[], } } -} -} +}} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index a9c62806fd..3575f87dca 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #include "Kokkos_InnerProductSpaceTraits.hpp" +#include 
"KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -91,12 +92,13 @@ struct SPMV_Struct_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } @@ -104,15 +106,8 @@ struct SPMV_Struct_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? 
ATV::conj (row.value(iEntry)) : @@ -120,8 +115,8 @@ struct SPMV_Struct_Transpose_Functor { const ordinal_type ind = row.colidx(iEntry); Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -302,7 +297,7 @@ struct SPMV_Struct_Functor { }); dev.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&] (const ordinal_type& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),[&] (const ordinal_type& loop) { const ordinal_type interiorIdx = static_cast ( dev.league_rank() ) * rows_per_team + loop; if(interiorIdx >= numInterior) { return; } @@ -665,11 +660,9 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && numInterior*nnz_per_row > 5000000 ) { rows_per_thread = 256; @@ -678,14 +671,12 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 128 / vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -903,27 +894,19 @@ struct SPMV_MV_Struct_Transpose_Functor { operator() (const team_member& dev) const { // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow represents a row of the matrix, so its correct type is - // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { + const ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } - const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -947,430 +930,251 @@ struct SPMV_MV_Struct_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; - template - struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::non_const_value_type A_value_type; - typedef typename YVector::non_const_value_type y_value_type; - typedef typename Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - typedef typename YVector::non_const_value_type coefficient_type; - - const coefficient_type alpha; - AMatrix m_A; - XVector m_x; - const coefficient_type beta; - YVector m_y; - //! The number of columns in the input and output MultiVectors. 
- ordinal_type n; - ordinal_type rows_per_thread; - - SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, - const AMatrix& m_A_, - const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) - {} - - template - KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const - { - y_value_type sum[UNROLL]; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero (); - } +template +struct SPMV_MV_Struct_LayoutLeft_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_value_type A_value_type; + typedef typename YVector::non_const_value_type y_value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef typename YVector::non_const_value_type coefficient_type; - const auto row = m_A.rowConst (iRow); + const coefficient_type alpha; + AMatrix m_A; + XVector m_x; + const coefficient_type beta; + YVector m_y; + //! The number of columns in the input and output MultiVectors. 
+ ordinal_type n; + ordinal_type rows_per_thread; + int vector_length; + + SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, + const AMatrix& m_A_, + const XVector& m_x_, + const coefficient_type& beta_, + const YVector& m_y_, + const ordinal_type rows_per_thread_, + int vector_length_) : + alpha (alpha_), + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), + rows_per_thread (rows_per_thread_), vector_length(vector_length_) + {} - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - const ordinal_type ind = row.colidx(iEntry); + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] += val * m_x(ind, kk + k); - } - } - - if (doalpha == -1) { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; - } + for (int k = 0; k < UNROLL; ++k) { + sum[k] += val * m_x(ind, kk + k); + } + }); + + if (doalpha == -1) { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type , y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } - else { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + } + else { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum += sum[ii]; + }, sumt); + if(doalpha == 1) sum[ii] = sumt; - } + else + sum[ii] = sumt * alpha; } - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma 
unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } - } } - KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + Kokkos::single(Kokkos::PerThread(dev), + [&]() { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - - const auto row = m_A.rowConst (iRow); + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) += sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + }); + } - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. 
+ KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha == -1) { - sum = -sum; - } else if (doalpha * doalpha != 1) { - sum *= alpha; - } + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + + Kokkos::single(Kokkos::PerThread(dev), + [&]() + { + if (doalpha == -1) { + sum = -sum; + } else if (doalpha * doalpha != 1) { + sum *= alpha; + } - if (dobeta == 0) { - m_y(iRow, 0) = sum ; - } else if (dobeta == 1) { - m_y(iRow, 0) += sum ; - } else if (dobeta == -1) { - m_y(iRow, 0) = -m_y(iRow, 0) + sum; - } else { - m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; - } - } - } + if (dobeta == 0) { + m_y(iRow, 0) = sum; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + }); + } - KOKKOS_INLINE_FUNCTION void - operator() (const team_member& dev) const - { - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + KOKKOS_INLINE_FUNCTION void + operator() (const team_member& dev) const + { + for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow indexes over (local) rows of the matrix, so its correct - // type is ordinal_type. + // iRow indexes over (local) rows of the matrix, so its correct + // type is ordinal_type. - const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) - * rows_per_thread + loop; - if (iRow >= m_A.numRows ()) { - return; - } + const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) + * rows_per_thread + loop; + if (iRow >= m_A.numRows ()) { + return; + } - // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it - // needs to have the same type as n. - ordinal_type kk = 0; + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
+ ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE - for (; kk + 4 <= n; kk += 4) { - strip_mine<4>(dev, iRow, kk); - } - for( ; kk < n; ++kk) { - strip_mine<1>(dev, iRow, kk); - } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } +//#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } + //BMK: HERE + } + } +}; - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } - } - }; - - - template - static void - spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + 
spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } - if (doalpha == 0) { - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } - return; + if (A.numRows () <= static_cast (0)) { + return; + } + if (doalpha == 0) { + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); } - else { - typedef typename AMatrix::size_type size_type; + return; + } + else { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. 
+ const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1382,16 +1186,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. 
+ const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1403,55 +1225,73 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. 
+ const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } + if (A.numRows () <= static_cast (0)) { + return; + } - // We need to scale y first ("scaling" by zero just means filling - // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. 
+ if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } - if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; + if (doalpha != 0) { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_Transpose_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1463,16 +1303,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor OpType; + typedef SPMV_MV_Struct_Transpose_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1484,73 +1342,176 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - 
} - else { - Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); - } + template + static void + spmv_alpha_beta_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + if (mode[0] == NoTranspose[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } - - template - void - spmv_alpha_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; - - if (beta == KAT::zero ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } + else if (mode[0] == Conjugate[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } + else if (mode[0] == Transpose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == ConjugateTranspose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else { + Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); + } + } + template + void + spmv_alpha_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename YVector::non_const_value_type coefficient_type; + typedef Kokkos::Details::ArithTraits KAT; + if (beta == KAT::zero ()) { + spmv_alpha_beta_mv_struct 
(mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + } } } #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a9ffcd282a..271d8b2396 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2464,6 +2464,23 @@ struct ReturnRangePolicyType { } }; #endif 
+#ifdef KOKKOS_ENABLE_HIP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 295b46df9b..c069c618e6 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -72,6 +72,7 @@ struct Parameters{ int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; @@ -121,6 +122,7 @@ struct Parameters{ use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; From 27e0a29071da9fd153e07a88c6a87244f127bb2f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Oct 2020 12:56:39 -0600 Subject: [PATCH 059/106] Fixed spmv for OpenMP --- src/common/KokkosKernels_SparseUtils.hpp | 2 - .../impl/KokkosSparse_gauss_seidel_impl.hpp | 1 - .../impl/KokkosSparse_spgemm_impl_def.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 182 +++++++++--------- .../impl/KokkosSparse_spmv_struct_impl.hpp | 177 +++++++++-------- 5 files changed, 178 insertions(+), 185 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 7628e6de31..02ab3a50b7 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ 
b/src/common/KokkosKernels_SparseUtils.hpp @@ -1341,8 +1341,6 @@ void kk_sort_graph( out_nnz_view_t out_adj, out_scalar_view_t out_vals){ - ExecSpaceType exec = kk_get_exec_space_type(); - // If possible, sort on host and avoid a deep copy // TODO BMK: can this function be deprecated? typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d956ed8d4d..d5c111862f 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -826,7 +826,6 @@ namespace KokkosSparse{ nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 4924e11b0c..8fdf276e61 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -121,7 +121,6 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); //compress in single step if it is GPU. 
diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3389577497..4645a08b63 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -366,7 +366,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -627,7 +628,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), [&](ordinal_type k) { - m_y(iRow, kk + k) = sum[k]; + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; }); } else if (dobeta == -1) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), @@ -662,7 +663,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry),0); - }); + }, sum); Kokkos::single(Kokkos::PerThread(dev), [&]() { @@ -703,14 +704,97 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. 
ordinal_type kk = 0; -//#ifdef KOKKOS_FAST_COMPILE +#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } - //BMK: HERE +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE } } }; @@ -749,7 +833,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated 
functions on doalpha and dobeta and will produce 16 kernels @@ -939,88 +1024,3 @@ spmv_alpha_mv (const char mode[], }} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ - /* -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) { - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) { - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 3575f87dca..f4fa9ea1cd 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1118,14 +1118,99 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { // needs to have the same type as n. 
ordinal_type kk = 0; -//#ifdef KOKKOS_FAST_COMPILE +#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } - //BMK: HERE +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE } } }; @@ -1427,91 +1512,3 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { } #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ - /* -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - { - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - 
strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - { - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; - #endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } - } -#endif // KOKKOS_FAST_COMPILE - */ From f993534289950c566205e25fded0236e80c379d6 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Oct 2020 13:06:41 -0600 Subject: [PATCH 060/106] Removed #pragma unroll Used to be a normal for loop, now it's a ThreadVectorRange --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 4645a08b63..1d2f737fa6 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -565,15 +565,6 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. 
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { From 5e0b1191d97e43902b12205a71b9f49958c3ef32 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 12 Oct 2020 10:18:54 -0600 Subject: [PATCH 061/106] Update for deprecated removal --- .../impl/KokkosSparse_spmv_struct_impl.hpp | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index f4fa9ea1cd..be563c5257 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1259,7 +1259,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1271,23 +1270,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1298,7 +1280,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1310,24 +1291,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI - #endif // KOKKOS_FAST_COMPILE } } @@ -1376,7 +1339,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1388,23 +1350,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1415,7 +1360,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1427,23 +1371,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE } From 5315d0f9b0a478f9e799f1195709134668918025 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 13 Oct 2020 18:18:25 -0700 Subject: [PATCH 062/106] Fix SpMV transpose functors --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 52 ++++++++++------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1d2f737fa6..86b342647b 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -81,7 +81,6 @@ struct GetCoeffView,DeviceType> { template struct SPMV_Transpose_Functor { typedef typename AMatrix::execution_space execution_space; @@ -96,32 +95,26 @@ struct SPMV_Transpose_Functor { const coefficient_type alpha; AMatrix m_A; XVector m_x; - const coefficient_type beta; YVector m_y; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), m_A (m_A_), m_x (m_x_), - beta (beta_), m_y (m_y_), - rows_per_thread (rows_per_thread_) + const YVector& m_y_) : + alpha 
(alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -366,18 +359,18 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, y); const int rows_per_thread = RowsPerThread (NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -444,30 +437,27 @@ struct SPMV_MV_Transpose_Functor { YVector m_y; const ordinal_type n; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; 
SPMV_MV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const YVector& m_y_) : alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -906,13 +896,15 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + //Transpose functor uses atomics which can't be vectorized on CPU + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -924,6 +916,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const int 
rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -935,7 +928,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -945,6 +938,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); From da542886d32819e9b02c0e684a7f1b23c2ce0ce5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 10:42:52 -0700 Subject: [PATCH 063/106] Add back D1 default algorithm verbose output --- src/graph/KokkosGraph_Distance1ColorHandle.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index e85412abb6..503c6c9310 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ 
b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -237,9 +237,19 @@ class GraphColoringHandle { auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); if(exec == KokkosKernels::Impl::Exec_SERIAL) + { this->coloring_algorithm_type = COLORING_SERIAL; +#ifdef VERBOSE + std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; +#endif + } else + { this->coloring_algorithm_type = COLORING_VBBIT; +#ifdef VERBOSE + std:cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; +#endif + } } template From b5349f110f27f96f2d4261fdeec8b6072c644ea1 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 10:43:21 -0700 Subject: [PATCH 064/106] Fix HIP device code macros It's __HIP_DEVICE_COMPILE__, not __CUDA_ARCH__. --- perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_LU_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp | 2 +- src/batched/KokkosBatched_Vector_SIMD.hpp | 4 ++-- src/batched/KokkosBatched_Vector_SIMD_Arith.hpp | 8 ++++---- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 8 ++++---- src/common/KokkosKernels_BitUtils.hpp | 5 ++--- src/common/KokkosKernels_SparseUtils.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 4 ++-- unit_test/batched/Test_Batched_SerialTrmm.hpp | 2 +- unit_test/batched/Test_Batched_SerialTrtri.hpp | 2 +- unit_test/blas/Test_Blas3_gemm.hpp | 2 +- unit_test/blas/Test_Blas3_trmm.hpp | 2 +- unit_test/blas/Test_Blas3_trsm.hpp | 3 ++- unit_test/blas/Test_Blas_trtri.hpp | 2 +- 19 files changed, 30 insertions(+), 31 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index 
f37c2d1b6f..ac8abb18f7 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -51,7 +51,7 @@ using namespace KokkosBatched; int main (int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; const bool detail = false; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index adff41c48b..2fffa06855 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 7bb2a2907c..031909d540 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 8468800ee6..56ade7a446 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ 
b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; const int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7b39c624f2..7d352283c6 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int N = 128*128; for (int i=1;i #include -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #undef __KOKKOSBATCHED_ENABLE_AVX__ #else // compiler bug with AVX in some architectures @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) namespace KokkosBatched { template<> diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 43ddbb101b..49317ca9d4 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && 
(defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index 2e50a0064c..fc5ba4dfa6 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -64,20 +64,20 @@ namespace Impl { // On GPUs it is more important to not jump around in global memory, i.e. 
have coallesced loads template struct impl_gemm_choose_copy_layout { - typedef LayoutAScratch type; + using type = LayoutAScratch; }; #ifdef KOKKOS_ENABLE_CUDA template struct impl_gemm_choose_copy_layout { - typedef LayoutA type; + using type = LayoutA; }; #endif #ifdef KOKKOS_ENABLE_HIP template struct impl_gemm_choose_copy_layout { - typedef LayoutA type; + using type = LayoutA; }; #endif @@ -399,7 +399,7 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) { typedef typename ViewTypeC::non_const_value_type ScalarC; // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) || !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) int blockA0 = A.extent_int(0); int blockA1 = A.extent_int(1); int blockB1 = B.extent_int(1); diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index 28b2a01389..4d09fb964e 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,8 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits -// Note BMK: HIP also defines __CUDA_ARCH__, and provides the same intrinsics. 
-#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ return __popc(i); @@ -182,7 +181,7 @@ int pop_count( long long i ){ // least_set_bit function returns the position of right most set bit -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned i ){ return __ffs(i); diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 02ab3a50b7..6979f15847 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1341,7 +1341,6 @@ void kk_sort_graph( out_nnz_view_t out_adj, out_scalar_view_t out_vals){ - // If possible, sort on host and avoid a deep copy // TODO BMK: can this function be deprecated? typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); Kokkos::deep_copy (hr, in_xadj); diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 86b342647b..1c011e42d9 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -693,7 +693,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -702,7 +702,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<8>(dev, iRow, kk); if(kk < n) { switch(n - kk) { -# else // NOT a CUDA device +# else // NOT a GPU if ((n > 16) && (n % 16 == 1)) { strip_mine<17>(dev, iRow, kk); kk += 17; @@ -745,7 +745,7 @@ struct SPMV_MV_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; -# endif // __CUDA_ARCH__ +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git 
a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index be563c5257..3179a0cc31 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1126,7 +1126,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -1180,7 +1180,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; - #endif // __CUDA_ARCH__ + #endif // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git a/unit_test/batched/Test_Batched_SerialTrmm.hpp b/unit_test/batched/Test_Batched_SerialTrmm.hpp index 8f8fd48758..3301f3cd42 100644 --- a/unit_test/batched/Test_Batched_SerialTrmm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrmm.hpp @@ -54,7 +54,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_SerialTrtri.hpp b/unit_test/batched/Test_Batched_SerialTrtri.hpp index c50e26ae35..f4f74d6b7c 100644 --- a/unit_test/batched/Test_Batched_SerialTrtri.hpp +++ b/unit_test/batched/Test_Batched_SerialTrtri.hpp @@ -56,7 +56,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else 
const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 55c71231f6..451b7fedac 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -25,7 +25,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp index 74fd49b988..9f72bd5e63 100644 --- a/unit_test/blas/Test_Blas3_trmm.hpp +++ b/unit_test/blas/Test_Blas3_trmm.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp index e6e98723c2..8fec44b637 100644 --- a/unit_test/blas/Test_Blas3_trsm.hpp +++ b/unit_test/blas/Test_Blas3_trsm.hpp @@ -49,7 +49,8 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas_trtri.hpp b/unit_test/blas/Test_Blas_trtri.hpp index f939b87b31..bcc6b842c8 100644 --- a/unit_test/blas/Test_Blas_trtri.hpp +++ 
b/unit_test/blas/Test_Blas_trtri.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); From 9a9ec3443b8be533f35d9baf4ca865f1ae9e7741 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 10:52:53 -0700 Subject: [PATCH 065/106] Restore d2 coloring verbose about default algo --- src/graph/KokkosGraph_Distance2ColorHandle.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index 4c392051fb..39d66b744f 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -206,9 +206,19 @@ class GraphColorDistance2Handle void choose_default_algorithm() { if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) + { this->coloring_algorithm_type = COLORING_D2_SERIAL; +#ifdef VERBOSE + std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; +#endif + } else + { this->coloring_algorithm_type = COLORING_D2_NB_BIT; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; +#endif + } } From 2c3e3a46750f21d7d1b268e47645da9089b2ca54 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 11:01:38 -0700 Subject: [PATCH 066/106] Fix indent --- src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 29dbb5c477..ec0c2034a2 100644 --- 
a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1508,7 +1508,7 @@ void KokkosSPGEMM current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; From d2448f2943052642a03a50c95588e1728283dd40 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 12:48:25 -0700 Subject: [PATCH 067/106] Factor out pool #chunks computation for SpGEMM (same code used in 7 places) --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 27 ++++++++++++ .../KokkosSparse_spgemm_impl_compression.hpp | 21 +-------- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 20 +-------- .../KokkosSparse_spgemm_impl_symbolic.hpp | 43 ++----------------- .../KokkosSparse_spgemm_impl_triangle.hpp | 21 ++------- ...se_spgemm_impl_triangle_no_compression.hpp | 19 ++------ ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 18 +------- 7 files changed, 43 insertions(+), 126 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index a8a539ef10..52ae067801 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -789,6 +789,33 @@ class KokkosSPGEMM{ }; +//Utility to compute the number of pool chunks for L2 hashmap accumulators. +//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. 
+// +//chunk_bytes: bytes in each chunk +//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) +template +size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) +{ + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; +} } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index c881c98ed4..6936a49f15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,26 +860,9 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - size_t num_chunks = concurrency / suggested_vector_size; + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " 
free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 38fce91b1b..e81b019e15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1421,25 +1421,9 @@ void chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index ec0c2034a2..4eb13d9b5e 100644 --- 
a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1642,30 +1642,13 @@ void KokkosSPGEMM } //initizalize value for the mem pool - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1970,31 +1953,13 @@ void KokkosSPGEMM std::cout << "\tDense Acc - COLS:" << col_size << " max_row_size:" << max_row_size << std::endl; } } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; - KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 27c0f4c7d9..6624343b52 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1416,29 +1416,14 @@ void KokkosSPGEMM } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - if(exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index ae913f864a..adc75d6eb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -963,26 +963,13 @@ void KokkosSPGEMM pool_init_val = 0; } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2e12457822..2140b8dc56 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1401,23 +1401,9 @@ namespace KokkosSparse{ chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz From f4cacdc19ccf98c89f22e5d65246cf8782189433 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:10:09 -0700 Subject: [PATCH 068/106] Made compute_num_pool_chunks a member of SpGEMM --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 55 +++++++++---------- .../KokkosSparse_spgemm_impl_compression.hpp | 2 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 4 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 2 +- ...se_spgemm_impl_triangle_no_compression.hpp | 2 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 2 +- 7 
files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 52ae067801..19e576eb9d 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -787,35 +787,34 @@ class KokkosSPGEMM{ typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType my_exec_space); -}; - -//Utility to compute the number of pool chunks for L2 hashmap accumulators. -//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. -// -//chunk_bytes: bytes in each chunk -//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) -template -size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) -{ - if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) - return ideal_num_chunks; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = ideal_num_chunks * chunk_bytes; - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - size_t num_chunks = ideal_num_chunks; - //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down - if (required_size > free_byte / 2) { - num_chunks = (free_byte / 2) / chunk_bytes; + //Utility to compute the number of pool chunks for L2 hashmap accumulators. + //Uses free memory query for accelerators/GPUs but assumes infinite available host memory. 
+ // + //chunk_bytes: bytes in each chunk + //ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) + { + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; } - //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; - while (po2_num_chunks * 2 < num_chunks) { - po2_num_chunks *= 2; - } - return po2_num_chunks; -} +}; } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6936a49f15..35f00201a2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,7 +860,7 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp 
b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index e81b019e15..a5fc298e2c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1422,7 +1422,7 @@ void chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 4eb13d9b5e..f6f4e8e3a8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1647,7 +1647,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1958,7 +1958,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 6624343b52..c06d4c4cb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1421,7 +1421,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * 
sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index adc75d6eb2..6a9b67c0b2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -967,7 +967,7 @@ void KokkosSPGEMM if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2140b8dc56..d4c2c98a6f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1402,7 +1402,7 @@ namespace KokkosSparse{ chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ From 7aef9b13a1b38e19392074cbe18381802dfad115 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:13:13 -0700 Subject: [PATCH 069/106] Fix signed vs. 
unsigned --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 19e576eb9d..06a3153ad9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -808,7 +808,7 @@ class KokkosSPGEMM{ num_chunks = (free_byte / 2) / chunk_bytes; } //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; + size_t po2_num_chunks = 1; while (po2_num_chunks * 2 < num_chunks) { po2_num_chunks *= 2; } From fd94bd47e3fa964a4054c510564fe1a3d83e2472 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:24:56 -0600 Subject: [PATCH 070/106] WIP: improving performance of spmv for openmp --- perf_test/sparse/CMakeLists.txt | 5 + perf_test/sparse/KokkosSparse_kk_spmv.cpp | 186 ++++++++ src/common/KokkosKernels_ExecSpaceUtils.hpp | 20 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 427 ++++++++++++++---- .../impl/KokkosSparse_spmv_impl_omp.hpp | 1 - unit_test/sparse/Test_Sparse_spmv.hpp | 31 ++ 6 files changed, 576 insertions(+), 94 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_kk_spmv.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index da22993cda..f0662e4a08 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -43,6 +43,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_spmv.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp + ) + IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spmv_merge diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp new file mode 100644 index 0000000000..07c29e3735 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -0,0 +1,186 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" + +typedef default_scalar Scalar; +typedef default_lno_t Ordinal; +typedef default_size_type Offset; + +template +void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { + typedef KokkosSparse::CrsMatrix matrix_type; + typedef typename Kokkos::View mv_type; + typedef typename mv_type::HostMirror h_mv_type; + + srand(17312837); + matrix_type A; + if(filename) + A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + else + { + Offset nnz = 10 * numRows; + //note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, 0, 0.01 * numRows); + } + numRows = A.numRows(); + numCols = A.numCols(); + Offset nnz = A.nnz(); + mv_type x("X", numCols, num_vecs); + mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for(int v = 0; v < num_vecs; v++) + { + for(int i=0; i::value) + layout = 'L'; + else + layout = 'R'; + int loop = 100; + int num_vecs = 1; + Scalar beta = 0.0; + + if(argc == 1) { + print_help(); + return 0; + } + + for(int i=0;i(size,size,filename,loop,num_vecs,mode,beta); + else + run_spmv(size,size,filename,loop,num_vecs,mode,beta); + + Kokkos::finalize(); +} + diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 22930c82e1..59bcf487fb 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -55,7 +55,7 @@ namespace Impl{ enum ExecSpaceType{Exec_SERIAL, Exec_OMP, 
Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ +KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -98,11 +98,23 @@ constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ template constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - auto exec = kk_get_exec_space_type(); - //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces - return exec == Exec_CUDA || exec == Exec_HIP; + return false; } +#ifdef KOKKOS_ENABLE_CUDA +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. //Will throw if execution space doesn't support this. template diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1c011e42d9..558acc363a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -171,10 +171,38 @@ struct SPMV_Functor { "YVector must be a rank 1 View."); } + KOKKOS_INLINE_FUNCTION + void operator() (const ordinal_type iRow) const + { + using y_value_type = typename YVector::non_const_value_type; + if (iRow >= m_A.numRows ()) { + return; + } + const KokkosSparse::SparseRowViewConst row = m_A.rowConst(iRow); + const ordinal_type row_length = static_cast (row.length); + y_value_type sum = 0; + + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? 
+ ATV::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry)); + } + + sum *= alpha; + + if (dobeta == 0) { + m_y(iRow) = sum ; + } else { + m_y(iRow) = beta * m_y(iRow) + sum; + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - typedef typename YVector::non_const_value_type y_value_type; + using y_value_type = typename YVector::non_const_value_type; Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& loop) { @@ -213,9 +241,19 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th if(nnz_per_row < 1) nnz_per_row = 1; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(vector_length < 1) { vector_length = 1; - while(vector_length<32 && vector_length*6 < nnz_per_row) + while(vector_length < max_vector_length && vector_length * 6 < nnz_per_row) vector_length*=2; } @@ -280,21 +318,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, ((int) A.graph.row_block_offsets.extent(0) == (int) omp_get_max_threads()+1) && (((uintptr_t)(const void*)(x.data())%64)==0) && (((uintptr_t)(const void*)(y.data())%64)==0) ) { + //Note BMK: this case is typically not called in practice even for OpenMP, since + //it requires row_block_offsets to have been computed in the graph. spmv_raw_openmp_no_transpose(alpha,A,x,beta,y); return; } #endif - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. 
- if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} - if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} - if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule if(controls.isParameter("schedule")) { @@ -304,26 +335,45 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, use_static_schedule = true; } } - - int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); - int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; - - SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); - - if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); - } else { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + if(use_teams) { + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + // Note on 03/24/20, lbv: We can use the controls + // here to allow the user to pass in some tunning + // parameters. 
+ if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} + if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} + if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + + int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); + int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; + + SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); + + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } else { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } + } + else { + SPMV_Functor func (alpha,A,x,beta,y,1); + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); } } @@ -339,7 +389,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -351,15 
+402,23 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -367,9 +426,9 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); - const int rows_per_thread = RowsPerThread (NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -626,6 +685,65 @@ struct SPMV_MV_LayoutLeft_Functor { } } + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; + +#ifdef 
KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + if(doalpha == 1) + sum[k] += val * m_x(ind, kk + k); + else if(doalpha == -1) + sum[k] -= val * m_x(ind, kk + k); + else + sum[k] += alpha * val * m_x(ind, kk + k); + } + } + + if(doalpha == -1) + + if (dobeta == 0) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = sum[k]; + } else if (dobeta == 1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + } else if (dobeta == -1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + } else { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + } + } + KOKKOS_INLINE_FUNCTION void strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { @@ -666,6 +784,141 @@ struct SPMV_MV_LayoutLeft_Functor { }); } + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. 
This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + y_value_type sum = y_value_type(); + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry),0); + } + if (doalpha == -1) { + sum = -sum; + } else if (doalpha != 1) { + sum *= alpha; + } + + if (dobeta == 0) { + m_y(iRow, 0) = sum ; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum ; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + } + + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type& iRow) const + { + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. + ordinal_type kk = 0; + +#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } +#else +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(iRow, kk); + break; + + case 14: + strip_mine<14>(iRow, kk); + break; + + case 13: + strip_mine<13>(iRow, kk); + break; + + case 12: + strip_mine<12>(iRow, kk); + break; + + case 11: + strip_mine<11>(iRow, kk); + break; + + case 10: + strip_mine<10>(iRow, kk); + break; + + case 9: + strip_mine<9>(iRow, kk); + break; + + case 8: + strip_mine<8>(iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or 
__HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(iRow, kk); + break; + + case 6: + strip_mine<6>(iRow, kk); + break; + + case 5: + strip_mine<5>(iRow, kk); + break; + + case 4: + strip_mine<4>(iRow, kk); + break; + + case 3: + strip_mine<3>(iRow, kk); + break; + + case 2: + strip_mine<2>(iRow, kk); + break; + + case 1: + strip_mine_1(iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const @@ -794,7 +1047,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -806,16 +1060,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a return; } else { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. 
- const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); - int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + ordinal_type vector_length = 1; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length *= 2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -825,17 +1081,17 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space >( 0, nrow ), op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -846,18 +1102,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } } @@ -875,7 +1131,8 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -895,10 +1152,12 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph // the appropriate type is ordinal_type. 
const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; + ordinal_type vector_length = 1; //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length*=2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -906,16 +1165,11 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph doalpha, dobeta, conjugate> OpType; OpType op (alpha, A, x, beta, y); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + const ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -930,14 +1184,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index a4f1c07258..72c8a969fe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -47,7 +47,6 @@ namespace Impl { #ifdef KOKKOS_ENABLE_OPENMP template void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatrix A, XVector x, typename YVector::const_value_type& s_b, YVector y) { - typedef typename YVector::non_const_value_type value_type; typedef typename AMatrix::ordinal_type ordinal_type; typedef typename AMatrix::non_const_size_type size_type; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index e27012991a..598f906f8d 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -450,6 +450,36 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v } } +template +void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef Kokkos::View ViewTypeX; + typedef Kokkos::View ViewTypeY; + + crsMat_t input_mat = 
KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + for(int nv = 1; nv <= numMV; nv++) { + ViewTypeX b_x("A",numRows,nv); + ViewTypeY b_y("B",numRows,nv); + ViewTypeY b_y_copy("B",numRows,nv); + + Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + + Kokkos::deep_copy(b_y_copy, b_y); + + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + } +} + template void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { @@ -816,6 +846,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 792e48f6a6763b889d7c11d1a598218bdaf0ca4b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:46:41 -0600 Subject: [PATCH 071/106] Fixed typo in spmv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 2 -- unit_test/sparse/Test_Sparse_spmv.hpp | 11 ++++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 558acc363a..55da2dea60 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -727,8 +727,6 @@ struct SPMV_MV_LayoutLeft_Functor { } } - if(doalpha == -1) - if (dobeta == 0) { 
for(ordinal_type k = 0; k < UNROLL; k++) m_y(iRow, kk + k) = sum[k]; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 598f906f8d..5a033fdf34 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -42,7 +42,7 @@ struct fSPMV { if(error > eps) { err++; - printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); + //printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); } } }; @@ -203,8 +203,9 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto my_exec_space(0,y_i.extent(0)), fSPMV(y_i, y_spmv, eps), num_errors); - if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", - num_errors, y_i.extent_int(0), i); + if(num_errors>0) + std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) + << " for mv " << i << " (alpha=" << alpha << ", beta=" << beta << ", mode = " << mode << ")\n"; EXPECT_TRUE(num_errors==0); } } @@ -458,7 +459,7 @@ void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_ typedef Kokkos::View ViewTypeX; typedef Kokkos::View ViewTypeY; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numRows,nnz,row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool(13718); for(int nv = 1; nv <= numMV; nv++) { @@ -846,7 +847,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ - test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 
55933aa0790b248c6171c080e238c87fb8108787 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 22:50:44 -0600 Subject: [PATCH 072/106] Use range policy for omp mode T spmv/spmv_mv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 123 ++++++++++++++++----- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 55da2dea60..e3934e9d5d 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -105,6 +105,21 @@ struct SPMV_Transpose_Functor { alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type iRow) const + { + const auto row = m_A.rowConst (iRow); + const ordinal_type row_length = row.length; + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -408,6 +423,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -417,8 +433,10 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, if(std::is_same::value) max_vector_length = 64; #endif - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } typedef SPMV_Transpose_Functor OpType; @@ -426,14 +444,19 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, 
A, x, y); - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } } template::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + + if (doalpha != 1) { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (alpha * val * m_x(iRow, k))); + } + } else { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (val * m_x(iRow, k))); + } + } + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -1151,8 +1207,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / 
A.numRows ()); ordinal_type vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if(use_teams) { while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) vector_length*=2; } @@ -1164,14 +1221,19 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); const ordinal_type nrow = A.numRows(); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::RangePolicy < typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1179,16 +1241,21 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph 2, 2, conjugate, SizeType> OpType; typename AMatrix::const_ordinal_type 
nrow = A.numRows(); + if(use_teams) { + OpType op (alpha, A, x, beta, y); - OpType op (alpha, A, x, beta, y); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } From 80fc49c1e16236f6b33115d972bbe584ac602c85 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 08:58:56 -0600 Subject: [PATCH 073/106] Remove duplicate local typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index e3934e9d5d..d42a0c81b2 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -1199,7 +1199,6 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph } if (doalpha != 0) { - typedef typename 
AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, From 1a35a8d33a6acceacae542700c1b1ad375bb0aa5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 09:05:09 -0600 Subject: [PATCH 074/106] Remove unused var --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 07c29e3735..aa8f2ddfa3 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -80,7 +80,6 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, } numRows = A.numRows(); numCols = A.numCols(); - Offset nnz = A.nnz(); mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); h_mv_type h_x = Kokkos::create_mirror_view(x); From d3909de4a28049a507e8983cbac6febecb188010 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 10:03:34 -0600 Subject: [PATCH 075/106] Fix execution_space typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index d42a0c81b2..7b91f95e09 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -406,6 +406,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, { using ordinal_type = typename AMatrix::non_const_ordinal_type; using size_type = typename AMatrix::non_const_size_type; + using execution_space = typename AMatrix::execution_space; if (A.numRows () <= static_cast (0)) { return; @@ -423,7 +424,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + 
bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -445,16 +446,16 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > ( nteams , team_size , vector_length ) , op ); } else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > ( 0 , nrow ) , op ); } } From 19e1eb1e93bb94adde5704bd9b0b79fcc936ad5a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 27 Oct 2020 15:07:42 -0700 Subject: [PATCH 076/106] Remove HALF_IMPL_TYPE --- src/Kokkos_ArithTraits.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index d0e36b443c..30c7c11f56 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -677,7 +677,8 @@ class ArithTraits { // Since Kokkos::Experimental::half_t falls back to float, only define // ArithTraits if an IMPL type exists -#if defined(HAVE_KOKKOS_HALFMATH) && defined(HALF_IMPL_TYPE) +#if defined(HAVE_KOKKOS_HALFMATH) &&\ + 
defined(KOKKOS_ENABLE_CUDA_HALF) // TODO: Check for other backends template <> class ArithTraits { public: From ac22fb25bb6271468dd7dacdab3feb7cd0e15f3a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 28 Oct 2020 10:37:08 -0600 Subject: [PATCH 077/106] HIP: fixing a couple of issues on AMD 1. typo: Kokkos::HIP instead of Kokkos::Experimental::HIP 2. __ffsll is not overloaded in HIP so need to cast the input correctly --- src/common/KokkosKernels_BitUtils.hpp | 8 ++++++++ src/sparse/KokkosSparse_CrsMatrix.hpp | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index 4d09fb964e..3dc78f77b1 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -189,7 +189,11 @@ int least_set_bit( unsigned i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } @@ -207,7 +211,11 @@ int least_set_bit( int i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } KOKKOS_FORCEINLINE_FUNCTION diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index c618d3add6..d866a63601 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -106,7 +106,7 @@ inline int RowsPerThread(const int NNZPerRow) { #endif #ifdef KOKKOS_ENABLE_HIP template<> -inline int RowsPerThread(const int NNZPerRow) { +inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif From eb0ad7c76a91e890a1eccb2d6737aa19ca314bcf Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 29 Oct 2020 10:22:32 -0600 Subject: [PATCH 078/106] Add hip support to dist-1 coloring perftest --- perf_test/graph/KokkosGraph_color.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 
deletions(-) diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index f7d8a93e80..a3fecb4c99 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -76,6 +76,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #if defined(KOKKOS_ENABLE_CUDA) << spaces << " --cuda Use CUDA (device $id)" << std::endl +#endif +#if defined(KOKKOS_ENABLE_HIP) + << spaces << " --hip Use HIP (device $id)" << std::endl #endif << std::endl << spaces << " Required Parameters:" << std::endl @@ -131,6 +134,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters &params, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -212,7 +218,7 @@ int parse_inputs (KokkosKernels::Experiment::Parameters &params, int argc, char print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -539,7 +545,7 @@ int main (int argc, char ** argv){ std::cout << "Sizeof(idx):" << sizeof(idx) << " sizeof(size_type):" << sizeof(size_type) << std::endl; const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads - const int device_id = 0; + const int device_id = std::max(params.use_cuda, params.use_hip) - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); From 2ec709c20d21ac7b3b63443876381598b0d35266 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 
29 Oct 2020 14:40:14 -0600 Subject: [PATCH 079/106] cm_test_all_sandia: add caraway --- scripts/cm_test_all_sandia | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index e31ff017d9..edb97e0edf 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -110,6 +110,7 @@ MACHINE="" HOSTNAME=$(hostname) PROCESSOR=`uname -p` CUDA_ENABLE_CMD= +HIP_ENABLE_CMD= #Command(s) for accessing local modules on the current machine, #e.g. "module use ..." #This will be added to reproducer instructions/script. @@ -152,6 +153,10 @@ if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name MACHINE=mayer fi +if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name + MACHINE=caraway +fi + if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then MACHINE=kokkos-dev fi @@ -629,6 +634,23 @@ elif [ "$MACHINE" = "mayer" ]; then fi SPACK_HOST_ARCH="+armv8_tx2" +elif [ "$MACHINE" = "caraway" ]; then + SKIP_HWLOC=True + BUILD_ONLY=True + # report_and_log_test_result: only testing compilation of code for now, + # output description and success based only on build success; build time output (no run-time) + + BASE_MODULE_LIST="cmake/3.12.3,/" + + HIPCLANG_BUILD_LIST="Hip_Serial" + HIPCLANG_WARNING_FLAGS="" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/3.8.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=VEGA900" + fi elif [ "$MACHINE" = "blake" ]; then MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" @@ -1190,6 +1212,10 @@ single_build_and_test() { if [[ "$LOCAL_KOKKOS_DEVICES" = *Cuda* ]]; then CUDA_ENABLE_CMD="--with-cuda=$CUDA_ROOT" fi + if [[ "$LOCAL_KOKKOS_DEVICES" = *Hip* ]]; then + echo "Hip IS THE KOKKOS DEVICE" + HIP_ENABLE_CMD="--with-hip" + fi local arch_code=$(echo $ARCH_FLAG | cut -d "=" -f 2) echo 
"kokkos devices: ${LOCAL_KOKKOS_DEVICES}" echo "kokkos arch: ${arch_code}" @@ -1204,13 +1230,13 @@ single_build_and_test() { # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic 
path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} 
--with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 if [[ "$MACHINE" = white* ]]; then @@ -1257,6 +1283,9 @@ run_in_background() { if [[ "$compiler" == cuda* ]]; then num_jobs=1 fi + if [[ "$compiler" == rocm* ]]; then + num_jobs=1 + fi if [[ "$compiler" == clang ]]; then num_jobs=1 fi From 1e486b0a445e48906090d69831f810e3006b436e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 29 Oct 2020 14:45:13 -0600 Subject: [PATCH 080/106] cm_test_all_sandia: add sems-cuda/11.1 to kokkos-dev* --- scripts/cm_test_all_sandia | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index edb97e0edf..e461e1ce0c 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -448,6 +448,8 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then module load sems-cmake/3.12.2 BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" + 
CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/9.2.0" CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-cuda/9.2" SKIP_HWLOC=True @@ -488,6 +490,8 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1 $CUDA10_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi SPACK_CUDA_ARCH="+kepler35" @@ -809,6 +813,7 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/9.2 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CLANG8_CUDA_WARNING_FLAGS" "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" From b3efcda0ea7021f7cd0dd472435774548bd04286 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 20 Oct 2020 12:46:13 -0600 Subject: [PATCH 081/106] The goal is to make sure the HIP backend is properly instantiated by ETI and tested in the unit-tests using the Kokkos::Experimental::HIP and Kokkos::Experimental::HIPSpace execution and memory spaces. 
Status of the unit-tests: Batched -> does not build Blas -> builds and Blas1 tests pass, others fail Common -> does not build Graph -> does not build Sparse -> builds, but only a few tests run before it crashes Fix blas and blas3 perf_test: these perf_tests use parallel_for to run multiple tests at once while on host and call a host-only function to do so. These are now being skipped when KOKKOS_ENABLE_HIP is on. All performance tests are building. --- cmake/KokkosKernelsConfig.cmake.in | 1 + cmake/KokkosKernels_config.h.in | 3 + cmake/kokkos_backends.cmake | 1 + cmake/kokkoskernels_eti_devices.cmake | 29 +++++++++ .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 10 +-- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +- unit_test/CMakeLists.txt | 61 +++++++++++++++++++ unit_test/hip/Test_HIP.hpp | 21 +++++++ ..._Batched_SerialEigendecomposition_Real.cpp | 3 + .../Test_HIP_Batched_SerialGemm_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialGemm_Real.cpp | 3 + .../Test_HIP_Batched_SerialGemv_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialGemv_Real.cpp | 3 + ...st_HIP_Batched_SerialInverseLU_Complex.cpp | 3 + .../Test_HIP_Batched_SerialInverseLU_Real.cpp | 3 + .../hip/Test_HIP_Batched_SerialLU_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialLU_Real.cpp | 3 + ...Test_HIP_Batched_SerialMatUtil_Complex.cpp | 3 + .../Test_HIP_Batched_SerialMatUtil_Real.cpp | 3 + ...Test_HIP_Batched_SerialSolveLU_Complex.cpp | 3 + .../Test_HIP_Batched_SerialSolveLU_Real.cpp | 3 + .../Test_HIP_Batched_SerialTrmm_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialTrmm_Real.cpp | 3 + .../Test_HIP_Batched_SerialTrsm_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialTrsm_Real.cpp | 3 + .../Test_HIP_Batched_SerialTrsv_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialTrsv_Real.cpp | 3 + .../Test_HIP_Batched_SerialTrtri_Complex.cpp | 3 + .../hip/Test_HIP_Batched_SerialTrtri_Real.cpp | 3 + .../hip/Test_HIP_Batched_TeamGemm_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamGemm_Real.cpp | 3 + 
.../hip/Test_HIP_Batched_TeamGemv_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamGemv_Real.cpp | 3 + ...Test_HIP_Batched_TeamInverseLU_Complex.cpp | 3 + .../Test_HIP_Batched_TeamInverseLU_Real.cpp | 3 + .../hip/Test_HIP_Batched_TeamLU_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamLU_Real.cpp | 3 + .../Test_HIP_Batched_TeamMatUtil_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamMatUtil_Real.cpp | 3 + .../Test_HIP_Batched_TeamSolveLU_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamSolveLU_Real.cpp | 3 + .../hip/Test_HIP_Batched_TeamTrsm_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamTrsm_Real.cpp | 3 + .../hip/Test_HIP_Batched_TeamTrsv_Complex.cpp | 3 + .../hip/Test_HIP_Batched_TeamTrsv_Real.cpp | 3 + .../Test_HIP_Batched_TeamVectorQR_Real.cpp | 3 + ...d_TeamVectorQR_WithColumnPivoting_Real.cpp | 3 + ...st_HIP_Batched_TeamVectorSolveUTV_Real.cpp | 6 ++ .../Test_HIP_Batched_TeamVectorUTV_Real.cpp | 3 + unit_test/hip/Test_HIP_Blas1_abs.cpp | 2 + unit_test/hip/Test_HIP_Blas1_asum.cpp | 2 + unit_test/hip/Test_HIP_Blas1_axpby.cpp | 2 + unit_test/hip/Test_HIP_Blas1_axpy.cpp | 2 + unit_test/hip/Test_HIP_Blas1_dot.cpp | 2 + unit_test/hip/Test_HIP_Blas1_iamax.cpp | 2 + unit_test/hip/Test_HIP_Blas1_mult.cpp | 2 + unit_test/hip/Test_HIP_Blas1_nrm1.cpp | 2 + unit_test/hip/Test_HIP_Blas1_nrm2.cpp | 2 + unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp | 2 + unit_test/hip/Test_HIP_Blas1_nrminf.cpp | 2 + unit_test/hip/Test_HIP_Blas1_reciprocal.cpp | 2 + unit_test/hip/Test_HIP_Blas1_scal.cpp | 2 + unit_test/hip/Test_HIP_Blas1_sum.cpp | 2 + unit_test/hip/Test_HIP_Blas1_team_abs.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_axpby.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_axpy.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_dot.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_mult.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_scal.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_team_update.cpp | 4 ++ unit_test/hip/Test_HIP_Blas1_update.cpp | 2 + 
unit_test/hip/Test_HIP_Blas2_gemv.cpp | 2 + unit_test/hip/Test_HIP_Blas2_team_gemv.cpp | 4 ++ unit_test/hip/Test_HIP_Blas3_gemm.cpp | 2 + unit_test/hip/Test_HIP_Blas3_trmm.cpp | 2 + unit_test/hip/Test_HIP_Blas3_trsm.cpp | 2 + unit_test/hip/Test_HIP_Blas_gesv.cpp | 4 ++ unit_test/hip/Test_HIP_Blas_trtri.cpp | 2 + unit_test/hip/Test_HIP_Common_ArithTraits.cpp | 2 + unit_test/hip/Test_HIP_Common_Sorting.cpp | 2 + unit_test/hip/Test_HIP_Common_Transpose.cpp | 2 + .../hip/Test_HIP_Common_set_bit_count.cpp | 2 + unit_test/hip/Test_HIP_Graph_graph_color.cpp | 2 + .../hip/Test_HIP_Graph_graph_color_d2.cpp | 2 + ...st_HIP_Graph_graph_color_deterministic.cpp | 2 + unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp | 3 + .../Test_HIP_Sparse_block_gauss_seidel.cpp | 2 + .../hip/Test_HIP_Sparse_findRelOffset.cpp | 2 + .../hip/Test_HIP_Sparse_gauss_seidel.cpp | 2 + .../hip/Test_HIP_Sparse_replaceSumInto.cpp | 2 + .../Test_HIP_Sparse_replaceSumIntoLonger.cpp | 2 + unit_test/hip/Test_HIP_Sparse_spadd.cpp | 3 + unit_test/hip/Test_HIP_Sparse_spgemm.cpp | 2 + .../hip/Test_HIP_Sparse_spgemm_jacobi.cpp | 2 + unit_test/hip/Test_HIP_Sparse_spiluk.cpp | 2 + unit_test/hip/Test_HIP_Sparse_spmv.cpp | 2 + unit_test/hip/Test_HIP_Sparse_sptrsv.cpp | 2 + unit_test/hip/Test_HIP_Sparse_trsv.cpp | 2 + unit_test/standalone/main.cpp | 3 + 100 files changed, 375 insertions(+), 8 deletions(-) create mode 100644 unit_test/hip/Test_HIP.hpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp create mode 100644 
unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp create mode 
100644 unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_abs.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_asum.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_axpby.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_axpy.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_dot.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_iamax.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_mult.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_nrm1.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_nrm2.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_nrminf.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_reciprocal.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_scal.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_sum.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_abs.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_axpby.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_axpy.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_dot.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_mult.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_scal.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_team_update.cpp create mode 100644 unit_test/hip/Test_HIP_Blas1_update.cpp create mode 100644 unit_test/hip/Test_HIP_Blas2_gemv.cpp create mode 100644 unit_test/hip/Test_HIP_Blas2_team_gemv.cpp create mode 100644 unit_test/hip/Test_HIP_Blas3_gemm.cpp create mode 
100644 unit_test/hip/Test_HIP_Blas3_trmm.cpp create mode 100644 unit_test/hip/Test_HIP_Blas3_trsm.cpp create mode 100644 unit_test/hip/Test_HIP_Blas_gesv.cpp create mode 100644 unit_test/hip/Test_HIP_Blas_trtri.cpp create mode 100644 unit_test/hip/Test_HIP_Common_ArithTraits.cpp create mode 100644 unit_test/hip/Test_HIP_Common_Sorting.cpp create mode 100644 unit_test/hip/Test_HIP_Common_Transpose.cpp create mode 100644 unit_test/hip/Test_HIP_Common_set_bit_count.cpp create mode 100644 unit_test/hip/Test_HIP_Graph_graph_color.cpp create mode 100644 unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp create mode 100644 unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_spadd.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_spgemm.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_spiluk.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_spmv.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_sptrsv.cpp create mode 100644 unit_test/hip/Test_HIP_Sparse_trsv.cpp diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 54e0006aa0..31d77bda94 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,6 +11,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) +SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL 
@Kokkos_ENABLE_SERIAL@) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b5611c20ca..c0a1e98ec6 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -34,6 +34,9 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index e9dde7bf66..c2f46bb8e3 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -10,6 +10,7 @@ MACRO(CHECK_KOKKOS_BACKEND BE) ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) +CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ffb5715e32..ede934023c 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -4,11 +4,13 @@ SET(EXEC_SPACES EXECSPACE_CUDA + EXECSPACE_HIP EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -16,11 +18,13 @@ SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE + MEMSPACE_HIPSPACE MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) 
+SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) @@ -57,6 +61,30 @@ IF(KOKKOS_ENABLE_CUDA) ENDIF() +IF(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_HIP + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_HIPSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. 
Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with HIP and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -109,6 +137,7 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) +SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 0ac85a560a..e6b7b825a7 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -213,7 +213,7 @@ void __do_trtri_serial_batched(options_t options, trtri_args_t trtri_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) template struct parallel_blas_trtri { trtri_args_t trtri_args_; @@ -227,11 +227,11 @@ struct parallel_blas_trtri { KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP template void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -254,9 +254,9 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is defined." 
<< std::endl; __trtri_output_csv_row(options, trtri_args, -1); -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP return; } diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index e2b62ef8eb..c1d42fe9c1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -292,7 +292,7 @@ void __do_trmm_serial_batched(options_t options, trmm_args_t trmm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) template struct parallel_blas_trmm { trmm_args_t trmm_args_; @@ -312,7 +312,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -335,7 +335,7 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is defined." 
<< std::endl; __trmm_output_csv_row(options, trmm_args, -1); #endif // !KOKKOS_ENABLE_CUDA return; diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index b8060d3cb1..fdea69a028 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -98,6 +98,67 @@ IF (KOKKOS_ENABLE_CUDA) ) ENDIF () +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/hip) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/hip) + + APPEND_GLOB(HIP_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Blas*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_hip + SOURCES + Test_Main.cpp + ${HIP_BLAS_SOURCES} + COMPONENTS blas + ) + + # APPEND_GLOB(HIP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Batched*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # batched_dla_hip + # SOURCES + # Test_Main.cpp + # ${HIP_BATCHED_DLA_SOURCES} + # COMPONENTS batched + # ) + + APPEND_GLOB(HIP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse*.cpp) + # HIP does not provide UVM, these two tests are hence removed permanently + # IF (NOT KOKKOS_ENABLE_CUDA_UVM) + LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_findRelOffset.cpp") + LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_trsv.cpp") + # ENDIF() + + KOKKOSKERNELS_ADD_UNIT_TEST( + sparse_hip + SOURCES + Test_Main.cpp + ${HIP_SPARSE_SOURCES} + COMPONENTS sparse + ) + + # APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # graph_hip + # SOURCES + # Test_Main.cpp + # ${HIP_GRAPH_SOURCES} + # COMPONENTS graph + # ) + + # #currently float 128 test is not working. So common tests are explicitly added.
+ # APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) + + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # common_hip + # SOURCES + # Test_Main.cpp + # ${HIP_COMMON_SOURCES} + # ) +ENDIF () + IF (KOKKOS_ENABLE_OPENMP) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/openmp) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/openmp) diff --git a/unit_test/hip/Test_HIP.hpp b/unit_test/hip/Test_HIP.hpp new file mode 100644 index 0000000000..95f979e9e6 --- /dev/null +++ b/unit_test/hip/Test_HIP.hpp @@ -0,0 +1,21 @@ +#include +#include +#include + +#if defined(KOKKOSKERNELS_TEST_ETI_ONLY) && !defined(KOKKOSKERNELS_ETI_ONLY) +#define KOKKOSKERNELS_ETI_ONLY +#endif + +class hip : public ::testing::Test { +protected: + static void SetUpTestCase() + { + } + + static void TearDownTestCase() + { + } +}; + +#define TestCategory hip +#define TestExecSpace Kokkos::Experimental::HIPSpace diff --git a/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp new file mode 100644 index 0000000000..1aceff3e62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialEigendecomposition.hpp" +#include "Test_Batched_SerialEigendecomposition_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp new file mode 100644 index 0000000000..280d12eb89 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp new file mode 100644 index 0000000000..0a3425962a --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp new file mode 100644 index 0000000000..1f405f4caa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp new file mode 100644 index 0000000000..98e69da8e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp new file mode 100644 index 0000000000..7d0f3bcdea --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp new file mode 100644 index 0000000000..c147695515 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp new file mode 100644 index 0000000000..ac11b50956 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" 
+#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp new file mode 100644 index 0000000000..b9bdbfb95a --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp new file mode 100644 index 0000000000..d7070fd0b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp new file mode 100644 index 0000000000..65674e04b9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..059877ff2d --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 0000000000..d09271a0e6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git 
a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp new file mode 100644 index 0000000000..e10cb11259 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp new file mode 100644 index 0000000000..95b412a3a8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp new file mode 100644 index 0000000000..b12b6fc203 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp new file mode 100644 index 0000000000..660293cfd2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp new file mode 100644 index 0000000000..f82c94e5e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp new file mode 100644 index 
0000000000..34c80371e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp new file mode 100644 index 0000000000..387aee1cc2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp new file mode 100644 index 0000000000..1f996ca4e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp new file mode 100644 index 0000000000..49b75ee6fa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include "Test_Batched_TeamGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp new file mode 100644 index 0000000000..52cacfa3c8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include "Test_Batched_TeamGemm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp new file mode 100644 index 0000000000..fed2bad261 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include 
"Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp new file mode 100644 index 0000000000..2d589ba4ef --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp new file mode 100644 index 0000000000..fa4ab4b3a1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp new file mode 100644 index 0000000000..9877053d34 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp new file mode 100644 index 0000000000..068f2faa3f --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp new file mode 100644 index 0000000000..0e09a25fb2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp 
b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp new file mode 100644 index 0000000000..8a2b9d4c44 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp new file mode 100644 index 0000000000..8262c3c2eb --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..b5474a3a24 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..469fce62a9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp new file mode 100644 index 0000000000..e48617a7b6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp new file mode 100644 index 0000000000..83ce8988d0 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp new file mode 100644 index 0000000000..ff75837fca --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp new file mode 100644 index 0000000000..5fba12911e --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp new file mode 100644 index 0000000000..e8ee97ffc7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR.hpp" +#include "Test_Batched_TeamVectorQR_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp new file mode 100644 index 0000000000..a55667f9d4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp new file mode 100644 index 0000000000..aaa8ad4f91 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp 
@@ -0,0 +1,6 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorSolveUTV.hpp" +#include "Test_Batched_TeamVectorSolveUTV_Real.hpp" + +#include "Test_Batched_TeamVectorSolveUTV2.hpp" +#include "Test_Batched_TeamVectorSolveUTV2_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp new file mode 100644 index 0000000000..f60705ae07 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorUTV.hpp" +#include "Test_Batched_TeamVectorUTV_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Blas1_abs.cpp b/unit_test/hip/Test_HIP_Blas1_abs.cpp new file mode 100644 index 0000000000..e175c8970e --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_abs.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_asum.cpp b/unit_test/hip/Test_HIP_Blas1_asum.cpp new file mode 100644 index 0000000000..c93f5f32fd --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_asum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_axpby.cpp new file mode 100644 index 0000000000..2814ecc583 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpby.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_axpy.cpp new file mode 100644 index 0000000000..8c7170d275 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpy.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_dot.cpp b/unit_test/hip/Test_HIP_Blas1_dot.cpp new file mode 100644 index 0000000000..2892b1e7e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_dot.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_iamax.cpp b/unit_test/hip/Test_HIP_Blas1_iamax.cpp new file mode 100644 index 0000000000..8fb34c05db --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_iamax.cpp 
@@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_mult.cpp b/unit_test/hip/Test_HIP_Blas1_mult.cpp new file mode 100644 index 0000000000..e124061c58 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_mult.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm1.cpp b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp new file mode 100644 index 0000000000..fb292630e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp new file mode 100644 index 0000000000..cf2f9e7237 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp new file mode 100644 index 0000000000..4d91e62f85 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrminf.cpp b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp new file mode 100644 index 0000000000..67a07902f0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp new file mode 100644 index 0000000000..892469cb7c --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_scal.cpp b/unit_test/hip/Test_HIP_Blas1_scal.cpp new file mode 100644 index 0000000000..11df7e89b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_scal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_sum.cpp b/unit_test/hip/Test_HIP_Blas1_sum.cpp new file mode 100644 index 0000000000..3be74c5d9a --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_sum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/hip/Test_HIP_Blas1_team_abs.cpp b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp new file mode 100644 index 0000000000..f253a72735 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp new file mode 100644 index 0000000000..be7f570a02 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp new file mode 100644 index 0000000000..d6e22c8891 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_dot.cpp b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp new file mode 100644 index 0000000000..bab30ef2b8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_mult.cpp b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp new file mode 100644 index 0000000000..25d51c964f --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp new file mode 100644 index 0000000000..4c067a39be --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_scal.cpp b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp new file mode 100644 index 0000000000..e685a03a04 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp @@ -0,0 +1,4 @@ +#include +#ifdef 
KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_update.cpp b/unit_test/hip/Test_HIP_Blas1_team_update.cpp new file mode 100644 index 0000000000..9ffc89a8eb --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_update.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas1_update.cpp b/unit_test/hip/Test_HIP_Blas1_update.cpp new file mode 100644 index 0000000000..f2388dbc9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_update.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_gemv.cpp new file mode 100644 index 0000000000..9df86cde64 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_gemv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp new file mode 100644 index 0000000000..7e5c60a527 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas3_gemm.cpp b/unit_test/hip/Test_HIP_Blas3_gemm.cpp new file mode 100644 index 0000000000..9fdd5004a4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_gemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trmm.cpp b/unit_test/hip/Test_HIP_Blas3_trmm.cpp new file mode 100644 index 0000000000..baaf52d8a5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trmm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trsm.cpp b/unit_test/hip/Test_HIP_Blas3_trsm.cpp new file mode 100644 index 0000000000..fa4ce5e728 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trsm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas_gesv.cpp b/unit_test/hip/Test_HIP_Blas_gesv.cpp new file mode 100644 index 0000000000..7d4a4bb0c4 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Blas_gesv.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas_trtri.cpp b/unit_test/hip/Test_HIP_Blas_trtri.cpp new file mode 100644 index 0000000000..e5b58ad470 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas_trtri.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_ArithTraits.cpp b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp new file mode 100644 index 0000000000..6482ba2dba --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Sorting.cpp b/unit_test/hip/Test_HIP_Common_Sorting.cpp new file mode 100644 index 0000000000..f01730e654 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Transpose.cpp b/unit_test/hip/Test_HIP_Common_Transpose.cpp new file mode 100644 index 0000000000..d81855df62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Transpose.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_set_bit_count.cpp b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp new file mode 100644 index 0000000000..bd2fd76423 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color.cpp b/unit_test/hip/Test_HIP_Graph_graph_color.cpp new file mode 100644 index 0000000000..01343e32c5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp new file mode 100644 index 0000000000..5ca8df65dc --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp 
b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp new file mode 100644 index 0000000000..b24e4bf4b4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp new file mode 100644 index 0000000000..782e8152a2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp @@ -0,0 +1,3 @@ +#include +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp new file mode 100644 index 0000000000..986460a37b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp new file mode 100644 index 0000000000..0d82182e9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp new file mode 100644 index 0000000000..b63fee6a94 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp new file mode 100644 index 0000000000..72bf132cf0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp new file mode 100644 index 0000000000..daf96e433d --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spadd.cpp b/unit_test/hip/Test_HIP_Sparse_spadd.cpp new file mode 100644 index 0000000000..98736daebf --- 
/dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spadd.cpp @@ -0,0 +1,3 @@ +#include +#include + diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp new file mode 100644 index 0000000000..2402f7596e --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp new file mode 100644 index 0000000000..6ab09e6743 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spiluk.cpp b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp new file mode 100644 index 0000000000..83f2a59192 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spmv.cpp b/unit_test/hip/Test_HIP_Sparse_spmv.cpp new file mode 100644 index 0000000000..18edf035e8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spmv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp new file mode 100644 index 0000000000..cb18ff3ed8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_trsv.cpp b/unit_test/hip/Test_HIP_Sparse_trsv.cpp new file mode 100644 index 0000000000..c371d334e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_trsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/standalone/main.cpp b/unit_test/standalone/main.cpp index 68d336805e..259a572c7a 100644 --- a/unit_test/standalone/main.cpp +++ b/unit_test/standalone/main.cpp @@ -3,6 +3,9 @@ #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA #include #endif +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP +#include +#endif #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #include #endif From 
8d6ce5f275dfc7dbf9e211614941f4bab394bc72 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 29 Oct 2020 12:35:37 -0600 Subject: [PATCH 082/106] HIP: fixing bug in Test_HIP.hpp and enabling more tests TestExecSpace was set to HIPSpace instead of HIP creating problems at build and execution time in unit_test. We can now enable common and graph unit-tests for build. common_hip is passing without failures. --- unit_test/CMakeLists.txt | 35 ++++++++++++++++------------------- unit_test/hip/Test_HIP.hpp | 2 +- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index fdea69a028..128bfa3030 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -137,26 +137,23 @@ IF (KOKKOS_ENABLE_HIP) COMPONENTS sparse ) - # APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) - - # KOKKOSKERNELS_ADD_UNIT_TEST( - # graph_hip - # SOURCES - # Test_Main.cpp - # ${HIP_GRAPH_SOURCES} - # COMPONENTS graph - # ) - - # #currently float 128 test is not working. So common tests are explicitly added. - # APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) + APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + graph_hip + SOURCES + Test_Main.cpp + ${HIP_GRAPH_SOURCES} + COMPONENTS graph + ) - - # KOKKOSKERNELS_ADD_UNIT_TEST( - # common_hip - # SOURCES - # Test_Main.cpp - # ${HIP_COMMON_SOURCES} - # ) + #currently float 128 test is not working. So common tests are explicitly added. 
+ APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + common_hip + SOURCES + Test_Main.cpp + ${HIP_COMMON_SOURCES} + ) ENDIF () IF (KOKKOS_ENABLE_OPENMP) diff --git a/unit_test/hip/Test_HIP.hpp b/unit_test/hip/Test_HIP.hpp index 95f979e9e6..cd4c49f16d 100644 --- a/unit_test/hip/Test_HIP.hpp +++ b/unit_test/hip/Test_HIP.hpp @@ -18,4 +18,4 @@ class hip : public ::testing::Test { }; #define TestCategory hip -#define TestExecSpace Kokkos::Experimental::HIPSpace +#define TestExecSpace Kokkos::Experimental::HIP From 6206bfe71471f7420e6fba6f1586a6508abfece2 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 29 Oct 2020 12:45:21 -0600 Subject: [PATCH 083/106] HIP: removing unecessary CUDA guard from HIP unit-tests --- unit_test/hip/Test_HIP_Blas1_team_abs.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_axpby.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_axpy.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_dot.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_mult.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_scal.cpp | 2 -- unit_test/hip/Test_HIP_Blas1_team_update.cpp | 2 -- unit_test/hip/Test_HIP_Blas2_team_gemv.cpp | 2 -- 9 files changed, 18 deletions(-) diff --git a/unit_test/hip/Test_HIP_Blas1_team_abs.cpp b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp index f253a72735..d59b6a61de 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_abs.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp index be7f570a02..0f3a2a5fec 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp 
b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp index d6e22c8891..823154d5af 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_dot.cpp b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp index bab30ef2b8..05987c8dd4 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_dot.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_mult.cpp b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp index 25d51c964f..ca54d031f1 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_mult.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp index 4c067a39be..9994255a31 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_scal.cpp b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp index e685a03a04..2f804c4dc5 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_scal.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas1_team_update.cpp b/unit_test/hip/Test_HIP_Blas1_team_update.cpp index 9ffc89a8eb..99cc8746ed 100644 --- a/unit_test/hip/Test_HIP_Blas1_team_update.cpp +++ b/unit_test/hip/Test_HIP_Blas1_team_update.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif diff --git a/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp index 7e5c60a527..da40621400 100644 --- a/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp 
+++ b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp @@ -1,4 +1,2 @@ #include -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #include -#endif From a1afb71c67d030ca3ce8648d20aa11fc9b39a76d Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 29 Oct 2020 23:31:54 -0600 Subject: [PATCH 084/106] HIP: fixing gemm unit-test by changing kernel launch bounds The min launch bound was set to 2 and is now set to 0. This new setting allows the BLAS unit-test to successfully run to completion. --- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index fc5ba4dfa6..124909c159 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -517,7 +517,17 @@ struct GEMMImpl { ViewTypeBScratch::shmem_size() + ViewTypeCScratch::shmem_size(); +#if defined(KOKKOS_ENABLE_HIP) + // Note lbv, 10/29/20: The LaunchBounds<384,2> leads + // to an error with HIP as the heuristics on that platform + // yield an optimal_num_blocks=0 which means no resources + // are allocated... Switching to LaunchBounds<384,0> fixes + // that problem but I'm not sure if that is a good perf + // parameter or why it is set to 2 for Cuda? + Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#else Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#endif Kokkos::parallel_for(impl_gemm_label::label,policy.set_scratch_size(scratch_level,Kokkos::PerTeam(scratch_memory_size)),*this); } From f06aaa559bb4e5ba8a5121c71ac57cf54157ff3c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 29 Oct 2020 23:58:01 -0600 Subject: [PATCH 085/106] HIP: modifying unit_test/CMakeList and cm_test_all_sandia With the new modifications the spot-check can be run on caraway04 where BLAS and Common unit-tests will be built and tested as well as the wiki tests.
--- scripts/cm_test_all_sandia | 2 +- unit_test/CMakeLists.txt | 46 +++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index e461e1ce0c..c5f7148125 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -640,7 +640,7 @@ elif [ "$MACHINE" = "mayer" ]; then SPACK_HOST_ARCH="+armv8_tx2" elif [ "$MACHINE" = "caraway" ]; then SKIP_HWLOC=True - BUILD_ONLY=True + # BUILD_ONLY=True # report_and_log_test_result: only testing compilation of code for now, # output description and success based only on build succes; build time output (no run-time) diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index 128bfa3030..534782e590 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -120,31 +120,31 @@ IF (KOKKOS_ENABLE_HIP) # COMPONENTS batched # ) - APPEND_GLOB(HIP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse*.cpp) - # HIP does not provide UVM, these two tests are henced remove permanently - # IF (NOT KOKKOS_ENABLE_CUDA_UVM) - LIST(REMOVE_ITEM HIP_SPARSE_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_findRelOffset.cpp") - LIST(REMOVE_ITEM HIP_SPARSE_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_trsv.cpp") - # ENDIF() + # APPEND_GLOB(HIP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse*.cpp) + # # HIP does not provide UVM, these two tests are henced remove permanently + # # IF (NOT KOKKOS_ENABLE_CUDA_UVM) + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_findRelOffset.cpp") + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_trsv.cpp") + # # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST( - sparse_hip - SOURCES - Test_Main.cpp - ${HIP_SPARSE_SOURCES} - COMPONENTS sparse - ) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # sparse_hip + # SOURCES + # Test_Main.cpp + # ${HIP_SPARSE_SOURCES} + # COMPONENTS 
sparse + # ) - APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST( - graph_hip - SOURCES - Test_Main.cpp - ${HIP_GRAPH_SOURCES} - COMPONENTS graph - ) + # APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # graph_hip + # SOURCES + # Test_Main.cpp + # ${HIP_GRAPH_SOURCES} + # COMPONENTS graph + # ) #currently float 128 test is not working. So common tests are explicitly added. APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) From d7942ee431716f96d6de6f6d18e85c3cd36abdc9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 3 Nov 2020 11:48:31 -0800 Subject: [PATCH 086/106] Replace HAVE_KOKKOS_HALFMATH with KOKKOS_HALF_T_IS_FLOAT --- CMakeLists.txt | 7 ------- cmake/HalfPrecisionSupport.cmake | 6 ------ cmake/KokkosKernels_config.h.in | 3 --- src/KokkosKernels_Half.hpp | 4 ++-- src/Kokkos_ArithTraits.hpp | 8 ++++---- test_common/KokkosKernels_TestUtils.hpp | 4 ++-- test_common/Test_Common_ArithTraits.hpp | 12 ++++++------ unit_test/batched/Test_Batched_SerialGemm_Real.hpp | 4 ++-- unit_test/batched/Test_Batched_TeamGemm_Real.hpp | 4 ++-- .../batched/Test_Batched_TeamVectorGemm_Real.hpp | 4 ++-- 10 files changed, 20 insertions(+), 36 deletions(-) delete mode 100644 cmake/HalfPrecisionSupport.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 091b9b5541..7353c1f2ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,10 +170,6 @@ ELSE() # ================================================================== INCLUDE(cmake/kokkoskernels_eti_layouts.cmake) # ================================================================== - # Determine half precision support - # ================================================================== - INCLUDE(cmake/HalfPrecisionSupport.cmake) - # ================================================================== # Enable Third Party Libraries # 
================================================================== INCLUDE(cmake/kokkoskernels_tpls.cmake) @@ -191,9 +187,6 @@ ELSE() MESSAGE(" Offsets: ${OFFSET_LIST}") MESSAGE(" Layouts: ${LAYOUT_LIST}") MESSAGE("") - MESSAGE("KokkosKernels Half Precision Types") - MESSAGE(" HAVE_KOKKOS_HALFMATH: ${HAVE_KOKKOS_HALFMATH}") - MESSAGE("") MESSAGE("KokkosKernels TPLs") FOREACH(TPL ${KOKKOSKERNELS_TPL_LIST}) PAD_STRING("${TPL}:" TPL_PADDED 12) diff --git a/cmake/HalfPrecisionSupport.cmake b/cmake/HalfPrecisionSupport.cmake deleted file mode 100644 index 0e2b0fc0ce..0000000000 --- a/cmake/HalfPrecisionSupport.cmake +++ /dev/null @@ -1,6 +0,0 @@ -# Check whether Kokkos has half precision headers -IF(EXISTS ${Kokkos_DIR}/../../../include/Kokkos_Half.hpp) - SET(HAVE_KOKKOS_HALFMATH TRUE) -ELSE() - SET(HAVE_KOKKOS_HALFMATH FALSE) -ENDIF() diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index a79c30427b..b5611c20ca 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -16,9 +16,6 @@ /* Define this macro if the quadmath TPL is enabled */ #cmakedefine HAVE_KOKKOSKERNELS_QUADMATH -/* Define this macro if half precision is supported by kokkos */ -#cmakedefine HAVE_KOKKOS_HALFMATH - /* Define this macro if the MKL TPL is enabled. 
This is different than just linking against the MKL to get the BLAS and LAPACK; it requires (a) header file(s) as well, and may use functions other diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp index 5db55cf514..b8108a43db 100644 --- a/src/KokkosKernels_Half.hpp +++ b/src/KokkosKernels_Half.hpp @@ -45,9 +45,9 @@ #ifndef KOKKOSKERNELS_HALF_HPP #define KOKKOSKERNELS_HALF_HPP -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) #include "Kokkos_Half.hpp" -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT namespace KokkosKernels { namespace Experimental { diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3067ab954f..7dee4e24a3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -667,9 +667,9 @@ class ArithTraits { }; // Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits if an IMPL type exists -#if defined(HAVE_KOKKOS_HALFMATH) &&\ - defined(KOKKOS_ENABLE_CUDA_HALF) // TODO: Check for other backends +// ArithTraits if half_t is a backend specialization +#if defined(KOKKOS_HALF_T_IS_FLOAT) &&\ + !KOKKOS_HALF_T_IS_FLOAT template <> class ArithTraits { public: @@ -838,7 +838,7 @@ class ArithTraits { return KOKKOSKERNELS_IMPL_FP16_MAX; } }; -#endif // HAVE_KOKKOS_HALFMATH && KOKKOS_ENABLE_CUDA +#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template<> class ArithTraits { diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 23b8030342..bf86768d16 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -106,9 +106,9 @@ namespace Test { } } - #if defined(HAVE_KOKKOS_HALFMATH) + #if defined(KOKKOS_HALF_T_IS_FLOAT) using halfScalarType = Kokkos::Experimental::half_t; - #endif // HAVE_KOKKOS_HALFMATH + #endif // KOKKOS_HALF_T_IS_FLOAT template struct SharedVanillaGEMM { diff --git a/test_common/Test_Common_ArithTraits.hpp
b/test_common/Test_Common_ArithTraits.hpp index d447a1fa92..bba54ff6f0 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -1037,7 +1037,7 @@ class ArithTraitsTesterComplexBase : // Apparently, std::numeric_limits::is_signed is 1 // only for real numbers. -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) if (std::is_same::value) { if (AT::is_signed != 0x1) FAILURE(); @@ -1052,7 +1052,7 @@ class ArithTraitsTesterComplexBase : FAILURE(); } } -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT if (AT::is_complex) { FAILURE(); @@ -1561,13 +1561,13 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) // Built-in real floating-point types // -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) TRACE(); success = success && curSuccess; curSuccess = testArithTraitsOnDevice( out, verbose); -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); @@ -1645,12 +1645,12 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) // Kokkos' complex floating-point types // -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) success = success && curSuccess; TRACE(); curSuccess = testArithTraitsOnHost( out, verbose); -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); //success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); diff --git a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp index 5ef1df82bf..087c94f997 100644 --- 
a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_serial_gemm_t_t_half_half ) { test_batched_gemm_half(); test_batched_gemm_half(); } -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_float_float ) { diff --git a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp index e8fe47b202..327b1bcc21 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_half_half ) { test_batched_teamgemm_half(); test_batched_teamgemm_half(); } -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_float_float ) { diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp index 747c483b97..de7748bd65 100644 --- a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp @@ -1,4 +1,4 @@ -#if defined(HAVE_KOKKOS_HALFMATH) +#if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half ) { typedef ::Test::ParamTag param_tag_type; @@ -23,7 +23,7 @@ TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_half_half ) { 
//test_batched_teamvectorgemm_half(); test_batched_teamvectorgemm_half(); } -#endif // HAVE_KOKKOS_HALFMATH +#endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float ) { From 4613eeaf95e71b1711e7a1952d974c4a64b556bc Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 11 Nov 2020 08:41:57 -0700 Subject: [PATCH 087/106] Include Kokkos_Core.hpp instead of Kokkos_Half.hpp --- src/KokkosKernels_Half.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp index b8108a43db..5ecb959f7e 100644 --- a/src/KokkosKernels_Half.hpp +++ b/src/KokkosKernels_Half.hpp @@ -45,9 +45,7 @@ #ifndef KOKKOSKERNELS_HALF_HPP #define KOKKOSKERNELS_HALF_HPP -#if defined(KOKKOS_HALF_T_IS_FLOAT) -#include "Kokkos_Half.hpp" -#endif // KOKKOS_HALF_T_IS_FLOAT +#include "Kokkos_Core.hpp" namespace KokkosKernels { namespace Experimental { From c16e171f9fb92c741fe13e0552720280553badaf Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 13 Nov 2020 15:58:12 -0800 Subject: [PATCH 088/106] Add half_t explicit conversions --- src/Kokkos_ArithTraits.hpp | 66 +++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 7dee4e24a3..bdf86e1598 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -683,7 +683,7 @@ class ArithTraits { static const bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return HUGE_VALF; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half(HUGE_VALF); } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifndef __CUDA_ARCH__ @@ -698,78 +698,78 @@ class ArithTraits { return isnan(Kokkos::Experimental::cast_from_half(x)); } static KOKKOS_FORCEINLINE_FUNCTION 
mag_type abs (const val_type x) { - return fabs(Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(fabs(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type zero () { - return 0.0F; + return Kokkos::Experimental::cast_to_half(0.0F); } static KOKKOS_FORCEINLINE_FUNCTION val_type one () { - return 1.0F; + return Kokkos::Experimental::cast_to_half(1.0F); } static KOKKOS_FORCEINLINE_FUNCTION val_type min () { - return -KOKKOSKERNELS_IMPL_FP16_MAX; + return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); } static KOKKOS_FORCEINLINE_FUNCTION val_type max () { - return KOKKOSKERNELS_IMPL_FP16_MAX; + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION mag_type imag (const val_type) { - return 0.0F; + return Kokkos::Experimental::cast_to_half(0.0F); } static KOKKOS_FORCEINLINE_FUNCTION val_type conj (const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const val_type y) { - return ::pow(Kokkos::Experimental::cast_from_half(x), - Kokkos::Experimental::cast_from_half(y)); + return Kokkos::Experimental::cast_to_half(::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { - return ::sqrt (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::sqrt (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return ::cbrt (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::cbrt (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - return ::exp (Kokkos::Experimental::cast_from_half(x)); + return 
Kokkos::Experimental::cast_to_half(::exp (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - return ::log (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::log (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - return ::log10 (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::log10 (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - return ::sin (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::sin (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - return ::cos (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::cos (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - return ::tan (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::tan (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - return ::sinh (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::sinh (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - return ::cosh (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::cosh (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - return ::tanh (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::tanh (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - return ::asin 
(Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::asin (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - return ::acos (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::acos (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return ::atan (Kokkos::Experimental::cast_from_half(x)); + return Kokkos::Experimental::cast_to_half(::atan (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); - return KOKKOSKERNELS_IMPL_FP16_EPSILON; + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); } // Backwards compatibility with Teuchos::ScalarTraits. typedef mag_type magnitudeType; @@ -785,29 +785,29 @@ class ArithTraits { return isNan (x) || isInf (x); } static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude (const val_type x) { - return abs (Kokkos::Experimental::cast_from_half(x)); + return abs(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate (const val_type x) { - return conj (Kokkos::Experimental::cast_from_half(x)); + return conj(x); } static std::string name () { return "half"; } static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - return sqrt (Kokkos::Experimental::cast_from_half(x)); + return sqrt(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { #ifdef __CUDA_ARCH__ - return CUDART_NAN_F; + return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); #else - return std::numeric_limits::quiet_NaN(); + return Kokkos::Experimental::cast_to_half(std::numeric_limits::quiet_NaN()); #endif // __CUDA_ARCH__ } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps () { return epsilon (); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin () { - return KOKKOSKERNELS_IMPL_FP16_MIN; + return 
Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } static KOKKOS_FORCEINLINE_FUNCTION int base () { return KOKKOSKERNELS_IMPL_FP16_RADIX; @@ -823,19 +823,19 @@ class ArithTraits { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd () { - return 1.0; + return Kokkos::Experimental::cast_to_half(1.0); } static KOKKOS_FORCEINLINE_FUNCTION int emin () { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin () { - return KOKKOSKERNELS_IMPL_FP16_MIN; + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } static KOKKOS_FORCEINLINE_FUNCTION int emax () { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax () { - return KOKKOSKERNELS_IMPL_FP16_MAX; + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } }; #endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF From 8ca31c45d03162142baae90b045b737d0ad3c97c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 13 Nov 2020 19:20:20 -0800 Subject: [PATCH 089/106] Add half_t explicit conversions --- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 2 +- .../KokkosBatched_Gemm_Team_Internal.hpp | 2 +- ...okkosBatched_InnerGemmFixC_Serial_Impl.hpp | 80 +++++++++---------- unit_test/batched/Test_Batched_SerialGemm.hpp | 3 +- unit_test/batched/Test_Batched_TeamGemm.hpp | 2 +- .../batched/Test_Batched_TeamVectorGemm.hpp | 2 +- 6 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp index 3b53e9a577..971389902e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -70,7 +70,7 @@ namespace KokkosBatched { const ValueType *__restrict__ pB = B+j*bs1; - ValueType c = 0; + ValueType c = ValueType(0); for (int p=0;p; /// randomized input testing views - ScalarType alpha = 
1.5, beta = 3.0; + ScalarType alpha = ScalarType(1.5); + ScalarType beta = ScalarType(3.0); ViewType a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), diff --git a/unit_test/batched/Test_Batched_TeamGemm.hpp b/unit_test/batched/Test_Batched_TeamGemm.hpp index fb8f235f12..10f11d686d 100644 --- a/unit_test/batched/Test_Batched_TeamGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm.hpp @@ -147,7 +147,7 @@ namespace Test { using ats = Kokkos::Details::ArithTraits; /// randomized input testing views - ScalarType alpha = 1.5, beta = 3.0; + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); ViewType a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp index 18db6d8bed..09b2dfa89c 100644 --- a/unit_test/batched/Test_Batched_TeamVectorGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp @@ -142,7 +142,7 @@ namespace Test { using ats = Kokkos::Details::ArithTraits; /// randomized input testing views - ScalarType alpha = 1.5, beta = 3.0; + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); ViewType a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), From 6c661232ee5b0d2f588d31f7ec1e9b52731d7e3a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 Nov 2020 00:26:40 -0600 Subject: [PATCH 090/106] New issue with HIP and Kokkos_ArithTraits isinf and isnan are not defined for long double with HIP. The input parameter is cast to avoid an ambiguous function call. The function is also defined as __device__, so it needs to be decorated with KOKKOS_FORCEINLINE_FUNCTION. --- src/Kokkos_ArithTraits.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 7dee4e24a3..ba9cf585a6 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ 
-1385,17 +1385,25 @@ class ArithTraits { static constexpr bool has_infinity = true; static KOKKOS_FORCEINLINE_FUNCTION long double infinity() { return HUGE_VALL; } - static bool isInf (const val_type& x) { + static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif +#if defined(KOKKOS_ENABLE_HIP) + return isinf (static_cast(x)); +#else return isinf (x); +#endif } - static bool isNan (const val_type& x) { + static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif +#if defined(KOKKOS_ENABLE_HIP) + return isnan (static_cast(x)); +#else return isnan (x); +#endif } static mag_type abs (const val_type& x) { return ::fabsl (x); From b3e45880f8bd6996b47c68b4b0b0f1581c802f07 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 17 Nov 2020 09:45:31 -0600 Subject: [PATCH 091/106] HIP: removing long double ArithTraits from device build As neither Nvidia nor AMD supports long double properly, we might as well just remove the instantiation from the code on these platforms altogether. --- src/Kokkos_ArithTraits.hpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index ba9cf585a6..e4008ecedf 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -1370,6 +1370,7 @@ class ArithTraits { // CUDA does not support long double in device functions, so none of // the class methods in this specialization are marked as device // functions. 
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> class ArithTraits { public: @@ -1383,27 +1384,15 @@ class ArithTraits { static const bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION long double infinity() { return HUGE_VALL; } + static long double infinity() { return HUGE_VALL; } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + static bool isInf (const val_type& x) { using std::isinf; - #endif -#if defined(KOKKOS_ENABLE_HIP) - return isinf (static_cast(x)); -#else return isinf (x); -#endif } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + static bool isNan (const val_type& x) { using std::isnan; - #endif -#if defined(KOKKOS_ENABLE_HIP) - return isnan (static_cast(x)); -#else return isnan (x); -#endif } static mag_type abs (const val_type& x) { return ::fabsl (x); @@ -1537,7 +1526,8 @@ class ArithTraits { static mag_type rmax () { return LDBL_MAX; } -}; +}; // long double specialization +#endif // KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST #ifdef HAVE_KOKKOSKERNELS_QUADMATH From 1840bef2c8a6036f82f6f142167d5d53ed2f7b35 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 17 Nov 2020 10:02:35 -0800 Subject: [PATCH 092/106] Device: guarding host only instantiation properly An instantiation of the inner product on long double is not intended to be compiled on device so it is now guarded to avoid issues. 
--- src/Kokkos_ArithTraits.hpp | 6 +++--- src/Kokkos_InnerProductSpaceTraits.hpp | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index e4008ecedf..4c94a03912 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -1367,9 +1367,9 @@ class ArithTraits { }; -// CUDA does not support long double in device functions, so none of -// the class methods in this specialization are marked as device -// functions. +// CUDA and HIP do not support long double in device functions, +// so none of the class methods in this specialization are marked +// as device functions. #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> class ArithTraits { diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/Kokkos_InnerProductSpaceTraits.hpp index 65f3feaf8e..82cab6cc3b 100644 --- a/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/src/Kokkos_InnerProductSpaceTraits.hpp @@ -170,6 +170,7 @@ class InnerProductSpaceTraits { /// \brief Partial specialization for long double. /// /// \warning CUDA does not support long double in device functions. +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> struct InnerProductSpaceTraits { @@ -184,6 +185,7 @@ struct InnerProductSpaceTraits return x * y; } }; +#endif //! Partial specialization for Kokkos::complex. 
template From 3319d432a07476d2226b9fbf2bcfcdbeb20d5a46 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Sep 2020 16:54:22 -0600 Subject: [PATCH 093/106] perf_test/blas/blas3: Add GEMM support --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 64 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1015 +++++++++++++++++ .../blas/blas3/KokkosBlas3_perf_test.cpp | 251 ++-- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 18 +- perf_test/blas/blas3/README.md | 8 +- src/blas/impl/KokkosBlas3_trmm_spec.hpp | 2 +- 6 files changed, 1264 insertions(+), 94 deletions(-) create mode 100644 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 8374c4502d..4952a8e606 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -56,8 +56,11 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 1024 #define DEFAULT_OUT &std::cout -#define DEFAULT_BLAS_ROUTINES "trmm," +#define DEFAULT_BLAS_ROUTINES "trmm,gemm," +#define DEFAULT_TEAM_SIZE 1 +#define DEFAULT_VECTOR_LEN 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -66,29 +69,40 @@ struct perf_test_trmm_args { }; typedef struct perf_test_trmm_args pt_trmm_args_t; +struct perf_test_gemm_args { + std::string gemm_args; //[N,T,C][N,T,C] for transA and transB + default_scalar alpha; + default_scalar beta; +}; +typedef struct perf_test_gemm_args pt_gemm_args_t; // ADD MORE BLAS3 ROUTINE ARG STRUCTS HERE. 
struct blas_args { pt_trmm_args_t trmm; + pt_gemm_args_t gemm; // ADD MORE BLAS3 ROUTINES HERE + int team_size; + int vector_len; + // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; typedef enum BLAS_ROUTINES { TRMM, + GEMM, // ADD MORE BLAS3 ROUTINES HERE BLAS_ROUTINES_N } blas_routines_e; static std::string blas_routines_e_str[BLAS_ROUTINES_N] = { - "trmm" + "trmm", "gemm" // ADD MORE BLAS3 ROUTINES HERE }; /************************ perf test type definitions ************************/ /** - * @var SERIAL: Run the blas routine iterativley, within a for-loop - * @var PARALLEL: Run the blas routine iterativley, within a + * @var SERIAL: Run the blas routine iteratively, within a for-loop + * @var PARALLEL: Run the blas routine iteratively, within a * Kokkos::parallel_for-loop */ typedef enum LOOP { @@ -98,27 +112,47 @@ typedef enum LOOP { LOOP_N } loop_e; -static std::string loop_e_str[LOOP_N] = {"SERIAL", "PARALLEL"}; +static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** - * @var BLAS: Run the blas routine through the KokkosBlas namespace. - * @var BATCHED: Run the blas routine through the KokkosBatched namespace. + * @var BLAS: Run the blas routine through the + * KokkosBlas namespace. + * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace. + * @var EXPERIMENT: Run the blas routine as a custom + * experiment. 
*/ typedef enum TEST { BLAS, - BATCHED, + BATCHED_SERIAL, + BATCHED_SERIAL_BLOCKED, + BATCHED_TEAM, + BATCHED_TEAM_BLOCKED, + BATCHED_TEAM_VECTOR, + BATCHED_TEAM_VECTOR_BLOCKED, // ADD MORE TEST TYPES HERE + EXPERIMENT, TEST_N } test_e; -static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; +static std::string test_e_str[TEST_N]{ + "blas", "batched_serial", "batched_serial_blocked", "batched_team", + "batched_team_blocked", "batched_team_vector", + "batched_team_vector_blocked", + // ADD MORE TEST TYPES HERE + "experiment"}; /** + * @var k: Number of 2D matrices. * @var m: Number of rows. * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; @@ -157,4 +191,14 @@ struct perf_test_options { std::string blas_routines; }; typedef struct perf_test_options options_t; + +/*************************** Print macros **************************/ +//#define PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG +#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); +#else +#define STATUS +#endif // PERF_TEST_DEBUG +#define FATAL_ERROR(msg) \ + printf("FATAL_ERROR: %s:%s:%d %s\n", __FILE__, __func__, __LINE__, (msg)); #endif // KOKKOSBLAS3_COMMON_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp new file mode 100644 index 0000000000..159d73c4f6 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -0,0 +1,1015 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSBLAS3_GEMM_PERF_TEST_H_ +#define KOKKOSBLAS3_GEMM_PERF_TEST_H_ + +//#include +#include "KokkosBlas3_common.hpp" + +#include + +#include + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +//#include "KokkosBatched_Gemm_Team_Impl.hpp" +//#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" +#include "KokkosBatched_Util.hpp" + +//#define GEMM_PERF_TEST_DEBUG + +// Forward declarations +void do_gemm_serial_blas(options_t options); +void do_gemm_serial_batched(options_t options); +void do_gemm_serial_batched_blocked(options_t options); +// void do_gemm_experiment(options_t options); + +// void do_gemm_serial_blas_parallel(options_t options); +// Not valid! The KokkosBlas::gemm function may take the entire device per +// invocation! +void do_gemm_serial_batched_parallel(options_t options); +void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_team_batched_parallel(options_t options); +void do_gemm_team_batched_blocked_parallel(options_t options); +void do_gemm_team_vector_batched_parallel(options_t options); +void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_experiment_parallel(options_t options); + +struct SerialTag {}; +struct TeamTag {}; +struct TeamVectorTag {}; +struct LayoutLeftTag {}; +struct LayoutRightTag {}; +struct SimdCpuTag {}; + +// gemm invoke table +void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { + { + do_gemm_serial_blas, // BLAS + do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial + NULL, NULL, // Team + NULL, NULL, // TeamVector + NULL // Serial Experiment + }, + { + NULL, // BLAS + do_gemm_serial_batched_parallel, + do_gemm_serial_batched_blocked_parallel, // Serial + do_gemm_team_batched_parallel, + do_gemm_team_batched_blocked_parallel, // Team + 
do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_experiment_parallel // Parallel Experiment + }}; + +/*************************** Test types and defaults **************************/ +#define DEFAULT_GEMM_ARGS "NN" +#define DEFAULT_GEMM_ALPHA 1.0 + +using view_type_3d = + Kokkos::View; + +struct batched_params { + int team_size; + int vector_len; +}; +typedef struct batched_params batched_params_t; + +struct gemm_args { + char transA, transB; + default_scalar alpha; + default_scalar beta; + view_type_3d A, B, C; + batched_params_t bp; +}; +typedef struct gemm_args gemm_args_t; + +static std::string gemm_csv_header_str = + "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "dims,C_dims,warm_up_n," + "iter,total_time(s),average_time(s)"; + +/*************************** Internal helper fns **************************/ +static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, + double time_in_seconds, + const char *experiment_name = nullptr) { + std::string algo_name = test_e_str[options.test]; + if (experiment_name) algo_name = std::string(experiment_name); + + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," + << options.blas_args.gemm.alpha << "," + << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size + << "," << gemm_args.bp.vector_len << "," + << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) + << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) + << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) + << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) + << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) + << "," << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n + << std::endl; +} + +static void __print_gemm_perf_test_options(options_t options) { +#ifdef PERF_TEST_DEBUG + printf("options.test = %s\n", test_e_str[options.test].c_str()); + 
printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); + printf("options.start = %dx%d,%dx%d\n", options.start.a.m, + options.start.a.n, options.start.b.m, options.start.b.n); + printf("options.stop = %dx%d,%dx%d\n", options.stop.a.m, + options.stop.a.n, options.stop.b.m, options.stop.b.n); + printf("options.step = %d\n", options.step); + printf("options.warm_up_n = %d\n", options.warm_up_n); + printf("options.n = %d\n", options.n); + printf("options.blas_args.gemm.gemm_args = %s\n", + options.blas_args.gemm.gemm_args.c_str()); + printf("options.out_file = %s\n", options.out_file.c_str()); + if (std::is_same::value) + printf("options.alpha = %lf\n", options.blas_args.gemm.alpha); + else if (std::is_same::value) + printf("options.alpha = %f\n", options.blas_args.gemm.alpha); +#endif // PERF_TEST_DEBUG + return; +} + +/*************************** Internal templated fns **************************/ +template +void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + STATUS; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, + A, B, _gemm_args.beta, C); + } + }; + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
<< std::endl; +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +void __do_gemm_serial_batched_template(options_t options, + gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } + }; + + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; +#endif // !KOKKOS_ENABLE_CUDA +} + +template +void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'T' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // 
__do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } + return; +} + +#if !defined(KOKKOS_ENABLE_CUDA) +template +struct parallel_blas_gemm { + gemm_args_t gemm_args_; + + parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, + svA, svB, gemm_args_.beta, svC); + } +}; +#endif // !KOKKOS_ENABLE_CUDA + +template +void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + Kokkos::Timer timer; + using execution_space = typename device_type::execution_space; + using functor_type = parallel_blas_gemm; + functor_type parallel_blas_gemm_functor(gemm_args); + + STATUS; + + Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", + Kokkos::RangePolicy(0, warm_up_n), + parallel_blas_gemm_functor); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for("parallelBlasTimedLoopGemm", + Kokkos::RangePolicy(0, n), + parallel_blas_gemm_functor); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
<< std::endl; + __gemm_output_csv_row(options, gemm_args, -1); +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +struct parallel_batched_gemm { + gemm_args_t gemm_args_; + + parallel_batched_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } +}; + +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; 
+ + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + +template +void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'T' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB 
value"); + } + + return; +} + +template +struct parallel_batched_gemm_experiment1 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment1(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + + void operator()(const SerialTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } +}; + +/** + * 1. parallel_for(rangePolicy(N)): serialGemm + * + */ +template +void __do_gemm_parallel_experiment1(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + using functor_type = + parallel_batched_gemm_experiment1; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment1_functor(gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment1"); + return; +} + +template +struct parallel_batched_gemm_experiment2_3_4 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment2_3_4(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + // Experiment 2 + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, 
Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses TeamThreadRange over C-rows + // ThreadVectorRange over C-cols + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } + + // Experiment 3 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutLeftTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_cols = svC.extent(1); + // In a given team, for each vector lane, compute zero or more output + // columns of C depending on the index range + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_cols), [&](const int &lane_idx) { + auto svB_col = Kokkos::subview(svB, Kokkos::ALL(), lane_idx); + auto svC_col = Kokkos::subview(svC, Kokkos::ALL(), lane_idx); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, svA, + svB_col, + gemm_args_.beta, + svC_col); + }); + } + + // TODO: Why is this faster than the LayoutLeftTag operator above for both + // LayoutLeft and LayoutRight? 
Experiment 4 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutRightTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_rows = svC.extent(0); + // In a given team, for each vector lane, compute zero or more output rows + // of C depending on the index range + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_rows), [&](const int &lane_idx) { + auto svA_row = Kokkos::subview(svA, lane_idx, Kokkos::ALL()); + auto svC_row = Kokkos::subview(svC, lane_idx, Kokkos::ALL()); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, + svA_row, svB, + gemm_args_.beta, + svC_row); + }); + } +}; + +/** + * 2. 
case a) + * parallel_for(teamPolicy): TeamVectorGemm + * + */ +template +void __do_gemm_parallel_experiment2(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment2_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment2"); + return; +} + +/** + * 3. case b) + * parallel_for(teamPolicy): + * parallel_for(TeamThreadRange): + * VectorGemm + * + * VectorGemm has not been implemented! + * I think this experiment can be removed. TeamGemm calls TeamThreadRange + * internally! TeamVectorGemm calls both TeamThreadRange and ThreadVectorRange + * internnally! 
+ */ +template +void __do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment4_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment3"); + return; +} + +/** + * 4. 
case c) + * parallel_for(teamPolicy): + * parallel_for(ThreadVectorRange) + * TeamGemm + */ +template +void __do_gemm_parallel_experiment4(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment4_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment4"); + return; +} + +template +class parallel_batched_gemm_experiment5 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment5(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SimdCpuTag &, const int &i) const { + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args.alpha, svA, svB, 
gemm_args.beta, svC); + } +}; + +/** + * 5. + * parallel_for(RangePolicy(N/vl+(N%vl>0)>): + * serialGemm + * + * Not portable to GPU + */ +template +void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + + // Construct the SimdType + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + using simd_type = KokkosBatched::Vector, vl>; + using simd_view_type = + Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment5; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Increases each array size by sizeof(scalar_type) * (vl-1) bytes! + simd_view_type A("A", simd_batch_size, gemm_args.A.extent(0), + gemm_args.A.extent(1)); + simd_view_type B("B", simd_batch_size, gemm_args.B.extent(0), + gemm_args.B.extent(1)); + simd_view_type C("C", simd_batch_size, gemm_args.C.extent(0), + gemm_args.C.extent(1)); + + // uint64_t seed = Kokkos::Impl::clock_tic(); + // Kokkos::Random_XorShift64_Pool rand_pool(seed); + // Kokkos::fill_random(A, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, simd_type>::max()); + // execution_space::fence(); + + functor_type experiment5_functor(A, B, C, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment5Gemm", + policy_type(0, simd_batch_size), experiment5_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment5Gemm", + policy_type(0, simd_batch_size), 
experiment5_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment5"); +#else + std::cerr + << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_HIP is defined." + << std::endl; +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP + return; +} + +/*************************** Internal setup fns **************************/ +template +gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { + using execution_space = typename device_type::execution_space; + + gemm_args_t gemm_args; + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + STATUS; + + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); + gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); + gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + gemm_args.alpha = options.blas_args.gemm.alpha; + gemm_args.beta = options.blas_args.gemm.beta; + gemm_args.bp.team_size = options.blas_args.team_size; + gemm_args.bp.vector_len = options.blas_args.vector_len; + + Kokkos::fill_random(gemm_args.A, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.B, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.C, rand_pool, + Kokkos::rand, + scalar_type>::max()); + + return gemm_args; +} + +/*************************** Internal run helper fns **************************/ +void __do_loop_and_invoke(options_t options, + void (*fn)(options_t, gemm_args_t)) { + matrix_dims_t cur_dims; + gemm_args_t gemm_args; + STATUS; + + __print_gemm_perf_test_options(options); + std::cout << "SCALAR:" << typeid(default_scalar).name() + << ", LAYOUT:" << typeid(default_layout).name() + << ", DEVICE:" << typeid(default_device).name() << std::endl; + + options.out[0] << gemm_csv_header_str << 
std::endl; + + for (cur_dims = options.start; + cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && + cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && + cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; + cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, + cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, + cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + gemm_args = __do_setup(options, cur_dims); + fn(options, gemm_args); + } + return; +} + +/*************************** External fns **************************/ +void do_gemm_serial_blas(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_blas); + return; +} + +void do_gemm_serial_batched(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_blocked(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_vector_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +/* void do_gemm_team_vector_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; +} */ + +void do_gemm_experiment_parallel(options_t 
options) { + STATUS; + using TransAType = Trans::NoTranspose; + using TransBType = Trans::NoTranspose; + using BlockingType = Algo::Gemm::Unblocked; + + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment1); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment2); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment3); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment4); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment5); +} + +#endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 122f0b3817..b493c244d8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -43,6 +43,7 @@ */ #include "KokkosBlas3_common.hpp" #include "KokkosBlas3_trmm_perf_test.hpp" +#include "KokkosBlas3_gemm_perf_test.hpp" #include #include @@ -61,6 +62,11 @@ static struct option long_options[] = { {"routines", required_argument, 0, 'r'}, {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, + {"gemm_options", required_argument, 0, 'g'}, + {"gemm_alpha", required_argument, 0, 'p'}, + {"team_size", required_argument, 0, 'z'}, + {"vector_len", required_argument, 0, 'n'}, + {"batch_size", required_argument, 0, 'k'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -72,14 +78,12 @@ static void __print_help_blas3_perf_test() { printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); printf("\t\t\tValid values for OPTION:\n"); - printf("%c[1m", 27); - printf("\t\t\t\tblas:"); - printf("%c[0m", 27); - printf(" invoke Kokkos::trmm the loop-body. 
(default)\n"); - printf("%c[1m", 27); - printf("\t\t\t\tbatched:"); - printf("%c[0m", 27); - printf(" invoke KokkosBatched::SerialTrmm in the loop-body.\n\n"); + for (int i = 0; i < TEST_N; i++) { + printf("%c[1m", 27); + printf("\t\t\t\t%s", test_e_str[i].c_str()); + printf("%c[0m", 27); + printf("\n"); + } printf("\t-o, --trmm_options=OPTION_STRING\n"); printf("\t\tTRMM side, uplo, trans, and diag options.\n"); @@ -93,6 +97,33 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", DEFAULT_TRMM_ALPHA); + printf("\t-g, --gemm_options=OPTION_STRING\n"); + printf("\t\tGEMM transA, and transB options.\n"); + printf( + "\t\t\tValid format for OPTION_STRING is \"%%c%%c\". (default: " + "%s)\n", + DEFAULT_GEMM_ARGS); + + printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); + printf("\t\tGEMM alpha value.\n"); + printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", + DEFAULT_GEMM_ALPHA); + + printf("\t-z, --team_size=SIZE\n"); + printf("\t\tKokkos team size.\n"); + printf("\t\t\tThe value of SIZE as an integer. (default: %d)\n", + DEFAULT_TEAM_SIZE); + + printf("\t-n, --vector_len=LEN\n"); + printf("\t\tKokkos vector length (Heirarchical parallelism).\n"); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", + DEFAULT_VECTOR_LEN); + + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_K); + + printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); printf("\t\t\tValid values for OPTION:\n"); @@ -105,21 +136,25 @@ static void __print_help_blas3_perf_test() { printf("%c[0m", 27); printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); - printf("\t-b, --matrix_size_start=MxN,IxJ\n"); - printf("\t\tMatrix size selection where A is MxN and B is IxJ (start)\n"); + printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); + printf( + "\t\tMatrix size selection where A is MxN, B is IxJ, and C is PxQ " + "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, - DEFAULT_MATRIX_START); + DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); - printf("\t-e, --matrix_size_stop=PxQ,SxT\n"); - printf("\t\tMatrix size selection where A is PxQ and B is SxT (stop)\n"); + printf("\t-e, --matrix_size_stop=SxT,LxK,OxR\n"); + printf( + "\t\tMatrix size selection where A is SxT, B is LxK, and C is OxR " + "(stop)\n"); printf( - "\t\t\tValid values for P and Q are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "\t\t\tValid dimension values are any non-negative 32-bit integers. 
" + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, - DEFAULT_MATRIX_STOP); + DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); printf("\t-s, --matrix_size_step=K\n"); printf("\t\tMatrix step selection.\n"); @@ -156,72 +191,106 @@ static void __print_help_blas3_perf_test() { DEFAULT_BLAS_ROUTINES); } -static void __blas3_perf_test_input_error(char **argv, int option_idx) { - fprintf(stderr, "ERROR: invalid option \"%s %s\".\n", argv[option_idx], - argv[option_idx + 1]); - __print_help_blas3_perf_test(); +static void __blas3_perf_test_input_error(char **argv, char short_opt, + char *getopt_optarg) { + fprintf(stderr, "ERROR: invalid option \"-%c %s\". Try --help.\n", short_opt, + getopt_optarg); exit(-EINVAL); } int main(int argc, char **argv) { options_t options; - int option_idx = 0, ret; - char *n_str = nullptr, *adim = nullptr, *bdim = nullptr; + int option_idx = 0, ret, i; + char *n_str = nullptr, *adim = nullptr, *bdim = nullptr, *cdim = nullptr; std::filebuf fb; - char *out_file = nullptr; + char *out_file = nullptr; + using rt_type = decltype(do_trmm_invoke); + rt_type *routine_table[BLAS_ROUTINES_N] = { + &do_trmm_invoke, &do_gemm_invoke + // ADD MORE BLAS3 ROUTINES HERE + }; /* set default options */ - options.test = DEFAULT_TEST; - options.loop = DEFAULT_LOOP; - options.start.a.m = DEFAULT_MATRIX_START; - options.start.a.n = DEFAULT_MATRIX_START; - options.stop.a.m = DEFAULT_MATRIX_STOP; - options.stop.a.n = DEFAULT_MATRIX_STOP; - options.start.b.m = DEFAULT_MATRIX_START; - options.start.b.n = DEFAULT_MATRIX_START; - options.stop.b.m = DEFAULT_MATRIX_STOP; - options.stop.b.n = DEFAULT_MATRIX_STOP; - options.step = DEFAULT_STEP; - options.warm_up_n = DEFAULT_WARM_UP_N; - options.n = DEFAULT_N; - options.out = DEFAULT_OUT; - options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.test = DEFAULT_TEST; + options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; + 
options.start.a.m = DEFAULT_MATRIX_START; + options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; + options.stop.a.m = DEFAULT_MATRIX_STOP; + options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; + options.start.b.m = DEFAULT_MATRIX_START; + options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; + options.stop.b.m = DEFAULT_MATRIX_STOP; + options.stop.b.n = DEFAULT_MATRIX_STOP; + options.start.c.k = DEFAULT_K; + options.start.c.m = DEFAULT_MATRIX_START; + options.start.c.n = DEFAULT_MATRIX_START; + options.stop.c.k = DEFAULT_K; + options.stop.c.m = DEFAULT_MATRIX_STOP; + options.stop.c.n = DEFAULT_MATRIX_STOP; + options.step = DEFAULT_STEP; + options.warm_up_n = DEFAULT_WARM_UP_N; + options.n = DEFAULT_N; + options.out = DEFAULT_OUT; + options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.blas_args.team_size = DEFAULT_TEAM_SIZE; + options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:", long_options, - &option_idx)) != -1) { + options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; + options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:p:z:n:k:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': - // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (!strncasecmp(optarg, "blas", 4)) { - options.test = BLAS; - } else if (!strncasecmp(optarg, "batched", 6)) { - options.test = BATCHED; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < TEST_N; i++) { + if (!test_e_str[i].compare(optarg)) { + options.test = (test_e)i; + break; + } + } + if (i == TEST_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'o': // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); if (strlen(optarg) != 4) { - __blas3_perf_test_input_error(argv, option_idx); + __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.trmm.trmm_args = optarg; break; + case 'g': + // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); + if (strlen(optarg) != 2) { + __blas3_perf_test_input_error(argv, ret, optarg); + } + options.blas_args.gemm.gemm_args = optarg; + break; + case 'p': + // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); + options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); options.blas_args.trmm.alpha = (default_scalar)atof(optarg); break; case 'l': - if (!strncasecmp(optarg, "serial", 6)) { - options.loop = SERIAL; - } else if (!strncasecmp(optarg, "parallel", 8)) { - options.loop = PARALLEL; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < LOOP_N; i++) { + if (!loop_e_str[i].compare(optarg)) { + options.loop = (loop_e)i; + break; + } + } + if (i == LOOP_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'b': @@ -229,51 +298,78 @@ int main(int argc, char **argv) { bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.a.m = atoi(adim); options.start.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.b.m = atoi(bdim); options.start.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + 
+ n_str[0] = '\0'; + options.start.c.m = atoi(cdim); + options.start.c.n = atoi(&n_str[1]); break; case 'e': adim = optarg; bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.a.m = atoi(adim); options.stop.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.b.m = atoi(bdim); options.stop.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + + n_str[0] = '\0'; + options.stop.c.m = atoi(cdim); + options.stop.c.n = atoi(&n_str[1]); break; case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.start.b.k = options.start.c.k = + options.stop.a.k = options.stop.b.k = options.stop.c.k = + atoi(optarg); + break; + case 'z': options.blas_args.team_size = atoi(optarg); break; + case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); break; - case 'r': options.blas_routines = std::string(optarg); break; + case 'r': options.blas_routines = optarg; break; case '?': - default: __blas3_perf_test_input_error(argv, option_idx); + default: __blas3_perf_test_input_error(argv, ret, optarg); } } @@ -283,16 +379,35 @@ int main(int argc, char **argv) { options.out = &out; } - if (options.warm_up_n > options.n) - __blas3_perf_test_input_error(argv, option_idx); + if (options.warm_up_n > options.n) { + fprintf(stderr, "ERROR: warm_up_n=%d > n=%d. 
Try --help.\n", + options.warm_up_n, options.n); + exit(-EINVAL); + } Kokkos::initialize(argc, argv); - for (int i = 0; i < BLAS_ROUTINES_N; i++) { - if (options.blas_routines.find(blas_routines_e_str[TRMM]) != - std::string::npos) - do_trmm_invoke[options.loop][options.test](options); - // ADD MORE BLAS3 ROUTINES HERE + int err = 0; + for (i = 0; i < BLAS_ROUTINES_N; i++) { + if (options.blas_routines.find(blas_routines_e_str[i]) != + std::string::npos) { + std::cout << "Testing " << blas_routines_e_str[i] << "..." << std::endl; + + auto routine = routine_table[i]; + + if (!routine || !routine[0][options.loop][options.test]) { + std::cerr << "do_" << blas_routines_e_str[i] << "_invoke["; + err = 1; + break; + } + routine[0][options.loop][options.test](options); + } + } + + if (err) { + std::cerr << loop_e_str[options.loop] << "][" << test_e_str[options.test] + << "] not yet implemented!" << std::endl; + exit(-EINVAL); } if (out_file != nullptr) fb.close(); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index c1d42fe9c1..70f7664679 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -55,7 +55,7 @@ #include "KokkosBatched_Trmm_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" -//#define TRMM_PERF_TEST_DEBUG +//#define PERF_TEST_DEBUG // Forward declarations void do_trmm_serial_blas(options_t options); @@ -68,13 +68,6 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { {do_trmm_serial_blas, do_trmm_serial_batched}, {do_trmm_parallel_blas, do_trmm_parallel_batched}}; -/*************************** Print macros **************************/ -#ifdef TRMM_PERF_TEST_DEBUG -#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); -#else -#define STATUS -#endif // TRMM_PERF_TEST_DEBUG - /*************************** Test types and defaults **************************/ #define DEFAULT_TRMM_ARGS "LUNU" #define 
DEFAULT_TRMM_ALPHA 1.0 @@ -106,7 +99,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, } static void __print_trmm_perf_test_options(options_t options) { -#ifdef TRMM_PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG printf("options.test = %s\n", test_e_str[options.test].c_str()); printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); printf("options.start = %dx%d,%dx%d\n", options.start.a.m, @@ -123,7 +116,7 @@ static void __print_trmm_perf_test_options(options_t options) { printf("options.alpha = %lf\n", options.blas_args.trmm.alpha); else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.trmm.alpha); -#endif // TRMM_PERF_TEST_DEBUG +#endif // PERF_TEST_DEBUG return; } @@ -146,6 +139,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { &trmm_args.diag, trmm_args.alpha, A, B); } + Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; ++i) { auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); @@ -335,7 +329,9 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is " + "defined." + << std::endl; __trmm_output_csv_row(options, trmm_args, -1); #endif // !KOKKOS_ENABLE_CUDA return; diff --git a/perf_test/blas/blas3/README.md b/perf_test/blas/blas3/README.md index af718ee906..d150d61a32 100644 --- a/perf_test/blas/blas3/README.md +++ b/perf_test/blas/blas3/README.md @@ -19,8 +19,8 @@ void (*do_ROUTINE_invoke[LOOP_N][TEST_N])(options_t) = { }; ``` 3. Update the definitions in `KokkosBlas3_common.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. -4. 
Add a conditional to invoke the new routine via `do_ROUTINE_invoke` in - `KokkosBlas3_trmm_perf_test.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. +4. Add the `do_ROUTINE_invoke` table to the `routine_table` in + `KokkosBlas3_perf_test.cpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. 5. Update the commandline argument processing in - `KokkosBlas3_trmm_perf_test.hpp` to specify how to run ROUTINE. -6. Append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. + `KokkosBlas3_perf_test.cpp` to specify how to run ROUTINE. +6. To run the new routine by default, append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. diff --git a/src/blas/impl/KokkosBlas3_trmm_spec.hpp b/src/blas/impl/KokkosBlas3_trmm_spec.hpp index 13c87a299e..3c0bd9df6f 100644 --- a/src/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -74,7 +74,7 @@ struct trmm_eti_spec_avail { > { enum : bool { value = true }; }; // -// This Macros provides the ETI specialization of trmm, currently not available. +// This Macros provides the ETI specialization of trmm // #define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL( SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE ) \ KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT( SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) From e4749e2e99b2f480fc9b509ba6135a5218a92ac9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 24 Nov 2020 11:52:04 -0700 Subject: [PATCH 094/106] Add serial implementation of RCM RCM with a permanent interface. Later will create parallel implementation. Unit test - verifies that bandwidth was reduced and that it handles multiple connected components. Wiki example that displays an example reordered matrix. 
--- example/wiki/graph/CMakeLists.txt | 5 + .../graph/KokkosGraph_wiki_9pt_stencil.hpp | 13 +- example/wiki/graph/KokkosGraph_wiki_rcm.cpp | 68 ++++++ src/graph/KokkosGraph_RCM.hpp | 78 +++++++ src/graph/impl/KokkosGraph_BFS_impl.hpp | 159 ++++++++++++++ unit_test/cuda/Test_Cuda_Graph_rcm.cpp | 2 + unit_test/graph/Test_Graph_rcm.hpp | 197 ++++++++++++++++++ unit_test/hip/Test_HIP_Graph_rcm.cpp | 2 + unit_test/openmp/Test_OpenMP_Graph_rcm.cpp | 2 + unit_test/serial/Test_Serial_Graph_rcm.cpp | 2 + unit_test/threads/Test_Threads_Graph_rcm.cpp | 3 + 11 files changed, 528 insertions(+), 3 deletions(-) create mode 100644 example/wiki/graph/KokkosGraph_wiki_rcm.cpp create mode 100644 src/graph/KokkosGraph_RCM.hpp create mode 100644 src/graph/impl/KokkosGraph_BFS_impl.hpp create mode 100644 unit_test/cuda/Test_Cuda_Graph_rcm.cpp create mode 100644 unit_test/graph/Test_Graph_rcm.hpp create mode 100644 unit_test/hip/Test_HIP_Graph_rcm.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Graph_rcm.cpp create mode 100644 unit_test/serial/Test_Serial_Graph_rcm.cpp create mode 100644 unit_test/threads/Test_Threads_Graph_rcm.cpp diff --git a/example/wiki/graph/CMakeLists.txt b/example/wiki/graph/CMakeLists.txt index f1122958c2..b271038d91 100644 --- a/example/wiki/graph/CMakeLists.txt +++ b/example/wiki/graph/CMakeLists.txt @@ -18,3 +18,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES KokkosGraph_wiki_coarsening.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_rcm + SOURCES KokkosGraph_wiki_rcm.cpp + ) + diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp index 4561300dea..93e5660c07 100644 --- a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -23,9 +23,16 @@ using Handle = KokkosKernels::Experimental:: namespace GraphDemo { - constexpr Ordinal gridX = 15; - constexpr Ordinal gridY = 25; - constexpr Ordinal numVertices = gridX * gridY; + 
Ordinal gridX = 15; + Ordinal gridY = 25; + Ordinal numVertices = gridX * gridY; + + void setGridDimensions(Ordinal newX, Ordinal newY) + { + gridX = newX; + gridY = newY; + numVertices = gridX * gridY; + } //Helper to get the vertex ID given grid coordinates Ordinal getVertexID(Ordinal x, Ordinal y) diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp new file mode 100644 index 0000000000..bfc6260533 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ -0,0 +1,68 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_RCM.hpp" + +template +void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, const labels_t& invPermIn) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); + auto invPerm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); + lno_t numVerts = rowmap.extent(0) - 1; + decltype(invPerm) perm(Kokkos::ViewAllocateWithoutInitializing("Perm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + perm(invPerm(i)) = i; + std::vector neighbors; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + neighbors.clear(); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + neighbors.push_back(nei); + } + std::sort(neighbors.begin(), neighbors.end()); + lno_t it = 0; + for(lno_t j = 0; j < numVerts; j++) + { + if(it < neighbors.size() && j == neighbors[it]) + { + std::cout << '*'; + it++; + } + else + std::cout << ' '; + } + std::cout << '\n'; + } + std::cout << '\n'; +} + + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + GraphDemo::setGridDimensions(6, 6); + 
RowmapType rowmapDevice; + ColindsType colindsDevice; + //Make the graph smaller so the matrix can be printed easily + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. + GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run RCM and print the reordered matrix + { + auto rcmDevice = KokkosGraph::Experimental::graph_rcm( + rowmapDevice, colindsDevice); + std::cout << "Graph reordered by reverse Cuthill-McKee:\n"; + printReorderedMatrix(rowmapDevice, colindsDevice, rcmDevice); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/src/graph/KokkosGraph_RCM.hpp b/src/graph/KokkosGraph_RCM.hpp new file mode 100644 index 0000000000..039c3f5eac --- /dev/null +++ b/src/graph/KokkosGraph_RCM.hpp @@ -0,0 +1,78 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_RCM_HPP +#define _KOKKOSGRAPH_RCM_HPP + +#include "KokkosGraph_BFS_impl.hpp" + +namespace KokkosGraph +{ +namespace Experimental +{ + +//Compute the reverse Cuthill-McKee ordering of a graph. +//The graph must be symmetric, but it may have any number of connected components. +//This function returns a list of vertices in RCM order. 
+ +template +labels_t +graph_rcm(const rowmap_t& rowmap, const colinds_t& colinds) +{ + using lno_t = typename colinds_t::non_const_value_type; + if(rowmap.extent(0) <= 2) + { + //there are 0 or 1 vertices - return trivial ordering + lno_t numVerts = rowmap.extent(0); + if(numVerts) + numVerts--; + return labels_t("RCM Labels", numVerts); + } + Impl::SerialBFS bfs(rowmap, colinds); + return bfs.rcm(); +} + +}} //namespace KokkosGraph::Experimental + +#endif diff --git a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp new file mode 100644 index 0000000000..255d28aeea --- /dev/null +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -0,0 +1,159 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_BFS_IMPL_HPP +#define _KOKKOSGRAPH_BFS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Utils.hpp" +#include +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +template +struct SerialBFS +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using host_rowmap_t = Kokkos::View; + using host_lno_view_t = Kokkos::View; + using host_lno_2D_view_t = Kokkos::View; + + lno_t numVerts; + host_rowmap_t rowmap; + host_lno_view_t entries; + + SerialBFS(const rowmap_t& rowmap_, const entries_t& entries_) : + numVerts(rowmap_.extent(0) - 1), + rowmap(Kokkos::ViewAllocateWithoutInitializing("HostRowmap"), rowmap_.extent(0)), + entries(Kokkos::ViewAllocateWithoutInitializing("HostEntries"), entries_.extent(0)) + { + Kokkos::deep_copy(rowmap, rowmap_); + Kokkos::deep_copy(entries, entries_); + } + + lno_t findPseudoPeripheral() + { + //Choose vertex with smallest degree + lno_t periph = -1; + lno_t periphDeg = numVerts; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t deg = rowmap(i + 1) - rowmap(i); + if(deg < periphDeg) + { + periph = i; + periphDeg = deg; + } + } + return periph; + } + + lno_view_t rcm() + { + lno_t start = findPseudoPeripheral(); + 
host_lno_view_t q(Kokkos::ViewAllocateWithoutInitializing("Queue"), numVerts); + host_lno_view_t label(Kokkos::ViewAllocateWithoutInitializing("Permutation"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + label(i) = -1; + lno_t qhead = 0; + lno_t qtail = 0; + label(start) = qtail; + q(qtail++) = start; + std::vector neighbors; + lno_t outerQueue = 0; + while(true) + { + lno_t v = q(qhead++); + neighbors.clear(); + for(size_type j = rowmap(v); j < rowmap(v + 1); j++) + { + lno_t nei = entries(j); + if(nei == v || nei >= numVerts) + continue; + if(label(nei) == -1) + { + neighbors.push_back(nei); + } + } + std::sort(neighbors.begin(), neighbors.end(), + [&](lno_t n1, lno_t n2) -> bool + { + //return true of n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); + //label and enqueue all unlabeled neighbors + for(lno_t nei : neighbors) + { + label(nei) = qtail; + q(qtail++) = nei; + } + if(qtail == numVerts) + { + //have labeled all vertices + break; + } + else if(qhead == qtail) + { + //have exhausted this connected component, but others remain unlabeled + while(label(outerQueue) != -1) + outerQueue++; + label(outerQueue) = qtail; + q(qtail++) = outerQueue; + } + } + lno_view_t labelOut(Kokkos::ViewAllocateWithoutInitializing("RCM Permutation"), numVerts); + //reverse the labels + for(lno_t i = 0; i < numVerts; i++) + label(i) = numVerts - label(i) - 1; + Kokkos::deep_copy(labelOut, label); + return labelOut; + } +}; + +}}} //namespace KokkosGraph::Experimental::Impl +#endif diff --git a/unit_test/cuda/Test_Cuda_Graph_rcm.cpp b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp new file mode 100644 index 0000000000..e7fb84820d --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/graph/Test_Graph_rcm.hpp b/unit_test/graph/Test_Graph_rcm.hpp new file mode 100644 index 0000000000..eb3cd45a37 --- /dev/null +++ b/unit_test/graph/Test_Graph_rcm.hpp @@ -0,0 +1,197 
@@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosGraph_RCM.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include + +//Generates a graph from 3D 7-pt stencil. Slices grid into 2 connected components near the middle of X dimension. +template +void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, int gridY, int gridZ) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto getVertexID = + [=](lno_t x, lno_t y, lno_t z) -> lno_t + { + return x + y * gridX + z * gridX * gridY; + }; + lno_t numVertices = gridX * gridY * gridZ; + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector entries; + rowmap[0] = 0; + lno_t xslice = gridX / 2; + for(lno_t k = 0; k < gridZ; k++) + { + for(lno_t j = 0; j < gridY; j++) + { + for(lno_t i = 0; i < gridX; i++) + { + lno_t v = getVertexID(i, j, k); + if(i != 0 && i != xslice + 1) + entries.push_back(getVertexID(i - 1, j, k)); + if(i != gridX - 1 && i != xslice) + entries.push_back(getVertexID(i + 1, j, k)); + if(j != 0) + entries.push_back(getVertexID(i, j - 1, k)); + if(j != gridY - 1) + entries.push_back(getVertexID(i, j + 1, k)); + if(k != 0) + entries.push_back(getVertexID(i, j, k - 1)); + if(k != gridZ - 1) + entries.push_back(getVertexID(i, j, k + 1)); + rowmap[v + 1] = entries.size(); + } + } + } + size_type numEdges = entries.size(); + //Now that the graph is formed, copy rowmap and entries to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> entriesHost(entries.data(), numEdges); + //Allocate owning views on device with the correct size. 
+ rowmapView = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Rowmap"), numVertices + 1); + entriesView = entries_t(Kokkos::ViewAllocateWithoutInitializing("Colinds"), numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapView, rowmapHost); + Kokkos::deep_copy(entriesView, entriesHost); +} + +template +int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + lno_t numVerts = rowmap.extent(0) - 1; + int bw = 0; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + if(nei > i) + { + lno_t thisBW = nei - i; + if(thisBW > bw) + bw = thisBW; + } + } + } + return bw; +} + +template +void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) +{ + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type rowmap_t; + typedef typename graph_t::entries_type entries_t; + lno_t numVerts = gridX * gridY * gridZ; + typename rowmap_t::non_const_type rowmap; + typename entries_t::non_const_type entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + auto rcm = KokkosGraph::Experimental::graph_rcm(rowmap, entries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + decltype(rcmHost) rcmPermHost(Kokkos::ViewAllocateWithoutInitializing("RCMPerm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + rcmPermHost(rcmHost(i)) = i; + //make sure each row index shows up exactly once + { + std::vector counts(numVerts); + for(lno_t i = 0; i < numVerts; 
i++) + { + lno_t orig = rcmHost(i); + ASSERT_GE(orig, 0); + ASSERT_LT(orig, numVerts); + counts[orig]++; + } + for(lno_t i = 0; i < numVerts; i++) + ASSERT_EQ(counts[i], 1); + } + Kokkos::View identityOrder(Kokkos::ViewAllocateWithoutInitializing("Identity"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + identityOrder(i) = i; + size_t origBW = maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +TEST_F( TestCategory, graph ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + test_rcm(6, 3, 3); \ + test_rcm(20, 20, 20); \ + test_rcm(100, 100, 1); \ +} + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif diff --git a/unit_test/hip/Test_HIP_Graph_rcm.cpp b/unit_test/hip/Test_HIP_Graph_rcm.cpp new file mode 100644 index 0000000000..652eb9ade5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff 
--git a/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp new file mode 100644 index 0000000000..eb8164cb30 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Graph_rcm.cpp b/unit_test/serial/Test_Serial_Graph_rcm.cpp new file mode 100644 index 0000000000..ac225ba858 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/threads/Test_Threads_Graph_rcm.cpp b/unit_test/threads/Test_Threads_Graph_rcm.cpp new file mode 100644 index 0000000000..37184bb806 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_rcm.cpp @@ -0,0 +1,3 @@ +#include +#include + From c9827f595182851111f9fdf4c6068a4367552f16 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Dec 2020 12:30:52 -0700 Subject: [PATCH 095/106] Rename impl to SerialRCM (not SerialBFS) since it only does RCM, not general BFS. This is an impl detail and can be changed later when general BFS/SSSP etc. are added. 
--- src/graph/KokkosGraph_RCM.hpp | 4 ++-- src/graph/impl/KokkosGraph_BFS_impl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/graph/KokkosGraph_RCM.hpp b/src/graph/KokkosGraph_RCM.hpp index 039c3f5eac..8f1109aa63 100644 --- a/src/graph/KokkosGraph_RCM.hpp +++ b/src/graph/KokkosGraph_RCM.hpp @@ -69,8 +69,8 @@ graph_rcm(const rowmap_t& rowmap, const colinds_t& colinds) numVerts--; return labels_t("RCM Labels", numVerts); } - Impl::SerialBFS bfs(rowmap, colinds); - return bfs.rcm(); + Impl::SerialRCM algo(rowmap, colinds); + return algo.rcm(); } }} //namespace KokkosGraph::Experimental diff --git a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp index 255d28aeea..39d5821e55 100644 --- a/src/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -55,7 +55,7 @@ namespace Experimental { namespace Impl { template -struct SerialBFS +struct SerialRCM { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; @@ -67,7 +67,7 @@ struct SerialBFS host_rowmap_t rowmap; host_lno_view_t entries; - SerialBFS(const rowmap_t& rowmap_, const entries_t& entries_) : + SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) : numVerts(rowmap_.extent(0) - 1), rowmap(Kokkos::ViewAllocateWithoutInitializing("HostRowmap"), rowmap_.extent(0)), entries(Kokkos::ViewAllocateWithoutInitializing("HostEntries"), entries_.extent(0)) From 00c7bac1c43f1e8c32db2c13300785dd379a14d3 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Dec 2020 13:01:03 -0700 Subject: [PATCH 096/106] Fix sign compare warning --- example/wiki/graph/KokkosGraph_wiki_rcm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp index bfc6260533..31073954a4 100644 --- a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ 
-25,7 +25,7 @@ void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, neighbors.push_back(nei); } std::sort(neighbors.begin(), neighbors.end()); - lno_t it = 0; + size_t it = 0; for(lno_t j = 0; j < numVerts; j++) { if(it < neighbors.size() && j == neighbors[it]) From c383c28bd2c44ce61f8f46da0167518449d7a0c6 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 16 Nov 2020 18:27:41 -0700 Subject: [PATCH 097/106] Adding Changelog for Release 3.2.01 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.2 (cherry picked from commit 0f06afcc663dcd963f07323db7253bbc11f36ed0) --- CHANGELOG.md | 7 +++++++ CMakeLists.txt | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0380691d0f..4a9a1ff7a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Change Log +## [3.2.01](https://github.com/kokkos/kokkos-kernels/tree/3.2.01) (2020-11-17) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.00...3.2.01) + +**Fixed bugs:** + +- Cpp14 Fixes: [\#790](https://github.com/kokkos/kokkos-kernels/pull/790) + ## [3.2.00](https://github.com/kokkos/kokkos-kernels/tree/3.2.00) (2020-08-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.01...3.2.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7353c1f2ce..5196745b5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) SET(KokkosKernels_VERSION_MINOR 2) - SET(KokkosKernels_VERSION_PATCH 0) + SET(KokkosKernels_VERSION_PATCH 1) ENDIF() IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") From b263752e43462b33c36aba7f1e4e6bb3f5e8d8cb Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 1 Dec 2020 18:33:01 -0700 Subject: [PATCH 098/106] Remove unused typedef, spelling typo --- src/graph/impl/KokkosGraph_BFS_impl.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp index 39d5821e55..1bc4340046 100644 --- a/src/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -61,7 +61,6 @@ struct SerialRCM using lno_t = typename entries_t::non_const_value_type; using host_rowmap_t = Kokkos::View; using host_lno_view_t = Kokkos::View; - using host_lno_2D_view_t = Kokkos::View; lno_t numVerts; host_rowmap_t rowmap; @@ -123,7 +122,7 @@ struct SerialRCM std::sort(neighbors.begin(), neighbors.end(), [&](lno_t n1, lno_t n2) -> bool { - //return true of n1 has a lower degree than n2 + //return true if n1 has a lower degree than n2 return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); }); //label and enqueue all unlabeled neighbors From 6d07808a216431caa3b150ff21d108bcaef7ce67 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Dec 2020 18:59:39 -0700 Subject: [PATCH 099/106] Return early if degree 0 node is found --- src/graph/impl/KokkosGraph_BFS_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp index 1bc4340046..df652902c0 100644 --- a/src/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -87,6 +87,8 @@ struct SerialRCM { periph = i; periphDeg = deg; + if(deg == 0) + break; } } return periph; From 8029d05c26915606890f449f390247a068dc42f6 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Wed, 2 Dec 2020 07:53:19 -0700 Subject: [PATCH 100/106] Update perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp Co-authored-by: Nathan Ellingwood --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 159d73c4f6..7861547bc6 100644 --- 
a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -660,7 +660,7 @@ void __do_gemm_parallel_experiment2(options_t options, gemm_args_t gemm_args) { * VectorGemm has not been implemented! * I think this experiment can be removed. TeamGemm calls TeamThreadRange * internally! TeamVectorGemm calls both TeamThreadRange and ThreadVectorRange - * internnally! + * internally! */ template From 768a0d9212bf98214f308544e82b977870fc4879 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 2 Dec 2020 07:53:56 -0700 Subject: [PATCH 101/106] perf_test/blas/blas3: PR feedback --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 7861547bc6..f26fbb7287 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -293,7 +293,7 @@ struct parallel_blas_gemm { template void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -680,7 +680,7 @@ void __do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { Kokkos::Timer timer; STATUS; - functor_type experiment4_functor(gemm_args); + functor_type experiment3_functor(gemm_args); auto team_size = gemm_args.bp.team_size; auto vector_len = gemm_args.bp.vector_len; @@ -688,7 +688,7 @@ void __do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { for (uint32_t i = 0; i < warm_up_n; ++i) { Kokkos::parallel_for("parallelBatchedUntimedExperiment3Gemm", policy_type(league_size, team_size, vector_len), - experiment4_functor); + experiment3_functor); } Kokkos::fence(); @@ -697,7 +697,7 @@ void 
__do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { for (uint32_t i = 0; i < n; ++i) { Kokkos::parallel_for("parallelBatchedTimedExperiment3Gemm", policy_type(league_size, team_size, vector_len), - experiment4_functor); + experiment3_functor); } Kokkos::fence(); From d2fc5d12e1455de458f302420d454d4c80299e1d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Sat, 5 Dec 2020 16:35:54 -0700 Subject: [PATCH 102/106] Merge pull request #857 from brian-kelley/RestoreD1Default Restore distance-1 default algos from 3.2.0 (cherry picked from commit 128e3d10b74c6c87c5d285eeefc851d6f5532685) --- src/graph/KokkosGraph_Distance1ColorHandle.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 503c6c9310..077104ef9f 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -239,15 +239,22 @@ class GraphColoringHandle if(exec == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_SERIAL; -#ifdef VERBOSE +#ifdef VERBOSE std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; +#endif + } + else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + { + this->coloring_algorithm_type = COLORING_EB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else { - this->coloring_algorithm_type = COLORING_VBBIT; -#ifdef VERBOSE - std:cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + this->coloring_algorithm_type = COLORING_VB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VB\n"; #endif } } From 59f174eb5651565ce1ae62639a29c7411098fe61 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 7 Dec 2020 12:05:11 -0700 Subject: [PATCH 103/106] Merge pull request #856 
from kuberry/develop Modified UTV, SolveUTV, and SetIdentity to support m x n matrices for QR with pivoting (cherry picked from commit df145565dfe9d58328c97de7afd09ee4b8843add) --- ...atched_Householder_TeamVector_Internal.hpp | 1 + ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 6 +++--- ...WithColumnPivoting_TeamVector_Internal.hpp | 4 +++- .../KokkosBatched_SetIdentity_Impl.hpp | 4 ++-- .../KokkosBatched_SetIdentity_Internal.hpp | 12 +++++------ ...KokkosBatched_SolveUTV_TeamVector_Impl.hpp | 4 ++-- ...osBatched_SolveUTV_TeamVector_Internal.hpp | 21 ++++++++++++------- .../KokkosBatched_UTV_TeamVector_Impl.hpp | 2 +- .../KokkosBatched_UTV_TeamVector_Internal.hpp | 15 ++++++------- 9 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp index 7b3d8b293e..b63ca28fcf 100644 --- a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -68,6 +68,7 @@ namespace KokkosBatched { [&](const int &i) { x2[i*x2s] *= inv_chi1_minus_alpha; }); + member.team_barrier(); // later consider to use the following // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); diff --git a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index d1b59d652f..d443bad513 100644 --- a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, + const int m, const int n, const int k, /* */ ValueType * A, const int as0, const int as1, /* */ ValueType * t, const int ts, @@ -44,12 +44,12 @@ namespace KokkosBatched { if (is_Q_zero) TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0+qs1); else - 
TeamVectorSetIdentityInternal::invoke(member, m, Q, qs0, qs1); + TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); return TeamVectorApplyQ_LeftForwardInternal ::invoke(member, - m, m, k, + m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, diff --git a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 2b0c1e4569..08439b0b28 100644 --- a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -96,6 +96,7 @@ namespace KokkosBatched { A, as0, as1, A, as0, as1, norm, 1); + member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -158,7 +159,7 @@ namespace KokkosBatched { if (m_atl == 0) max_diag = ats::abs(A[0]); const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(max_diag*ats::epsilon()); + threshold(10*max_diag*ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) @@ -171,6 +172,7 @@ namespace KokkosBatched { n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); + member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL (A_part3x3); t_part2x1.mergeToAT (t_part3x1); diff --git a/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/src/batched/KokkosBatched_SetIdentity_Impl.hpp index 4c0ea12348..0bf12243ee 100644 --- a/src/batched/KokkosBatched_SetIdentity_Impl.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Impl.hpp @@ -19,7 +19,7 @@ namespace KokkosBatched { SerialSetIdentity:: invoke(const AViewType &A) { return SerialSetIdentityInternal:: - invoke(A.extent(0), + invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } @@ -36,7 +36,7 @@ namespace KokkosBatched { const AViewType &A) { return TeamSetIdentityInternal:: invoke(member, - A.extent(0), + A.extent(0), A.extent(1), A.data(), A.stride_0(), 
A.stride_1()); } diff --git a/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/src/batched/KokkosBatched_SetIdentity_Internal.hpp index 40d8bbbaaf..8f7f6cf3f9 100644 --- a/src/batched/KokkosBatched_SetIdentity_Internal.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Internal.hpp @@ -15,10 +15,10 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static int - invoke(const int m, + invoke(const int m, const int n, /* */ ValueType *__restrict__ A, const int as0, const int as1) { const ValueType one(1), zero(0); - for (int j=0;j ::invoke(member, @@ -133,6 +133,7 @@ namespace KokkosBatched { B, bs0, bs1, zero, W, ws0, ws1); + member.team_barrier(); /// W = T^{-1} W TeamVectorTrsmInternalLeftLower @@ -142,26 +143,31 @@ namespace KokkosBatched { one, T, ts0, ts1, W, ws0, ws1); + member.team_barrier(); /// X = V^T W TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, xs0, xs1); + member.team_barrier(); } else { + /// W = U^T B TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, X, xs0, xs1); + member.team_barrier(); + /// X = T^{-1} X TeamVectorTrsmInternalLeftUpper ::invoke(member, false, @@ -169,12 +175,13 @@ namespace KokkosBatched { one, T, ts0, ts1, X, xs0, xs1); + member.team_barrier(); } /// X = P^T X TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, - nrhs, m, + nrhs, n, p, ps0, X, xs0, xs1); diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp index 69b958d22d..b06c76b02a 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -31,7 +31,7 @@ namespace KokkosBatched { int &matrix_rank) { return TeamVectorUTV_Internal:: invoke(member, - A.extent(0), //A.extent(1), + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), p.data(), p.stride(0), U.data(), 
U.stride(0), U.stride(1), diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp index 6f9a86e115..354dfa7c44 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, // m = NumRows(A) + const int m, const int n, // m = NumRows(A), n = NumCols(A) /* */ ValueType * A, const int as0, const int as1, /* */ IntType * p, const int ps0, /* */ ValueType * U, const int us0, const int us1, @@ -41,23 +41,24 @@ namespace KokkosBatched { matrix_rank = -1; TeamVectorQR_WithColumnPivotingInternal ::invoke(member, - m, m, + m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - + TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); + member.team_barrier(); /// for rank deficient matrix - if (matrix_rank < m) { + if (matrix_rank < n) { const value_type zero(0); TeamVectorSetLowerTriangularInternal ::invoke(member, @@ -67,14 +68,14 @@ namespace KokkosBatched { TeamVectorQR_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, A, as1, as0, t, ts0, work); TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, From d04f0d7aff7a3d579d044efac95f1bdc1d0125e2 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 8 Dec 2020 21:48:34 -0700 Subject: [PATCH 104/106] Merge pull request #858 from ndellingwood/ibm-builtin-fix BitUtils: update xl builtin popcount calls for xlclang++ (cherry picked from commit ab062fe086b2e8ef6ce6b604eb2f9c2e0e116e69) --- src/common/KokkosKernels_BitUtils.hpp | 61 ++++++++++++++++++++------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index 
3dc78f77b1..c845e37c53 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -112,66 +112,95 @@ int pop_count( long long i ){ return _popcnt64(i); } -#elif defined( KOKKOS_COMPILER_IBM ) +#elif defined( __GNUC__ ) || defined( __GNUG__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __popcnt4(i); + return __builtin_popcount(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __popcnt8(i); + return __builtin_popcountl(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __popcnt8(i); + return __builtin_popcountll(i); } +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( int i ){ + return __builtin_popcount(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long i ){ + return __builtin_popcountl(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long long i ){ + return __builtin_popcountll(i); +} +#elif defined(__ibmxl_vrm__) +// See https://www.ibm.com/support/knowledgecenter/SSGH3R_16.1.0/com.ibm.xlcpp161.aix.doc/compiler_ref/compiler_builtins.html +// link gives info about builtin names for xlclang++ +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned i ){ + return __builtin_popcnt4(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long i ){ + return __builtin_popcnt8(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long long i ){ + return __builtin_popcnt8(i); +} KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __popcnt4(i); + return __builtin_popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } -#elif defined( __GNUC__ ) || defined( __GNUG__ ) +#elif defined(__IBMCPP__) || defined(__IBMC__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __builtin_popcount(i); + return __popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int 
pop_count( unsigned long i ){ - return __builtin_popcountl(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __builtin_popcountll(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __builtin_popcount(i); + return __popcnt4(i); } + KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long i ){ - return __builtin_popcountl(i); +int pop_count( long i ){ + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long long i ){ - return __builtin_popcountll(i); +int pop_count( long long i ){ + return __popcnt8(i); } #else From 5d0102497ca44f2e274caf6bc9fe83041b2275b0 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 8 Dec 2020 21:55:03 -0700 Subject: [PATCH 105/106] Merge pull request #859 from brian-kelley/RemoveStaticConstexpr spadd: remove static constexpr member var (cherry picked from commit 30295edcc3fa1cfbf1eabbe3731966b41f5e5343) --- src/sparse/KokkosSparse_spadd.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 9ed66ce2ad..1efae2c1a7 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -47,7 +47,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_Sorting.hpp" -#include +#include "Kokkos_ArithTraits.hpp" namespace KokkosSparse { namespace Experimental { @@ -86,10 +86,10 @@ struct SortedCountEntries { Bcolinds(Bcolinds_), Crowcounts(Crowcounts_) {} - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits::max(); - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow size_type numEntries = 0; size_type ai = 0; @@ -417,7 +417,6 @@ template struct SortedNumericSumFunctor { using CscalarT = typename CvaluesT::non_const_value_type; - static constexpr ordinal_type ORDINAL_MAX = 
std::numeric_limits::max(); SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, const BrowptrsT& Browptrs_, @@ -441,6 +440,8 @@ struct SortedNumericSumFunctor { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow size_type ai = 0; size_type bi = 0; From fcebfb0472fc75cb6d6c93f5b8782c68cb5145c0 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 16 Dec 2020 11:23:43 -0700 Subject: [PATCH 106/106] Adding Changelog for Release 3.3.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.3 --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a9a1ff7a2..51e31ef007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,34 @@ # Change Log +## [3.3.00](https://github.com/kokkos/kokkos-kernels/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.01...3.3.00) + +**Implemented enhancements:** +- Add permanent RCM reordering interface, and a basic serial implementation [\#854](https://github.com/kokkos/kokkos-kernels/pull/854) +- Half\_t explicit conversions [\#849](https://github.com/kokkos/kokkos-kernels/pull/849) +- Add batched gemm performance tests [\#838](https://github.com/kokkos/kokkos-kernels/pull/838) +- Add HIP support to src and perf\_test [\#828](https://github.com/kokkos/kokkos-kernels/pull/828) +- Factor out coarsening [\#827](https://github.com/kokkos/kokkos-kernels/pull/827) +- Allow enabling/disabling components at configuration time [\#823](https://github.com/kokkos/kokkos-kernels/pull/823) +- HIP: CMake work on tests and ETI [\#820](https://github.com/kokkos/kokkos-kernels/pull/820) +- HIP: KokkosBatched - hip specialization [\#812](https://github.com/kokkos/kokkos-kernels/pull/812) +- Distance-2 maximal independent set [\#801](https://github.com/kokkos/kokkos-kernels/pull/801) +- Use batched TRTRI & TRMM for Supernode-sptrsv setup
[\#797](https://github.com/kokkos/kokkos-kernels/pull/797) +- Initial support for half precision [\#794](https://github.com/kokkos/kokkos-kernels/pull/794) + +**Fixed bugs:** +- Fix issue with HIP and Kokkos\_ArithTraits [\#844](https://github.com/kokkos/kokkos-kernels/pull/844) +- HIP: fixing round of issues on AMD [\#840](https://github.com/kokkos/kokkos-kernels/pull/840) +- Throw an exception if BLAS GESV is not enabled [\#837](https://github.com/kokkos/kokkos-kernels/pull/837) +- Fixes -Werror for gcc with c++20 [\#836](https://github.com/kokkos/kokkos-kernels/pull/836) +- Add fallback condition to use spmv\_native when cuSPARSE does not work [\#834](https://github.com/kokkos/kokkos-kernels/pull/834) +- Fix install testing refactor for inline builds [\#811](https://github.com/kokkos/kokkos-kernels/pull/811) +- HIP: fix ArithTraits to support HIP backend [\#809](https://github.com/kokkos/kokkos-kernels/pull/809) +- cuSPARSE 11: fix spgemm and spmv\_struct\_tunning compilation error [\#804](https://github.com/kokkos/kokkos-kernels/pull/804) + +**Incompatibilities:** +- Remove pre-3.0 deprecated code [\#825](https://github.com/kokkos/kokkos-kernels/pull/825) + ## [3.2.01](https://github.com/kokkos/kokkos-kernels/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.00...3.2.01)