diff --git a/CHANGELOG.md b/CHANGELOG.md index 934eb67143..28e43dff49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,34 @@ # Change Log +## [3.3.00](https://github.com/kokkos/kokkos-kernels/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.01...3.3.00) + +**Implemented enhancements:** +- Add permanent RCM reordering interface, and a basic serial implementation [\#854](https://github.com/kokkos/kokkos-kernels/pull/854) +- Half\_t explicit conversions [\#849](https://github.com/kokkos/kokkos-kernels/pull/849) +- Add batched gemm performance tests [\#838](https://github.com/kokkos/kokkos-kernels/pull/838) +- Add HIP support to src and perf\_test [\#828](https://github.com/kokkos/kokkos-kernels/pull/828) +- Factor out coarsening [\#827](https://github.com/kokkos/kokkos-kernels/pull/827) +- Allow enabling/disabling components at configuration time [\#823](https://github.com/kokkos/kokkos-kernels/pull/823) +- HIP: CMake work on tests and ETI [\#820](https://github.com/kokkos/kokkos-kernels/pull/820) +- HIP: KokkosBatched - hip specialization [\#812](https://github.com/kokkos/kokkos-kernels/pull/812) +- Distance-2 maximal independent set [\#801](https://github.com/kokkos/kokkos-kernels/pull/801) +- Use batched TRTRI & TRMM for Supernode-sptrsv setup [\#797](https://github.com/kokkos/kokkos-kernels/pull/797) +- Initial support for half precision [\#794](https://github.com/kokkos/kokkos-kernels/pull/794) + +**Fixed bugs:** +- Fix issue with HIP and Kokkos\_ArithTraits [\#844](https://github.com/kokkos/kokkos-kernels/pull/844) +- HIP: fixing round of issues on AMD [\#840](https://github.com/kokkos/kokkos-kernels/pull/840) +- Throw an exception if BLAS GESV is not enabled [\#837](https://github.com/kokkos/kokkos-kernels/pull/837) +- Fixes -Werror for gcc with c++20 [\#836](https://github.com/kokkos/kokkos-kernels/pull/836) +- Add fallback condition to use spmv\_native when cuSPARSE does not work [\#834](https://github.com/kokkos/kokkos-kernels/pull/834) +- Fix install testing refactor for inline builds [\#811](https://github.com/kokkos/kokkos-kernels/pull/811) +- HIP: fix ArithTraits to support HIP backend [\#809](https://github.com/kokkos/kokkos-kernels/pull/809) +- cuSPARSE 11: fix spgemm and spmv\_struct\_tunning compilation error [\#804](https://github.com/kokkos/kokkos-kernels/pull/804) + +**Incompatibilities:** +- Remove pre-3.0 deprecated code [\#825](https://github.com/kokkos/kokkos-kernels/pull/825) + ## [3.2.01](https://github.com/kokkos/kokkos-kernels/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.00...3.2.01) @@ -28,8 +57,8 @@ - Nightly test failure: spgemm unit tests failing on White \(Power8\) [\#780](https://github.com/kokkos/kokkos-kernels/issues/780) - supernodal does not build with UVM enabled [\#633](https://github.com/kokkos/kokkos-kernels/issues/633) -## [3.1.1](https://github.com/kokkos/kokkos-kernels/tree/3.1.1) (2020-05-04) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.1) +## [3.1.01](https://github.com/kokkos/kokkos-kernels/tree/3.1.01) (2020-05-04) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.01) ** Fixed bugs:** diff --git a/CMakeLists.txt b/CMakeLists.txt index 44fbf3aba8..209db7ce6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 2) - SET(KokkosKernels_VERSION_PATCH 1) + SET(KokkosKernels_VERSION_MINOR 3) + SET(KokkosKernels_VERSION_PATCH 0) ENDIF() IF(${CMAKE_VERSION} 
VERSION_GREATER_EQUAL "3.12.0") @@ -83,7 +83,8 @@ IF (KokkosKernels_INSTALL_TESTING) ELSE() # Regular build, not install testing # Do all the regular option processing - IF (NOT KOKKOSKERNELS_HAS_TRILINOS) + IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) + # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") KOKKOS_CHECK(OPTIONS CUDA_UVM RETURN_VALUE KOKKOS_ENABLE_CUDA_UVM) @@ -139,6 +140,15 @@ ELSE() BOOL "Whether to restrict testing to ETI types. Default: ON" ) + + KOKKOSKERNELS_ADD_OPTION( + ENABLED_COMPONENTS + "ALL" + STRING + "A list of components to enable in testing and building" + VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + ) + # ================================================================== # Enable Device Types for ETI (exec- + mem-space) # ================================================================== @@ -159,7 +169,6 @@ ELSE() # Enable Layout Types for ETI # ================================================================== INCLUDE(cmake/kokkoskernels_eti_layouts.cmake) - # ================================================================== # Enable Third Party Libraries # ================================================================== diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 2bdb004ec2..bb246df3c6 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -21,8 +21,18 @@ get_kokkos_device_list() { for DEVICE_ in $PARSE_DEVICES_LST do UC_DEVICE=$(echo $DEVICE_ | tr "[:lower:]" "[:upper:]") + if [ "${UC_DEVICE}" == "CUDA" ]; then + WITH_CUDA_BACKEND=ON + fi + if [ "${UC_DEVICE}" == "HIP" ]; then + WITH_HIP_BACKEND=ON + fi KOKKOS_DEVICE_CMD="-DKokkos_ENABLE_${UC_DEVICE}=ON ${KOKKOS_DEVICE_CMD}" done + if [ "${WITH_CUDA_BACKEND}" == "ON" ] && [ "${WITH_HIP_BACKEND}" == "ON" ]; then + echo "Invalid configuration - Cuda and Hip cannot be simultaneously enabled" + exit + fi } get_kokkos_arch_list() { @@ -59,6 +69,24 @@ get_kokkos_cuda_option_list() { done } +get_kokkos_hip_option_list() { + echo parsing KOKKOS_HIP_OPTIONS=$KOKKOS_HIP_OPTIONS + KOKKOS_HIP_OPTION_CMD= + PARSE_HIP_LST=$(echo $KOKKOS_HIP_OPTIONS | tr "," "\n") + for HIP_ in $PARSE_HIP_LST + do + HIP_OPT_NAME= + if [ "${HIP_}" == "rdc" ]; then + HIP_OPT_NAME=HIP_RELOCATABLE_DEVICE_CODE + else + echo "${HIP_} is not a valid hip option..." + fi + if [ "${HIP_OPT_NAME}" != "" ]; then + KOKKOS_HIP_OPTION_CMD="-DKokkos_ENABLE_${HIP_OPT_NAME}=ON ${KOKKOS_HIP_OPTION_CMD}" + fi + done +} + get_kokkos_option_list() { echo parsing KOKKOS_OPTIONS=$KOKKOS_OPTIONS KOKKOS_OPTION_CMD= @@ -196,15 +224,21 @@ display_help_text() { echo "--prefix=/Install/Path: Path to install the KokkosKernels library." echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. 
Options are:" - echo " [AMD]" + echo " [AMD: CPU]" echo " AMDAVX = AMD CPU" - echo " EPYC = AMD EPYC Zen-Core CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [AMD: GPU]" + echo " VEGA900 = AMD GPU MI25 GFX900" + echo " VEGA906 = AMD GPU MI50/MI60 GFX906" + echo " VEGA908 = AMD GPU" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" @@ -264,6 +298,8 @@ display_help_text() { echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" + echo "--with-hip-options=[OPT]: Additional options to HIP:" + echo " rdc" echo "--with-scalars=[SCALARS]: Set scalars to be instantiated." echo " Options: float, double, complex_float, complex_double" echo "--with-ordinals=[ORDINALS]: Set ordinals to be instantiated." @@ -307,6 +343,10 @@ KOKKOS_MAKEINSTALL_J=4 KERNELS_DEFAULT_ETI_OPTION="" +# For tracking if Cuda and Hip devices are enabled simultaneously +WITH_CUDA_BACKEND=OFF +WITH_HIP_BACKEND=OFF + while [[ $# > 0 ]] do key="$1" @@ -340,6 +380,19 @@ do update_kokkos_devices Cuda CUDA_PATH="${key#*=}" ;; + --with-hip) + update_kokkos_devices Hip + HIP_PATH_HIPCC=$(command -v hipcc) + HIP_PATH=${HIP_PATH_HIPCC%/bin/hipcc} + ;; + # Catch this before '--with-hip*' + --with-hip-options*) + KOKKOS_HIP_OPTIONS="${key#*=}" + ;; + --with-hip*) + update_kokkos_devices Hip + HIP_PATH="${key#*=}" + ;; --with-openmp) update_kokkos_devices OpenMP ;; @@ -606,6 +659,7 @@ get_kokkos_device_list get_kokkos_option_list get_kokkos_arch_list get_kokkos_cuda_option_list +get_kokkos_hip_option_list get_kernels_scalar_list get_kernels_ordinals_list @@ -655,9 +709,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" 
-DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 54e0006aa0..31d77bda94 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,6 +11,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) +SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b5611c20ca..c0a1e98ec6 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -34,6 +34,9 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 8d623a67fe..26737b8919 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -24,6 +24,13 @@ ENDFOREACH() ENDFUNCTION() FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) + CMAKE_PARSE_ARGUMENTS(OPT + "" + "" + "VALID_ENTRIES" #if this is a list variable, the valid values in the list + ${ARGN} + ) + SET(CAMEL_NAME KokkosKernels_${SUFFIX}) STRING(TOUPPER ${CAMEL_NAME} UC_NAME) @@ -40,13 +47,28 @@ FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) ENDIF() ENDFOREACH() + #okay, great, we passed the validation test - use the default IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + IF (OPT_VALID_ENTRIES) + STRING(TOUPPER "${OPT_VALID_ENTRIES}" OPT_VALID_ENTRIES_UC) + FOREACH(entry ${${CAMEL_NAME}}) + STRING(TOUPPER ${entry} ENTRY_UC) + IF (NOT ${ENTRY_UC} IN_LIST OPT_VALID_ENTRIES_UC) + MESSAGE(FATAL_ERROR "Given entry ${entry} in list for option ${SUFFIX}. " + "Valid case-insensitive values are any of ${OPT_VALID_ENTRIES}") + ENDIF() + ENDFOREACH() + STRING(TOUPPER "${${CAMEL_NAME}}" GIVEN_ENTRIES_UC) + SET(${UC_NAME} ${GIVEN_ENTRIES_UC} PARENT_SCOPE) + ELSE() + SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + ENDIF() ELSE() SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) ENDIF() + ENDFUNCTION() MACRO(KOKKOSKERNELS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE ) @@ -87,12 +109,6 @@ LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() -FUNCTION(VERIFY_EMPTY CONTEXT) -IF(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") -ENDIF() -ENDFUNCTION() - MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) @@ -161,52 +177,68 @@ IF(NOT TARGET check) ENDIF() FUNCTION(KOKKOSKERNELS_ADD_TEST) -IF (KOKKOSKERNELS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "" - "EXE;NAME" - "" - ${ARGN}) - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - ${ARGN} - COMM serial mpi - NUM_MPI_PROCS 1 - ${TEST_UNPARSED_ARGUMENTS} - ) -ELSE() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" - "CATEGORIES" - ${ARGN}) - IF(TEST_EXE) - SET(EXE ${TEST_EXE}) - ELSE() - SET(EXE ${TEST_NAME}) - ENDIF() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "COMPONENTS" + ${ARGN}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + CMAKE_PARSE_ARGUMENTS(TEST + "" + "EXE;NAME" + "" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE_ROOT ${TEST_EXE}) + ELSE() + SET(EXE_ROOT ${TEST_NAME}) + ENDIF() + + TRIBITS_ADD_TEST( + ${EXE_ROOT} + NAME ${TEST_NAME} + ${ARGN} + COMM serial mpi + NUM_MPI_PROCS 1 + ${TEST_UNPARSED_ARGUMENTS} + ) ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + CMAKE_PARSE_ARGUMENTS(TEST + "WILL_FAIL" + "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" + "CATEGORIES" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE ${TEST_EXE}) + ELSE() + SET(EXE ${TEST_NAME}) + ENDIF() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping test ${TEST_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index e9dde7bf66..c2f46bb8e3 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -10,6 +10,7 @@ MACRO(CHECK_KOKKOS_BACKEND BE) ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) +CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ffb5715e32..ede934023c 100644 --- 
a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -4,11 +4,13 @@ SET(EXEC_SPACES EXECSPACE_CUDA + EXECSPACE_HIP EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -16,11 +18,13 @@ SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE + MEMSPACE_HIPSPACE MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) @@ -57,6 +61,30 @@ IF(KOKKOS_ENABLE_CUDA) ENDIF() +IF(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_HIP + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_HIPSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with HIP and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -109,6 +137,7 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) +SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index 0bd8c04963..4eebb97c7b 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -5,6 +5,12 @@ IF (KOKKOSKERNELS_HAS_TRILINOS) INCLUDE(TribitsETISupport) ENDIF() +FUNCTION(VERIFY_EMPTY CONTEXT) + IF(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") + ENDIF() +ENDFUNCTION() + #MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") MACRO(KOKKOSKERNELS_PACKAGE_POSTPROCESS) @@ -127,88 +133,121 @@ ENDIF() ENDFUNCTION() FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE EXE_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${EXE_NAME} ${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;COMPONENTS;TESTONLYLIBS" + ${ARGN}) +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} Kokkos::kokkoskernels) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS ${TESTONLYLIBS}) + ELSE() + ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + ENDIF() ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping executable ${EXE_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( +FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST ROOT_NAME) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( ${ROOT_NAME} TESTONLYLIBS kokkoskernels_gtest ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi ) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES" - ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi - ) -ELSE() +FUNCTION(KOKKOSKERNELS_IS_ENABLED) CMAKE_PARSE_ARGUMENTS(PARSE "" - "" - "SOURCES;CATEGORIES" + "OUTPUT_VARIABLE" + "COMPONENTS" ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() + + IF (KOKKOSKERNELS_ENABLED_COMPONENTS STREQUAL "ALL") + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ELSEIF(PARSE_COMPONENTS) + SET(ENABLED TRUE) + FOREACH(comp ${PARSE_COMPONENTS}) + STRING(TOUPPER ${comp} COMP_UC) + # make sure this is in the list of enabled components + IF(NOT "${COMP_UC}" IN_LIST KOKKOSKERNELS_ENABLED_COMPONENTS) + # if not in the list, one or more components is missing + SET(ENABLED FALSE) + ENDIF() + ENDFOREACH() + SET(${PARSE_OUTPUT_VARIABLE} ${ENABLED} PARENT_SCOPE) + ELSE() + # we did not enable all components and no components + # were given as part of this - we consider this enabled + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ENDIF() ENDFUNCTION() -MACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE EXE_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE 
+FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + +CMAKE_PARSE_ARGUMENTS(PARSE "" "" - "SOURCES" + "SOURCES;CATEGORIES;COMPONENTS;TESTONLYLIBS" ${ARGN}) -KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkoskernels_gtest - ${PARSE_UNPARSED_ARGUMENTS} +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED ) -IF (NOT KOKKOSKERNELS_HAS_TRILINOS) - TARGET_LINK_LIBRARIES(${EXE_NAME} kokkoskernels_gtest) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + CATEGORIES ${PARSE_CATEGORIES} + TESTONLYLIBS ${PARSE_TESTONLYLIBS} + NUM_MPI_PROCS 1 + COMM serial mpi + ) + ELSE() + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + ENDIF() + KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${EXE_NAME} + ) + ENDIF() +ELSE() + MESSAGE(STATUS "Skipping executable/test ${ROOT_NAME} because not all necessary components enabled") ENDIF() -ADD_DEPENDENCIES(check ${EXE_NAME}) -ENDMACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE) + +ENDFUNCTION() + +MACRO(ADD_COMPONENT_SUBDIRECTORY SUBDIR) + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${SUBDIR} + OUTPUT_VARIABLE COMP_SUBDIR_ENABLED + ) + IF (COMP_SUBDIR_ENABLED) + ADD_SUBDIRECTORY(${SUBDIR}) + ELSE() + MESSAGE(STATUS "Skipping subdirectory ${SUBDIR} because component is not enabled") + ENDIF() + UNSET(COMP_SUBDIR_ENABLED) +ENDMACRO() diff --git a/example/wiki/graph/CMakeLists.txt b/example/wiki/graph/CMakeLists.txt index a8ddec070d..b271038d91 100644 --- a/example/wiki/graph/CMakeLists.txt +++ b/example/wiki/graph/CMakeLists.txt @@ -8,3 +8,18 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES KokkosGraph_wiki_coloring.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_mis2 + SOURCES KokkosGraph_wiki_mis2.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_coarsening + SOURCES KokkosGraph_wiki_coarsening.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_rcm + SOURCES KokkosGraph_wiki_rcm.cpp + ) + diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp new file mode 100644 index 0000000000..93e5660c07 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -0,0 +1,140 @@ +#ifndef WIKI_9PT_STENCIL_H +#define WIKI_9PT_STENCIL_H + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_default_types.hpp" +#include "KokkosKernels_Handle.hpp" +#include +#include +#include +#include +#include + +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using DeviceSpace = typename ExecSpace::memory_space; +using Kokkos::HostSpace; +using RowmapType = Kokkos::View; +using ColindsType = Kokkos::View; +using Handle = KokkosKernels::Experimental:: + KokkosKernelsHandle; + +namespace GraphDemo +{ + Ordinal gridX = 15; + Ordinal gridY = 25; + Ordinal numVertices = gridX * gridY; + + void setGridDimensions(Ordinal newX, Ordinal newY) + { + gridX = newX; + gridY = newY; + numVertices = gridX * gridY; + } + + //Helper to get the vertex ID given grid coordinates + Ordinal getVertexID(Ordinal x, Ordinal y) + { + return y * gridX + x; + } + + //Inverse of getVertexID + void getVertexPos(Ordinal vert, Ordinal& 
x, Ordinal& y) + { + x = vert % gridX; + y = vert / gridX; + } + + //Helper to print out colors in the shape of the grid + template + void printColoring(ColorView colors, Ordinal numColors) + { + //Read colors on host + auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); + int numDigits = ceil(log10(numColors + 1)); + //Print out the grid, with columns aligned and at least one space between numbers + std::ostringstream numFmtStream; + numFmtStream << '%' << numDigits + 1 << 'd'; + std::string numFmt = numFmtStream.str(); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + int color = colorsHost(vertex); + printf(numFmt.c_str(), color); + } + putchar('\n'); + } + } + + template + void printMIS(MISView misList) + { + //Read colors on host + auto misHost = Kokkos::create_mirror_view_and_copy(HostSpace(), misList); + std::set mis; + for(Offset i = 0; i < (Offset) misList.extent(0); i++) + mis.insert(misHost(i)); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + if(mis.find(vertex) == mis.end()) + printf(". "); + else + printf("# "); + } + putchar('\n'); + } + } + + //Build the graph on host, allocate these views on device and copy the graph to them. + //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. + void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) + { + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector colinds; + rowmap[0] = 0; + for(Ordinal vert = 0; vert < numVertices; vert++) + { + Ordinal x, y; + getVertexPos(vert, x, y); + //Loop over the neighbors in a 3x3 region + for(Ordinal ny = y - 1; ny <= y + 1; ny++) + { + for(Ordinal nx = x - 1; nx <= x + 1; nx++) + { + //exclude the edge to self + if(nx == x && ny == y) + continue; + //exclude vertices that would be outside the grid + if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) + continue; + //add the neighbor to colinds, forming an edge + colinds.push_back(getVertexID(nx, ny)); + } + } + //mark where the current row ends + rowmap[vert + 1] = colinds.size(); + } + Offset numEdges = colinds.size(); + //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> colindsHost(colinds.data(), numEdges); + //Allocate owning views on device with the correct size. + rowmapDevice = RowmapType("Rowmap", numVertices + 1); + colindsDevice = ColindsType("Colinds", numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapDevice, rowmapHost); + Kokkos::deep_copy(colindsDevice, colindsHost); + } +} + +#endif diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp new file mode 100644 index 0000000000..dded3fd258 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -0,0 +1,28 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run MIS-2 based coarsening and print the result + { + std::cout << "Coarsened vertex labels:\n"; + Ordinal numClusters = 0; + auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( + rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_FAST); + //coarsening labels can be printed in the same way as colors + GraphDemo::printColoring(labels, numClusters); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index 7e561f5883..56639dad3a 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -1,10 +1,4 @@ -#include -#include -#include -#include -#include "Kokkos_Core.hpp" -#include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_Handle.hpp" +#include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosGraph_Distance2Color.hpp" @@ -17,114 +11,16 @@ // -Different constraint: two vertices separated by a path of length 1 OR 2 // must have different colors) -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; -using ExecSpace = Kokkos::DefaultExecutionSpace; -using DeviceSpace = typename ExecSpace::memory_space; -using Kokkos::HostSpace; -using RowmapType = Kokkos::View; -using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental:: - KokkosKernelsHandle; - -namespace ColoringDemo -{ - constexpr Ordinal gridX = 15; - constexpr Ordinal gridY = 25; - constexpr Ordinal numVertices = gridX * gridY; - - //Helper to get the vertex ID given grid coordinates - Ordinal getVertexID(Ordinal x, Ordinal y) - { - return y * gridX + x; - } - - //Inverse of getVertexID - void getVertexPos(Ordinal vert, Ordinal& x, Ordinal& y) - { - x = vert % gridX; - y = vert / gridX; - } - - //Helper to print out colors in the shape of the grid - template - void printColoring(ColorView colors, Ordinal numColors) - { - //Read colors on host - auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); - int numDigits = ceil(log10(numColors + 1)); - //Print out the grid, with columns aligned and at least one space between numbers - std::ostringstream numFmtStream; - numFmtStream << '%' << numDigits + 1 << 'd'; - std::string numFmt = numFmtStream.str(); - for(Ordinal y = 0; y < gridY; y++) - { - for(Ordinal x = 0; x < gridX; x++) - { - Ordinal vertex = getVertexID(x, y); - int color = colorsHost(vertex); - printf(numFmt.c_str(), color); - } - putchar('\n'); - } - } - - //Build the graph on host, allocate these views on device and copy the graph to them. - //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. 
- void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) - { - //Generate the graph on host (use std::vector to not need to know - //how many entries ahead of time) - std::vector rowmap(numVertices + 1); - std::vector colinds; - rowmap[0] = 0; - for(Ordinal vert = 0; vert < numVertices; vert++) - { - Ordinal x, y; - getVertexPos(vert, x, y); - //Loop over the neighbors in a 3x3 region - for(Ordinal ny = y - 1; ny <= y + 1; ny++) - { - for(Ordinal nx = x - 1; nx <= x + 1; nx++) - { - //exclude the edge to self - if(nx == x && ny == y) - continue; - //exclude vertices that would be outside the grid - if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) - continue; - //add the neighbor to colinds, forming an edge - colinds.push_back(getVertexID(nx, ny)); - } - } - //mark where the current row ends - rowmap[vert + 1] = colinds.size(); - } - Offset numEdges = colinds.size(); - //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory - //The nonowning host views just alias the std::vectors. - Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> colindsHost(colinds.data(), numEdges); - //Allocate owning views on device with the correct size. - rowmapDevice = RowmapType("Rowmap", numVertices + 1); - colindsDevice = ColindsType("Colinds", numEdges); - //Copy the graph from host to device - Kokkos::deep_copy(rowmapDevice, rowmapHost); - Kokkos::deep_copy(colindsDevice, colindsHost); - } -} - int main(int argc, char* argv[]) { Kokkos::initialize(); { - using ColoringDemo::numVertices; + using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; //Step 1: Generate the graph on host, allocate space on device, and copy. //See function "generate9pt" below. - ColoringDemo::generate9pt(rowmapDevice, colindsDevice); + GraphDemo::generate9pt(rowmapDevice, colindsDevice); //Step 2: Create handle and run distance-1 coloring. { Handle handle; @@ -136,7 +32,7 @@ int main(int argc, char* argv[]) auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-1 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_graph_coloring_handle(); @@ -152,7 +48,7 @@ int main(int argc, char* argv[]) auto colors = handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_distance2_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-2 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_distance2_graph_coloring_handle(); diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp new file mode 100644 index 0000000000..416164981b --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -0,0 +1,34 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run distance-2 MIS and print the results, with three different algorithms + { + //Run coloring + auto misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); + std::cout << "Distance-2 MIS, FAST algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); + std::cout << "Distance-2 MIS, QUALITY algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp new file mode 100644 index 0000000000..31073954a4 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ -0,0 +1,68 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_RCM.hpp" + +template +void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, const labels_t& invPermIn) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); + auto invPerm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); + lno_t numVerts = rowmap.extent(0) - 1; + decltype(invPerm) perm(Kokkos::ViewAllocateWithoutInitializing("Perm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + perm(invPerm(i)) = i; + std::vector neighbors; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + neighbors.clear(); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + neighbors.push_back(nei); + } + std::sort(neighbors.begin(), neighbors.end()); + size_t it = 0; + for(lno_t j = 0; j < numVerts; j++) + { + if(it < neighbors.size() && j == neighbors[it]) + { + std::cout << '*'; + it++; + } + else + std::cout << ' '; + } + std::cout << '\n'; + } + std::cout << '\n'; +} + + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + GraphDemo::setGridDimensions(6, 6); + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Make the graph smaller so the matrix can be printed easily + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run RCM and print the reordered matrix + { + auto rcmDevice = KokkosGraph::Experimental::graph_rcm( + rowmapDevice, colindsDevice); + std::cout << "Graph reordered by reverse Cuthill-McKee:\n"; + printReorderedMatrix(rowmapDevice, colindsDevice, rcmDevice); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/master_history.txt b/master_history.txt index 85a5174166..aa7c3dbe54 100644 --- a/master_history.txt +++ b/master_history.txt @@ -10,3 +10,4 @@ tag: 3.0.00 date: 01/31/2020 master: d86db111 release-candidate-3.0: cf2 tag: 3.1.00 date: 04/14/2020 master: f199f45d develop: 8d063eae tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 +tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 2ec1ff57c8..fe3b3c51ba 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -10,11 +10,10 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. -ADD_SUBDIRECTORY(batched) -ADD_SUBDIRECTORY(graph) -ADD_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(batched) +ADD_COMPONENT_SUBDIRECTORY(graph) +ADD_COMPONENT_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(blas) ADD_SUBDIRECTORY(performance) -ADD_SUBDIRECTORY(blas/blas3) -ADD_SUBDIRECTORY(blas/blas) #ADD_SUBDIRECTORY(common) diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt index b9613c7802..36435ecfc1 100644 --- a/perf_test/batched/CMakeLists.txt +++ b/perf_test/batched/CMakeLists.txt @@ -1,5 +1,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag + SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp +) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi + SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp +) diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f37c2d1b6f..ac8abb18f7 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -51,7 +51,7 @@ using namespace KokkosBatched; int main (int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; const bool detail = false; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index adff41c48b..2fffa06855 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git 
a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 7bb2a2907c..031909d540 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 8468800ee6..56ade7a446 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; const int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7b39c624f2..7d352283c6 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int N = 128*128; for (int i=1;i struct parallel_blas_trtri { trtri_args_t trtri_args_; @@ -227,11 +227,11 @@ struct parallel_blas_trtri { KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP template void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -254,9 +254,9 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is defined." 
<< std::endl; __trtri_output_csv_row(options, trtri_args, -1); -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP return; } diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index a46d4a7712..c1e3a117fa 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -2,4 +2,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( - KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp) + KokkosBlas3_perf_test + SOURCES KokkosBlas3_perf_test.cpp +) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 8374c4502d..4952a8e606 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -56,8 +56,11 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 1024 #define DEFAULT_OUT &std::cout -#define DEFAULT_BLAS_ROUTINES "trmm," +#define DEFAULT_BLAS_ROUTINES "trmm,gemm," +#define DEFAULT_TEAM_SIZE 1 +#define DEFAULT_VECTOR_LEN 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -66,29 +69,40 @@ struct perf_test_trmm_args { }; typedef struct perf_test_trmm_args pt_trmm_args_t; +struct perf_test_gemm_args { + std::string gemm_args; //[N,T,C][N,T,C] for transA and transB + default_scalar alpha; + default_scalar beta; +}; +typedef struct perf_test_gemm_args pt_gemm_args_t; // ADD MORE BLAS3 ROUTINE ARG STRUCTS HERE. struct blas_args { pt_trmm_args_t trmm; + pt_gemm_args_t gemm; // ADD MORE BLAS3 ROUTINES HERE + int team_size; + int vector_len; + // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; typedef enum BLAS_ROUTINES { TRMM, + GEMM, // ADD MORE BLAS3 ROUTINES HERE BLAS_ROUTINES_N } blas_routines_e; static std::string blas_routines_e_str[BLAS_ROUTINES_N] = { - "trmm" + "trmm", "gemm" // ADD MORE BLAS3 ROUTINES HERE }; /************************ perf test type definitions ************************/ /** - * @var SERIAL: Run the blas routine iterativley, within a for-loop - * @var PARALLEL: Run the blas routine iterativley, within a + * @var SERIAL: Run the blas routine iteratively, within a for-loop + * @var PARALLEL: Run the blas routine iteratively, within a * Kokkos::parallel_for-loop */ typedef enum LOOP { @@ -98,27 +112,47 @@ typedef enum LOOP { LOOP_N } loop_e; -static std::string loop_e_str[LOOP_N] = {"SERIAL", "PARALLEL"}; +static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** - * @var BLAS: Run the blas routine through the KokkosBlas namespace. - * @var BATCHED: Run the blas routine through the KokkosBatched namespace. + * @var BLAS: Run the blas routine through the + * KokkosBlas namespace. + * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace. + * @var EXPERIMENT: Run the blas routine as a custom + * experiment. 
*/ typedef enum TEST { BLAS, - BATCHED, + BATCHED_SERIAL, + BATCHED_SERIAL_BLOCKED, + BATCHED_TEAM, + BATCHED_TEAM_BLOCKED, + BATCHED_TEAM_VECTOR, + BATCHED_TEAM_VECTOR_BLOCKED, // ADD MORE TEST TYPES HERE + EXPERIMENT, TEST_N } test_e; -static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; +static std::string test_e_str[TEST_N]{ + "blas", "batched_serial", "batched_serial_blocked", "batched_team", + "batched_team_blocked", "batched_team_vector", + "batched_team_vector_blocked", + // ADD MORE TEST TYPES HERE + "experiment"}; /** + * @var k: Number of 2D matrices. * @var m: Number of rows. * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; @@ -157,4 +191,14 @@ struct perf_test_options { std::string blas_routines; }; typedef struct perf_test_options options_t; + +/*************************** Print macros **************************/ +//#define PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG +#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); +#else +#define STATUS +#endif // PERF_TEST_DEBUG +#define FATAL_ERROR(msg) \ + printf("FATAL_ERROR: %s:%s:%d %s\n", __FILE__, __func__, __LINE__, (msg)); #endif // KOKKOSBLAS3_COMMON_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp new file mode 100644 index 0000000000..f26fbb7287 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -0,0 +1,1015 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSBLAS3_GEMM_PERF_TEST_H_ +#define KOKKOSBLAS3_GEMM_PERF_TEST_H_ + +//#include +#include "KokkosBlas3_common.hpp" + +#include + +#include + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +//#include "KokkosBatched_Gemm_Team_Impl.hpp" +//#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" +#include "KokkosBatched_Util.hpp" + +//#define GEMM_PERF_TEST_DEBUG + +// Forward declarations +void do_gemm_serial_blas(options_t options); +void do_gemm_serial_batched(options_t options); +void do_gemm_serial_batched_blocked(options_t options); +// void do_gemm_experiment(options_t options); + +// void do_gemm_serial_blas_parallel(options_t options); +// Not valid! The KokkosBlas::gemm function may take the entire device per +// invocation! +void do_gemm_serial_batched_parallel(options_t options); +void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_team_batched_parallel(options_t options); +void do_gemm_team_batched_blocked_parallel(options_t options); +void do_gemm_team_vector_batched_parallel(options_t options); +void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_experiment_parallel(options_t options); + +struct SerialTag {}; +struct TeamTag {}; +struct TeamVectorTag {}; +struct LayoutLeftTag {}; +struct LayoutRightTag {}; +struct SimdCpuTag {}; + +// gemm invoke table +void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { + { + do_gemm_serial_blas, // BLAS + do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial + NULL, NULL, // Team + NULL, NULL, // TeamVector + NULL // Serial Experiment + }, + { + NULL, // BLAS + do_gemm_serial_batched_parallel, + do_gemm_serial_batched_blocked_parallel, // Serial + do_gemm_team_batched_parallel, + do_gemm_team_batched_blocked_parallel, // Team + do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_experiment_parallel // Parallel Experiment + }}; + +/*************************** Test types and defaults **************************/ +#define DEFAULT_GEMM_ARGS "NN" +#define DEFAULT_GEMM_ALPHA 1.0 + +using view_type_3d = + Kokkos::View; + +struct batched_params { + int team_size; + int vector_len; +}; +typedef struct batched_params batched_params_t; + +struct gemm_args { + char transA, transB; + default_scalar alpha; + default_scalar beta; + view_type_3d A, B, C; + batched_params_t bp; +}; +typedef struct gemm_args gemm_args_t; + +static std::string gemm_csv_header_str = + "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "dims,C_dims,warm_up_n," + "iter,total_time(s),average_time(s)"; + +/*************************** Internal helper fns **************************/ +static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, + double time_in_seconds, + const char *experiment_name = nullptr) { + std::string algo_name = test_e_str[options.test]; + if (experiment_name) algo_name = std::string(experiment_name); + + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," + << options.blas_args.gemm.alpha << "," + << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size + << "," << gemm_args.bp.vector_len << "," + << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) + << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) + << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) + << "x" << 
gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) + << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) + << "," << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n + << std::endl; +} + +static void __print_gemm_perf_test_options(options_t options) { +#ifdef PERF_TEST_DEBUG + printf("options.test = %s\n", test_e_str[options.test].c_str()); + printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); + printf("options.start = %dx%d,%dx%d\n", options.start.a.m, + options.start.a.n, options.start.b.m, options.start.b.n); + printf("options.stop = %dx%d,%dx%d\n", options.stop.a.m, + options.stop.a.n, options.stop.b.m, options.stop.b.n); + printf("options.step = %d\n", options.step); + printf("options.warm_up_n = %d\n", options.warm_up_n); + printf("options.n = %d\n", options.n); + printf("options.blas_args.gemm.gemm_args = %s\n", + options.blas_args.gemm.gemm_args.c_str()); + printf("options.out_file = %s\n", options.out_file.c_str()); + if (std::is_same::value) + printf("options.alpha = %lf\n", options.blas_args.gemm.alpha); + else if (std::is_same::value) + printf("options.alpha = %f\n", options.blas_args.gemm.alpha); +#endif // PERF_TEST_DEBUG + return; +} + +/*************************** Internal templated fns **************************/ +template +void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + STATUS; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, + A, B, _gemm_args.beta, C); + } + }; + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +void __do_gemm_serial_batched_template(options_t options, + gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } + }; + + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
<< std::endl; +#endif // !KOKKOS_ENABLE_CUDA +} + +template +void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'T' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } + return; +} + +#if !defined(KOKKOS_ENABLE_CUDA) +template +struct parallel_blas_gemm { + gemm_args_t gemm_args_; + + parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, + svA, svB, gemm_args_.beta, svC); + } +}; +#endif // !KOKKOS_ENABLE_CUDA + +template +void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + Kokkos::Timer timer; + using execution_space = typename device_type::execution_space; + using functor_type = parallel_blas_gemm; + functor_type parallel_blas_gemm_functor(gemm_args); + + STATUS; + + Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", + Kokkos::RangePolicy(0, warm_up_n), + parallel_blas_gemm_functor); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for("parallelBlasTimedLoopGemm", + Kokkos::RangePolicy(0, n), + parallel_blas_gemm_functor); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
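The serial-batched driver above turns the runtime `transA`/`transB` characters into compile-time `Trans::*` template arguments through an if/else ladder, so each supported combination instantiates its own templated helper. A minimal sketch of that runtime-to-compile-time dispatch, with hypothetical tag types standing in for the KokkosBatched `Trans` tags:

```c++
#include <cstdio>
#include <stdexcept>
#include <type_traits>

// Hypothetical stand-ins for the KokkosBatched::Trans tag types.
struct NoTranspose {};
struct Transpose {};

template <class TransA, class TransB>
void run_gemm() {
  // In the perf test this is where the templated helper would call
  // SerialGemm<TransA, TransB, ...>::invoke on each batch entry.
  std::printf("instantiated for %c%c\n",
              std::is_same<TransA, NoTranspose>::value ? 'N' : 'T',
              std::is_same<TransB, NoTranspose>::value ? 'N' : 'T');
}

// Runtime characters select a compile-time instantiation.
void dispatch(char a, char b) {
  if (a == 'N' && b == 'N')      run_gemm<NoTranspose, NoTranspose>();
  else if (a == 'N' && b == 'T') run_gemm<NoTranspose, Transpose>();
  else if (a == 'T' && b == 'N') run_gemm<Transpose, NoTranspose>();
  else if (a == 'T' && b == 'T') run_gemm<Transpose, Transpose>();
  else throw std::invalid_argument("Bad TransA or TransB value");
}

int main() {
  dispatch('N', 'T');
  return 0;
}
```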
<< std::endl; + __gemm_output_csv_row(options, gemm_args, -1); +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +struct parallel_batched_gemm { + gemm_args_t gemm_args_; + + parallel_batched_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } +}; + +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + +template +void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a 
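The `parallel_batched_gemm` functor above relies on Kokkos work-tag dispatch: a single functor provides one `operator()` per tag (SerialTag, TeamTag, TeamVectorTag), the tag baked into the `Kokkos::TeamPolicy` type selects which overload the runtime calls, and `league_rank()` serves as the batch index. A self-contained illustration of the pattern with hypothetical tags and a trivial body that records which overload ran, not the perf-test functor itself:

```c++
#include <Kokkos_Core.hpp>
#include <cstdio>

struct FlavorA {};
struct FlavorB {};

// One functor, one operator() per work tag; the tag in the TeamPolicy type
// decides which overload Kokkos calls. league_rank() is the batch index.
struct TaggedFunctor {
  using member_type = Kokkos::TeamPolicy<>::member_type;
  Kokkos::View<int*> out;

  KOKKOS_INLINE_FUNCTION
  void operator()(const FlavorA&, const member_type& member) const {
    if (member.team_rank() == 0) out(member.league_rank()) = 1;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const FlavorB&, const member_type& member) const {
    if (member.team_rank() == 0) out(member.league_rank()) = 2;
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int league_size = 4, team_size = 1;
    Kokkos::View<int*> out("out", league_size);
    TaggedFunctor f{out};

    Kokkos::parallel_for("flavorA",
        Kokkos::TeamPolicy<FlavorA>(league_size, team_size), f);
    Kokkos::fence();
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), out);
    std::printf("FlavorA wrote %d for batch 0\n", h(0));

    Kokkos::parallel_for("flavorB",
        Kokkos::TeamPolicy<FlavorB>(league_size, team_size), f);
    Kokkos::fence();
    Kokkos::deep_copy(h, out);
    std::printf("FlavorB wrote %d for batch 0\n", h(0));
  }
  Kokkos::finalize();
  return 0;
}
```

Keeping all flavors in one functor is what lets the timed launch loop stay identical across the Serial, Team, and TeamVector variants; only the policy's tag changes.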
== 'T' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } + + return; +} + +template +struct parallel_batched_gemm_experiment1 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment1(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + + void operator()(const SerialTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } +}; + +/** + * 1. parallel_for(rangePolicy(N)): serialGemm + * + */ +template +void __do_gemm_parallel_experiment1(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + using functor_type = + parallel_batched_gemm_experiment1; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment1_functor(gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment1"); + return; +} + +template +struct parallel_batched_gemm_experiment2_3_4 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment2_3_4(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + // Experiment 2 + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses TeamThreadRange over C-rows + // ThreadVectorRange over C-cols + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } + + // Experiment 3 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutLeftTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_cols = svC.extent(1); + // In a given team, for each vector lane, compute zero or more output + // columns of C depending on the index range 
+ Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_cols), [&](const int &lane_idx) { + auto svB_col = Kokkos::subview(svB, Kokkos::ALL(), lane_idx); + auto svC_col = Kokkos::subview(svC, Kokkos::ALL(), lane_idx); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, svA, + svB_col, + gemm_args_.beta, + svC_col); + }); + } + + // TODO: Why is this faster than the LayoutLeftTag operator above for both + // LayoutLeft and LayoutRight? Experiment 4 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutRightTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_rows = svC.extent(0); + // In a given team, for each vector lane, compute zero or more output rows + // of C depending on the index range + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_rows), [&](const int &lane_idx) { + auto svA_row = Kokkos::subview(svA, lane_idx, Kokkos::ALL()); + auto svC_row = Kokkos::subview(svC, lane_idx, Kokkos::ALL()); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, + svA_row, svB, + gemm_args_.beta, + svC_row); + }); + } +}; + +/** + * 2. case a) + * parallel_for(teamPolicy): TeamVectorGemm + * + */ +template +void __do_gemm_parallel_experiment2(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment2_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment2"); + return; +} + +/** + * 3. case b) + * parallel_for(teamPolicy): + * parallel_for(TeamThreadRange): + * VectorGemm + * + * VectorGemm has not been implemented! + * I think this experiment can be removed. TeamGemm calls TeamThreadRange + * internally! TeamVectorGemm calls both TeamThreadRange and ThreadVectorRange + * internally! 
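Experiments 3 and 4 above hand each vector lane one column (or row) of C via `Kokkos::ThreadVectorRange` and then run a team-level kernel on that slice. The nesting machinery itself, a `ThreadVectorRange` loop inside a team functor with `TeamThreadRange` on the outer level, is shown in this minimal sketch with trivial arithmetic in place of the gemm call and assumed sizes:

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int league_size = 8, nrows = 32, ncols = 16;  // assumed sizes
    Kokkos::View<double***> C("C", league_size, nrows, ncols);

    using policy_type = Kokkos::TeamPolicy<>;
    using member_type = policy_type::member_type;

    Kokkos::parallel_for(
        "nestedTeamVector", policy_type(league_size, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& member) {
          const int batch = member.league_rank();  // one team per batch entry
          // TeamThreadRange splits rows over the threads of the team;
          // ThreadVectorRange splits columns over each thread's vector lanes.
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(member, nrows), [&](const int row) {
                Kokkos::parallel_for(
                    Kokkos::ThreadVectorRange(member, ncols),
                    [&](const int col) { C(batch, row, col) = row + col; });
              });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
```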
+ */ +template +void __do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment3_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment3_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment3_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment3"); + return; +} + +/** + * 4. case c) + * parallel_for(teamPolicy): + * parallel_for(ThreadVectorRange) + * TeamGemm + */ +template +void __do_gemm_parallel_experiment4(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment4_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment4"); + return; +} + +template +class parallel_batched_gemm_experiment5 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment5(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SimdCpuTag &, const int &i) const { + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args.alpha, svA, svB, gemm_args.beta, svC); + } +}; + +/** + * 5. 
+ * parallel_for(RangePolicy(N/vl+(N%vl>0)>): + * serialGemm + * + * Not portable to GPU + */ +template +void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + + // Construct the SimdType + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + using simd_type = KokkosBatched::Vector, vl>; + using simd_view_type = + Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment5; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Increases each array size by sizeof(scalar_type) * (vl-1) bytes! + simd_view_type A("A", simd_batch_size, gemm_args.A.extent(0), + gemm_args.A.extent(1)); + simd_view_type B("B", simd_batch_size, gemm_args.B.extent(0), + gemm_args.B.extent(1)); + simd_view_type C("C", simd_batch_size, gemm_args.C.extent(0), + gemm_args.C.extent(1)); + + // uint64_t seed = Kokkos::Impl::clock_tic(); + // Kokkos::Random_XorShift64_Pool rand_pool(seed); + // Kokkos::fill_random(A, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, simd_type>::max()); + // execution_space::fence(); + + functor_type experiment5_functor(A, B, C, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment5Gemm", + policy_type(0, simd_batch_size), experiment5_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment5Gemm", + policy_type(0, simd_batch_size), experiment5_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment5"); +#else + std::cerr + << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_HIP is defined." 
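Experiment 5 above packs `vl` matrices into one SIMD-valued matrix, so the batch dimension shrinks from `k` entries to `simd_batch_size = k / vl + (k % vl > 0)`, a ceiling division; for example `k = 10` with `vl = 4` gives 3 packed entries, the last only partially filled. A tiny sketch of that arithmetic in plain C++, without the SIMD vector types:

```c++
#include <cstdio>
#include <initializer_list>

// Ceiling division, written the same way as in the experiment-5 driver.
constexpr int packed_batch_size(int k, int vl) { return k / vl + (k % vl > 0); }

int main() {
  const int vl = 4;  // assumed vector length
  for (int k : {1, 4, 10, 16, 17}) {
    std::printf("k=%2d vl=%d -> simd_batch_size=%d\n", k, vl,
                packed_batch_size(k, vl));  // prints 1, 1, 3, 4, 5
  }
  return 0;
}
```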
+ << std::endl; +#endif // !KOKKOS_ENABLE_CUDA || !KOKKOS_ENABLE_HIP + return; +} + +/*************************** Internal setup fns **************************/ +template +gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { + using execution_space = typename device_type::execution_space; + + gemm_args_t gemm_args; + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + STATUS; + + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); + gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); + gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + gemm_args.alpha = options.blas_args.gemm.alpha; + gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.bp.team_size = options.blas_args.team_size; + gemm_args.bp.vector_len = options.blas_args.vector_len; + + Kokkos::fill_random(gemm_args.A, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.B, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.C, rand_pool, + Kokkos::rand, + scalar_type>::max()); + + return gemm_args; +} + +/*************************** Interal run helper fns **************************/ +void __do_loop_and_invoke(options_t options, + void (*fn)(options_t, gemm_args_t)) { + matrix_dims_t cur_dims; + gemm_args_t gemm_args; + STATUS; + + __print_gemm_perf_test_options(options); + std::cout << "SCALAR:" << typeid(default_scalar).name() + << ", LAYOUT:" << typeid(default_layout).name() + << ", DEVICE:" << typeid(default_device).name() << std::endl; + + options.out[0] << gemm_csv_header_str << std::endl; + + for (cur_dims = options.start; + cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && + cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && + cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; + cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, + cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, + cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + gemm_args = __do_setup(options, cur_dims); + fn(options, gemm_args); + } + return; +} + +/*************************** External fns **************************/ +void do_gemm_serial_blas(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_blas); + return; +} + +void do_gemm_serial_batched(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_blocked(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_vector_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +/* void 
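`__do_loop_and_invoke` above sweeps the matrix shapes geometrically: starting from the `--matrix_size_start` dimensions, every dimension is multiplied by `--matrix_size_step` until any dimension passes its stop value, and the selected driver runs once per shape. A small sketch of that sweep under assumed start, stop, and step values:

```c++
#include <cstdio>

struct dims { int m, n; };

int main() {
  // Assumed command-line values: start 16x16, stop 256x256, step 2.
  const dims start{16, 16}, stop{256, 256};
  const int step = 2;

  // Mirrors the loop in __do_loop_and_invoke: the benchmark runs at
  // 16x16, 32x32, 64x64, 128x128, and 256x256.
  for (dims cur = start; cur.m <= stop.m && cur.n <= stop.n;
       cur.m *= step, cur.n *= step) {
    std::printf("benchmark at %dx%d\n", cur.m, cur.n);
  }
  return 0;
}
```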
do_gemm_team_vector_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; +} */ + +void do_gemm_experiment_parallel(options_t options) { + STATUS; + using TransAType = Trans::NoTranspose; + using TransBType = Trans::NoTranspose; + using BlockingType = Algo::Gemm::Unblocked; + + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment1); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment2); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment3); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment4); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment5); +} + +#endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 122f0b3817..b493c244d8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -43,6 +43,7 @@ */ #include "KokkosBlas3_common.hpp" #include "KokkosBlas3_trmm_perf_test.hpp" +#include "KokkosBlas3_gemm_perf_test.hpp" #include #include @@ -61,6 +62,11 @@ static struct option long_options[] = { {"routines", required_argument, 0, 'r'}, {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, + {"gemm_options", required_argument, 0, 'g'}, + {"gemm_alpha", required_argument, 0, 'p'}, + {"team_size", required_argument, 0, 'z'}, + {"vector_len", required_argument, 0, 'n'}, + {"batch_size", required_argument, 0, 'k'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -72,14 +78,12 @@ static void __print_help_blas3_perf_test() { printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); printf("\t\t\tValid values for OPTION:\n"); - printf("%c[1m", 27); - printf("\t\t\t\tblas:"); - printf("%c[0m", 27); - printf(" invoke Kokkos::trmm the loop-body. (default)\n"); - printf("%c[1m", 27); - printf("\t\t\t\tbatched:"); - printf("%c[0m", 27); - printf(" invoke KokkosBatched::SerialTrmm in the loop-body.\n\n"); + for (int i = 0; i < TEST_N; i++) { + printf("%c[1m", 27); + printf("\t\t\t\t%s", test_e_str[i].c_str()); + printf("%c[0m", 27); + printf("\n"); + } printf("\t-o, --trmm_options=OPTION_STRING\n"); printf("\t\tTRMM side, uplo, trans, and diag options.\n"); @@ -93,6 +97,33 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", DEFAULT_TRMM_ALPHA); + printf("\t-g, --gemm_options=OPTION_STRING\n"); + printf("\t\tGEMM transA, and transB options.\n"); + printf( + "\t\t\tValid format for OPTION_STRING is \"%%c%%c\". (default: " + "%s)\n", + DEFAULT_GEMM_ARGS); + + printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); + printf("\t\tGEMM alpha value.\n"); + printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", + DEFAULT_GEMM_ALPHA); + + printf("\t-z, --team_size=SIZE\n"); + printf("\t\tKokkos team size.\n"); + printf("\t\t\tThe value of SIZE as an integer. (default: %d)\n", + DEFAULT_TEAM_SIZE); + + printf("\t-n, --vector_len=LEN\n"); + printf("\t\tKokkos vector length (Heirarchical parallelism).\n"); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", + DEFAULT_VECTOR_LEN); + + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_VECTOR_LEN); + printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); printf("\t\t\tValid values for OPTION:\n"); @@ -105,21 +136,25 @@ static void __print_help_blas3_perf_test() { printf("%c[0m", 27); printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); - printf("\t-b, --matrix_size_start=MxN,IxJ\n"); - printf("\t\tMatrix size selection where A is MxN and B is IxJ (start)\n"); + printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); + printf( + "\t\tMatrix size selection where A is MxN, B is IxJ, and C is PxQ " + "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, - DEFAULT_MATRIX_START); + DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); - printf("\t-e, --matrix_size_stop=PxQ,SxT\n"); - printf("\t\tMatrix size selection where A is PxQ and B is SxT (stop)\n"); + printf("\t-e, --matrix_size_stop=SxT,LxK,OxR\n"); + printf( + "\t\tMatrix size selection where A is SxT, B is LxK, and C is OxR " + "(stop)\n"); printf( - "\t\t\tValid values for P and Q are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "\t\t\tValid dimension values are any non-negative 32-bit integers. " + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, - DEFAULT_MATRIX_STOP); + DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); printf("\t-s, --matrix_size_step=K\n"); printf("\t\tMatrix step selection.\n"); @@ -156,72 +191,106 @@ static void __print_help_blas3_perf_test() { DEFAULT_BLAS_ROUTINES); } -static void __blas3_perf_test_input_error(char **argv, int option_idx) { - fprintf(stderr, "ERROR: invalid option \"%s %s\".\n", argv[option_idx], - argv[option_idx + 1]); - __print_help_blas3_perf_test(); +static void __blas3_perf_test_input_error(char **argv, char short_opt, + char *getopt_optarg) { + fprintf(stderr, "ERROR: invalid option \"-%c %s\". 
Try --help.\n", short_opt, + getopt_optarg); exit(-EINVAL); } int main(int argc, char **argv) { options_t options; - int option_idx = 0, ret; - char *n_str = nullptr, *adim = nullptr, *bdim = nullptr; + int option_idx = 0, ret, i; + char *n_str = nullptr, *adim = nullptr, *bdim = nullptr, *cdim = nullptr; std::filebuf fb; - char *out_file = nullptr; + char *out_file = nullptr; + using rt_type = decltype(do_trmm_invoke); + rt_type *routine_table[BLAS_ROUTINES_N] = { + &do_trmm_invoke, &do_gemm_invoke + // ADD MORE BLAS3 ROUTINES HERE + }; /* set default options */ - options.test = DEFAULT_TEST; - options.loop = DEFAULT_LOOP; - options.start.a.m = DEFAULT_MATRIX_START; - options.start.a.n = DEFAULT_MATRIX_START; - options.stop.a.m = DEFAULT_MATRIX_STOP; - options.stop.a.n = DEFAULT_MATRIX_STOP; - options.start.b.m = DEFAULT_MATRIX_START; - options.start.b.n = DEFAULT_MATRIX_START; - options.stop.b.m = DEFAULT_MATRIX_STOP; - options.stop.b.n = DEFAULT_MATRIX_STOP; - options.step = DEFAULT_STEP; - options.warm_up_n = DEFAULT_WARM_UP_N; - options.n = DEFAULT_N; - options.out = DEFAULT_OUT; - options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.test = DEFAULT_TEST; + options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; + options.start.a.m = DEFAULT_MATRIX_START; + options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; + options.stop.a.m = DEFAULT_MATRIX_STOP; + options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; + options.start.b.m = DEFAULT_MATRIX_START; + options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; + options.stop.b.m = DEFAULT_MATRIX_STOP; + options.stop.b.n = DEFAULT_MATRIX_STOP; + options.start.c.k = DEFAULT_K; + options.start.c.m = DEFAULT_MATRIX_START; + options.start.c.n = DEFAULT_MATRIX_START; + options.stop.c.k = DEFAULT_K; + options.stop.c.m = DEFAULT_MATRIX_STOP; + options.stop.c.n = DEFAULT_MATRIX_STOP; + options.step = DEFAULT_STEP; + options.warm_up_n = DEFAULT_WARM_UP_N; + options.n = DEFAULT_N; + options.out = DEFAULT_OUT; + options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.blas_args.team_size = DEFAULT_TEAM_SIZE; + options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:", long_options, - &option_idx)) != -1) { + options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; + options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': - // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (!strncasecmp(optarg, "blas", 4)) { - options.test = BLAS; - } else if (!strncasecmp(optarg, "batched", 6)) { - options.test = BATCHED; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < TEST_N; i++) { + if (!test_e_str[i].compare(optarg)) { + options.test = (test_e)i; + break; + } + } + if (i == TEST_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'o': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); if (strlen(optarg) != 4) { - __blas3_perf_test_input_error(argv, option_idx); + __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.trmm.trmm_args = optarg; break; + case 'g': + // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); + if (strlen(optarg) != 3) { + __blas3_perf_test_input_error(argv, ret, optarg); + } + options.blas_args.gemm.gemm_args = optarg; + break; + case 'p': + // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); + options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); options.blas_args.trmm.alpha = (default_scalar)atof(optarg); break; case 'l': - if (!strncasecmp(optarg, "serial", 6)) { - options.loop = SERIAL; - } else if (!strncasecmp(optarg, "parallel", 8)) { - options.loop = PARALLEL; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < LOOP_N; i++) { + if (!loop_e_str[i].compare(optarg)) { + options.loop = (loop_e)i; + break; + } + } + if (i == LOOP_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'b': @@ -229,51 +298,78 @@ int main(int argc, char **argv) { bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.a.m = atoi(adim); options.start.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.b.m = atoi(bdim); options.start.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + + n_str[0] = '\0'; + options.start.c.m = atoi(cdim); + options.start.c.n = atoi(&n_str[1]); break; case 'e': adim = optarg; bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.a.m = atoi(adim); options.stop.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.b.m = atoi(bdim); options.stop.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + + n_str[0] = '\0'; + options.stop.c.m = atoi(cdim); + options.stop.c.n = atoi(&n_str[1]); break; case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.start.b.k = options.start.c.k = + options.stop.a.k = options.stop.b.k = options.stop.c.k = + atoi(optarg); + break; + case 'z': options.blas_args.team_size = atoi(optarg); break; + case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); break; - case 'r': options.blas_routines = std::string(optarg); break; + case 'r': options.blas_routines = optarg; break; case '?': - default: __blas3_perf_test_input_error(argv, option_idx); + default: __blas3_perf_test_input_error(argv, ret, optarg); } } @@ -283,16 +379,35 @@ int main(int argc, char **argv) { options.out = &out; } - if 
(options.warm_up_n > options.n) - __blas3_perf_test_input_error(argv, option_idx); + if (options.warm_up_n > options.n) { + fprintf(stderr, "ERROR: warm_up_n=%d > n=%d. Try --help.\n", + options.warm_up_n, options.n); + exit(-EINVAL); + } Kokkos::initialize(argc, argv); - for (int i = 0; i < BLAS_ROUTINES_N; i++) { - if (options.blas_routines.find(blas_routines_e_str[TRMM]) != - std::string::npos) - do_trmm_invoke[options.loop][options.test](options); - // ADD MORE BLAS3 ROUTINES HERE + int err = 0; + for (i = 0; i < BLAS_ROUTINES_N; i++) { + if (options.blas_routines.find(blas_routines_e_str[i]) != + std::string::npos) { + std::cout << "Testing " << blas_routines_e_str[i] << "..." << std::endl; + + auto routine = routine_table[i]; + + if (!routine || !routine[0][options.loop][options.test]) { + std::cerr << "do_" << blas_routines_e_str[i] << "_invoke["; + err = 1; + break; + } + routine[0][options.loop][options.test](options); + } + } + + if (err) { + std::cerr << loop_e_str[options.loop] << "][" << test_e_str[options.test] + << "] not yet implemented!" << std::endl; + exit(-EINVAL); } if (out_file != nullptr) fb.close(); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index e2b62ef8eb..70f7664679 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -55,7 +55,7 @@ #include "KokkosBatched_Trmm_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" -//#define TRMM_PERF_TEST_DEBUG +//#define PERF_TEST_DEBUG // Forward declarations void do_trmm_serial_blas(options_t options); @@ -68,13 +68,6 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { {do_trmm_serial_blas, do_trmm_serial_batched}, {do_trmm_parallel_blas, do_trmm_parallel_batched}}; -/*************************** Print macros **************************/ -#ifdef TRMM_PERF_TEST_DEBUG -#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); -#else -#define STATUS -#endif // TRMM_PERF_TEST_DEBUG - /*************************** Test types and defaults **************************/ #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 @@ -106,7 +99,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, } static void __print_trmm_perf_test_options(options_t options) { -#ifdef TRMM_PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG printf("options.test = %s\n", test_e_str[options.test].c_str()); printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); printf("options.start = %dx%d,%dx%d\n", options.start.a.m, @@ -123,7 +116,7 @@ static void __print_trmm_perf_test_options(options_t options) { printf("options.alpha = %lf\n", options.blas_args.trmm.alpha); else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.trmm.alpha); -#endif // TRMM_PERF_TEST_DEBUG +#endif // PERF_TEST_DEBUG return; } @@ -146,6 +139,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { &trmm_args.diag, trmm_args.alpha, A, B); } + Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; ++i) { auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); @@ -292,7 +286,7 @@ void __do_trmm_serial_batched(options_t options, trmm_args_t trmm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) template struct parallel_blas_trmm { trmm_args_t trmm_args_; @@ -312,7 +306,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) 
{ -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -335,7 +329,9 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is " + "defined." + << std::endl; __trmm_output_csv_row(options, trmm_args, -1); #endif // !KOKKOS_ENABLE_CUDA return; diff --git a/perf_test/blas/blas3/README.md b/perf_test/blas/blas3/README.md index af718ee906..d150d61a32 100644 --- a/perf_test/blas/blas3/README.md +++ b/perf_test/blas/blas3/README.md @@ -19,8 +19,8 @@ void (*do_ROUTINE_invoke[LOOP_N][TEST_N])(options_t) = { }; ``` 3. Update the definitions in `KokkosBlas3_common.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. -4. Add a conditional to invoke the new routine via `do_ROUTINE_invoke` in - `KokkosBlas3_trmm_perf_test.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. +4. Add the `do_ROUTINE_invoke` table to the `routine_table` in + `KokkosBlas3_perf_test.cpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. 5. Update the commandline argument processing in - `KokkosBlas3_trmm_perf_test.hpp` to specify how to run ROUTINE. -6. Append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. + `KokkosBlas3_perf_test.cpp` to specify how to run ROUTINE. +6. To run the new routine by default, append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. diff --git a/perf_test/graph/CMakeLists.txt b/perf_test/graph/CMakeLists.txt index bf7ae17082..134a7acc2e 100644 --- a/perf_test/graph/CMakeLists.txt +++ b/perf_test/graph/CMakeLists.txt @@ -11,6 +11,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosGraph_color_d2.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + graph_mis_d2 + SOURCES KokkosGraph_mis_d2.cpp + ) + #Below will probably fail on GPUs. 
#KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cbc3697517..a3fecb4c99 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -76,6 +76,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #if defined(KOKKOS_ENABLE_CUDA) << spaces << " --cuda Use CUDA (device $id)" << std::endl +#endif +#if defined(KOKKOS_ENABLE_HIP) + << spaces << " --hip Use HIP (device $id)" << std::endl #endif << std::endl << spaces << " Required Parameters:" << std::endl @@ -131,6 +134,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -212,7 +218,7 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -539,7 +545,7 @@ int main (int argc, char ** argv){ std::cout << "Sizeof(idx):" << sizeof(idx) << " sizeof(size_type):" << sizeof(size_type) << std::endl; const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads - const int device_id = 0; + const int device_id = std::max(params.use_cuda, params.use_hip) - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -579,6 +585,15 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_experiment + ( + params + ); + } +#endif + #if defined( KOKKOS_ENABLE_SERIAL ) if (params.use_serial) { #ifdef KOKKOSKERNELS_MULTI_MEM diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 970bafa380..04d977527d 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -81,6 +81,7 @@ struct D2Parameters int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; const char* mtx_file; ColoringMode d2_color_type; @@ -93,6 +94,7 @@ struct D2Parameters use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; mtx_file = NULL; d2_color_type = MODE_D2_SYMMETRIC; @@ -147,6 +149,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use given CUDA device" << std::endl +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use given HIP device" << std::endl #endif << std::endl << spaces << " Coloring modes:" << std::endl @@ -199,6 +204,10 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -273,7 +282,7 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) 
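In the coloring driver above, `--cuda N` and `--hip N` are stored as `1 + N`, so a value of zero still means "backend not requested" and the device id is recovered with `std::max(params.use_cuda, params.use_hip) - 1`. A tiny sketch of that encoding with assumed input:

```c++
#include <algorithm>
#include <cstdio>

int main() {
  // As if the user passed "--hip 2": the stored value is 1 + 2.
  int use_cuda = 0;      // not requested
  int use_hip  = 1 + 2;  // requested, device 2

  if (!use_cuda && !use_hip) {
    std::puts("no backend selected");
    return 1;
  }
  const int device_id = std::max(use_cuda, use_hip) - 1;
  std::printf("initialize on device %d\n", device_id);  // prints 2
  return 0;
}
```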
print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -603,6 +612,8 @@ int main(int argc, char *argv[]) int device_id = 0; if(params.use_cuda) device_id = params.use_cuda - 1; + else if(params.use_hip) + device_id = params.use_hip - 1; Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); // Print out verbose information about the configuration of the run. @@ -645,6 +656,16 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + if(!use_multi_mem) + { + KokkosKernels::Experiment::experiment_driver(params); + } + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp new file mode 100644 index 0000000000..32ff5f5fbd --- /dev/null +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -0,0 +1,397 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spadd.hpp" +#include "KokkosGraph_MIS2.hpp" +#include "KokkosKernels_default_types.hpp" + +using namespace KokkosGraph; + +struct MIS2Parameters +{ + int repeat = 1; + bool verbose = false; + int use_threads = 0; + int use_openmp = 0; + int use_cuda = 0; + int use_hip = 0; + int use_serial = 0; + const char* mtx_file = NULL; + MIS2_Algorithm algo = MIS2_FAST; +}; + +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. + bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} + +void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) +{ + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Required Parameters:" << std::endl + << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl + << std::endl + << spaces << " Device type (the following are enabled in this build):" << std::endl +#ifdef KOKKOS_ENABLE_SERIAL + << spaces << " --serial Execute serially." << std::endl +#endif +#ifdef KOKKOS_ENABLE_THREADS + << spaces << " --threads Use posix threads.\n" +#endif +#ifdef KOKKOS_ENABLE_OPENMP + << spaces << " --openmp Use OpenMP.\n" +#endif +#ifdef KOKKOS_ENABLE_CUDA + << spaces << " --cuda Use CUDA.\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use HIP.\n" +#endif + << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces << " --algo alg alg: fast, quality" << std::endl + << spaces << " --repeat Set number of test repetitions (Default: 1) " << std::endl + << spaces << " --verbose Enable verbose mode (record and print timing + extra information)" << std::endl + << spaces << " --help Print out command line help." 
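`verifyD2MIS` above checks both halves of the distance-2 MIS definition: a vertex outside the set must have some set member within two hops (maximality), and a vertex inside the set must not (independence). On the 5-vertex path graph 0-1-2-3-4, for example, the set {0, 3} passes both checks. A small host-only sketch of the same check on that path graph, using plain arrays in CRS form rather than the Kokkos views of the verifier:

```c++
#include <cstdio>
#include <set>
#include <vector>

int main() {
  // CRS adjacency of the undirected path 0-1-2-3-4.
  std::vector<int> rowmap  = {0, 1, 3, 5, 7, 8};
  std::vector<int> entries = {1, 0, 2, 1, 3, 2, 4, 3};
  const int numVerts = 5;

  std::set<int> mis = {0, 3};  // candidate distance-2 independent set

  bool ok = true;
  for (int i = 0; i < numVerts; i++) {
    // Is some other set member within two hops of i?
    bool misIn2Hops = false;
    for (int j = rowmap[i]; j < rowmap[i + 1] && !misIn2Hops; j++) {
      int nei1 = entries[j];
      if (nei1 != i && mis.count(nei1)) { misIn2Hops = true; break; }
      for (int k = rowmap[nei1]; k < rowmap[nei1 + 1]; k++) {
        int nei2 = entries[k];
        if (nei2 != i && mis.count(nei2)) { misIn2Hops = true; break; }
      }
    }
    // In-set vertex with a set member <= 2 hops away, or out-of-set vertex
    // with none: both violate the distance-2 MIS property.
    if (mis.count(i) ? misIn2Hops : !misIn2Hops) ok = false;
  }
  std::printf("{0,3} is %s distance-2 MIS of the path graph\n",
              ok ? "a valid" : "NOT a valid");
  return 0;
}
```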
<< std::endl + << spaces << " " << std::endl; +} + +static char* getNextArg(int& i, int argc, char** argv) +{ + i++; + if(i >= argc) + { + std::cerr << "Error: expected additional command-line argument!\n"; + exit(1); + } + return argv[i]; +} + +int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) +{ + bool got_required_param_amtx = false; + for(int i = 1; i < argc; ++i) + { + if(0 == strcasecmp(argv[i], "--threads")) + { + params.use_threads = 1; + } + else if(0 == strcasecmp(argv[i], "--serial")) + { + params.use_serial = 1; + } + else if(0 == strcasecmp(argv[i], "--openmp")) + { + params.use_openmp = 1; + } + else if(0 == strcasecmp(argv[i], "--cuda")) + { + params.use_cuda = 1; + } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1; + } + else if(0 == strcasecmp(argv[i], "--repeat")) + { + params.repeat = atoi(getNextArg(i, argc, argv)); + if(params.repeat <= 0) + { + std::cout << "*** Repeat count must be positive, defaulting to 1.\n"; + params.repeat = 1; + } + } + else if(0 == strcasecmp(argv[i], "--amtx")) + { + got_required_param_amtx = true; + params.mtx_file = getNextArg(i, argc, argv); + } + else if(0 == strcasecmp(argv[i], "--algo")) + { + const char* algName = getNextArg(i, argc, argv); + if(!strcasecmp(algName, "fast")) + params.algo = MIS2_FAST; + else if(!strcasecmp(algName, "quality")) + params.algo = MIS2_QUALITY; + else + throw std::invalid_argument("Algorithm not valid: must be 'fast' or 'quality'"); + } + else if(0 == strcasecmp(argv[i], "--verbose")) + { + params.verbose = true; + } + else if(0 == strcasecmp(argv[i], "--help") || 0 == strcasecmp(argv[i], "-h")) + { + print_options(std::cout, argv[0]); + return 1; + } + else + { + std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + } + + if(!got_required_param_amtx) + { + std::cout << "Missing required parameter amtx" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) + { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} + +template +void run_mis2(const MIS2Parameters& params) +{ + using size_type = default_size_type; + using lno_t = default_lno_t; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using lno_view_t = typename crsMat_t::index_type::non_const_type; + using KKH = KokkosKernels::Experimental::KokkosKernelsHandle; + + Kokkos::Timer t; + crsMat_t A_in = KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + std::cout << "I/O time: " << t.seconds() << " s\n"; + t.reset(); + //Symmetrize the matrix just in case + crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in); + crsMat_t A; + KKH kkh; + kkh.create_spadd_handle(false); + KokkosSparse::spadd_symbolic(&kkh, A_in, At_in, A); + KokkosSparse::spadd_numeric(&kkh, 1.0, A_in, 1.0, At_in, A); + kkh.destroy_spadd_handle(); + std::cout << "Time to symmetrize: " << t.seconds() << " s\n"; + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + lno_t numVerts = A.numRows(); + + std::cout << "Num verts: " << numVerts << '\n' + << "Num edges: " << A.nnz() << '\n'; + + lno_view_t mis; + + t.reset(); + for(int rep = 0; rep < params.repeat; rep++) + { + mis = KokkosGraph::Experimental::graph_d2_mis(rowmap, entries, params.algo); + exec_space().fence(); + } + 
double totalTime = t.seconds(); + std::cout << "MIS-2 average time: " << totalTime / params.repeat << '\n'; + std::cout << "MIS size: " << mis.extent(0) << '\n'; + + if(params.verbose) + { + std::cout << "Vertices in independent set:\n"; + KokkosKernels::Impl::print_1Dview(mis); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + if(verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost)) + std::cout << "MIS-2 is correct.\n"; + else + std::cout << "*** MIS-2 not correct! ***\n"; + } +} + +int main(int argc, char *argv[]) +{ + MIS2Parameters params; + + if(parse_inputs(params, argc, argv)) + { + return 1; + } + + if(params.mtx_file == NULL) + { + std::cerr << "Provide a matrix file" << std::endl; + return 0; + } + + Kokkos::initialize(); + + bool run = false; + + #if defined(KOKKOS_ENABLE_OPENMP) + if(params.use_openmp) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_THREADS) + if(params.use_threads) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_CUDA) + if(params.use_cuda) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_SERIAL) + if(params.use_serial) + { + run_mis2(params); + run = true; + } + #endif + + if(!run) + { + std::cerr << "*** ERROR: did not run, none of the supported device types were selected.\n"; + } + + Kokkos::finalize(); + + return 0; +} diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 6f0b6c73df..63a52dbaea 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -54,7 +54,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda" << std::endl; + std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda | --hip" << std::endl; std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; std::cerr << "\tInput Matrix format can be multiple formats. If it ends with:" << std::endl; std::cerr << "\t\t.mtx: it will read matrix market format." 
<< std::endl; @@ -96,6 +96,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi( argv[++i] ); } @@ -292,7 +295,6 @@ int main (int argc, char ** argv){ const int device_id = 0; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); -#if !defined (KOKKOS_ENABLE_CUDA) #if defined( KOKKOS_ENABLE_OPENMP ) if (params.use_openmp) { @@ -311,10 +313,9 @@ int main (int argc, char ** argv){ } #endif -#endif -#if defined( KOKKOS_ENABLE_CUDA1 ) +#if defined( KOKKOS_ENABLE_CUDA ) if (params.use_cuda) { Kokkos::Cuda::print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM @@ -332,6 +333,16 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + Kokkos::Experimental::HIP::print_configuration(std::cout); + KokkosKernels::Experiment::run_multi_mem_triangle + ( + params + ); + } +#endif + Kokkos::finalize(); return 0; diff --git a/perf_test/performance/CMakeLists.txt b/perf_test/performance/CMakeLists.txt index 09593b3128..93d377ba60 100644 --- a/perf_test/performance/CMakeLists.txt +++ b/perf_test/performance/CMakeLists.txt @@ -11,12 +11,12 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) IF(TPL_ENABLE_yaml-cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_validate SOURCES performance_validate.cpp ) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_example SOURCES performance_example.cpp ) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index da22993cda..f0662e4a08 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -43,6 +43,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_spmv.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp + ) + IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spmv_merge diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp new file mode 100644 index 0000000000..aa8f2ddfa3 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -0,0 +1,185 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" + +typedef default_scalar Scalar; +typedef default_lno_t Ordinal; +typedef default_size_type Offset; + +template +void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { + typedef KokkosSparse::CrsMatrix matrix_type; + typedef typename Kokkos::View mv_type; + typedef typename mv_type::HostMirror h_mv_type; + + srand(17312837); + matrix_type A; + if(filename) + A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + else + { + Offset nnz = 10 * numRows; + //note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, 0, 0.01 * numRows); + } + numRows = A.numRows(); + numCols = A.numCols(); + mv_type x("X", numCols, num_vecs); + mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for(int v = 0; v < num_vecs; v++) + { + for(int i=0; i::value) + layout = 'L'; + else + layout = 'R'; + int loop = 100; + int num_vecs = 1; + Scalar beta = 0.0; + + if(argc == 1) { + print_help(); + return 0; + } + + for(int i=0;i(size,size,filename,loop,num_vecs,mode,beta); + else + run_spmv(size,size,filename,loop,num_vecs,mode,beta); + + Kokkos::finalize(); +} + diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 681327dfaf..0f6351189b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -43,32 +43,24 @@ */ #include -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_default_types.hpp" +#include #define MAXVAL 1 -#define SIZE_TYPE size_t -#define INDEX_TYPE int -#define SCALAR_TYPE double - - - template -scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ +scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0){ scalar_view_t kok_x ("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view (kok_x); - for (INDEX_TYPE i = 0; i < nv; ++i){ - SCALAR_TYPE r = static_cast (rand()) / static_cast (RAND_MAX / max_value); + for (default_lno_t i = 0; i < nv; ++i){ + default_scalar r = static_cast 
(rand()) / static_cast (RAND_MAX / max_value); h_x(i) = r; } Kokkos::deep_copy (kok_x, h_x); @@ -98,7 +90,7 @@ void run_experiment( typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - INDEX_TYPE nv = crsmat.numRows(); + default_lno_t nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); @@ -255,25 +247,70 @@ void run_experiment( */ } - - - enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_HIP , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV + , CMD_DEVICE , CMD_BIN_MTX , CMD_CLUSTER_SIZE , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; +template +void run_pcg(int* cmdline, const char* mtx_file) +{ + default_lno_t nv = 0, ne = 0; + default_lno_t *xadj, *adj; + default_scalar *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_file); + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); + + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (default_lno_t i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + + for (default_lno_t i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + + delete [] xadj; + delete [] adj; + delete [] ew; + + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); +} + int main (int argc, char ** argv){ int cmdline[ CMD_COUNT ] ; - char *mtx_bin_file = NULL; + char *mtx_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; for ( int i = 1 ; i < argc ; ++i ) { @@ -283,17 +320,22 @@ int main (int argc, char ** argv){ else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); } + /* else if ( 0 == strcasecmp( argv[i] , "--cores" ) ) { + //Note BMK: specifying #NUMA regions isn't supported by initialize sscanf( argv[++i] , "%dx%d" , cmdline + CMD_USE_NUMA , cmdline + CMD_USE_CORE_PER_NUMA ); } + */ else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { cmdline[ CMD_USE_CUDA ] = 1 ; } - else if ( 0 == strcasecmp( argv[i] , "--cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + cmdline[ CMD_USE_HIP ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--device-id" ) ) { + cmdline[ CMD_DEVICE ] = atoi( argv[++i] ) ; } else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); @@ -303,12 +345,12 @@ int main (int argc, char ** argv){ } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { - mtx_bin_file = argv[++i]; + mtx_file = argv[++i]; } else { cmdline[ CMD_ERROR ] = 1 ; std::cerr 
<< "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; return 0; } @@ -317,190 +359,43 @@ int main (int argc, char ** argv){ if(cmdline[CMD_CLUSTER_SIZE] == 0) cmdline[CMD_CLUSTER_SIZE] = 1; - if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + if (mtx_file == NULL){ + std::cerr << "Provide a matrix file" << std::endl ; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[matrix]" << std::endl; return 0; } + Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space -#if defined( KOKKOS_ENABLE_THREADS ) - - if ( cmdline[ CMD_USE_THREADS ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; // How to get this to initialize() without using impl_initialize()? - } - else { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } + init_args.device_id = cmdline[ CMD_DEVICE ]; + if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { + init_args.num_threads = std::max(cmdline[ CMD_USE_THREADS ], cmdline [ CMD_USE_OPENMP ]); + init_args.num_numa = cmdline[ CMD_USE_NUMA ]; + } + else { + init_args.num_threads = cmdline[ CMD_USE_THREADS ]; + } - Kokkos::finalize(); - } + Kokkos::initialize( init_args ); + { +#if defined( KOKKOS_ENABLE_THREADS ) + if(cmdline[CMD_USE_THREADS]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space 
- - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; - } - else { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_OPENMP]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - // Use the last device: - init_args.device_id = cmdline[ CMD_USE_CUDA_DEV ]; - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - // typedef typename KokkosSparse::CrsMatrix crsMat_t; - // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], 
cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_CUDA]) + run_pcg(cmdline, mtx_file); #endif - +#if defined( KOKKOS_ENABLE_HIP ) + if(cmdline[CMD_USE_HIP]) + run_pcg(cmdline, mtx_file); +#endif + } + Kokkos::finalize(); return 0; } -#else -int main() { -} -#endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f90c6179f7..959e9d973c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -60,7 +60,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 80e4ab7c34..0f1c9f6210 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -52,7 +52,7 @@ void print_options(){ std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; + std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; std::cerr << "\t[Optional] '--algorithm [DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use KKSPGEMM instead." << std::endl; std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB" << std::endl; std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> to write output C=AxB" << std::endl; @@ -84,6 +84,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -297,7 +300,7 @@ int main (int argc, char ** argv){ } const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = params.use_cuda - 1; + const int device_id = params.use_cuda ? 
params.use_cuda - 1 : params.use_hip - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -336,6 +339,16 @@ int main (int argc, char ** argv){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_spgemm + ( + params + ); + + } +#endif + #if defined( KOKKOS_ENABLE_THREADS ) //If only serial is enabled (or no other device was specified), run with serial if (params.use_threads) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 60779f7fe5..afef5968f0 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -66,12 +66,6 @@ enum {STRUCT, UNSTR}; -#ifdef INT64 -typedef long long int LocalOrdinalType; -#else -typedef int LocalOrdinalType; -#endif - void print_help() { printf("SPMV_struct benchmark code written by Luc Berger-Vergiat.\n"); printf("Options:\n"); @@ -482,6 +476,73 @@ int main(int argc, char **argv) if(compare_cusparse) { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#ifdef CUSPARSE_VERSION + KokkosKernels::Experimental::Controls controls; + + cusparseIndexType_t myCusparseOffsetType = CUSPARSE_INDEX_32I; + cusparseIndexType_t myCusparseEntryType = CUSPARSE_INDEX_32I; + cudaDataType myCudaDataType = CUDA_R_64F; + + /* create matrix */ + cusparseSpMatDescr_t A_cusparse; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(&A_cusparse, A.numRows(), A.numCols(), A.nnz(), + (void*) A.graph.row_map.data(), + (void*) A.graph.entries.data(), + (void*) A.values.data(), + myCusparseOffsetType, + myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, + myCudaDataType)); + + /* create lhs and rhs */ + cusparseDnVecDescr_t vecX, vecY; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecX, x1.extent_int(0), (void*) x1.data(), myCudaDataType)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecY, y1.extent_int(0), (void*) y1.data(), myCudaDataType)); + + const double alpha = 1.0, beta = 1.0; + size_t bufferSize = 0; + void* dBuffer = NULL; + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize(controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, + alg, &bufferSize)); + CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); + + /* perform SpMV */ + Kokkos::Profiling::pushRegion("cuSparse spmv test"); + double min_time = 1.0e32; + double max_time = 0.0; + double ave_time = 0.0; + for(int i=0;imax_time) max_time = time; + if(time call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" 
--cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} 
${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 - if [[ "$MACHINE" = bowman* ]] || [[ "$MACHINE" = white* ]]; then + if [[ "$MACHINE" = white* ]]; then make_par_lvl=48 fi local -i build_start_time=$(date +%s) @@ -1261,7 +1253,7 @@ single_build_and_test() { comment="build_time=$(($build_end_time-$build_start_time))" if [[ "$BUILD_ONLY" == False ]]; then - run_cmd ctest --timeout 2500 -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } + run_cmd ctest --timeout ${CTESTTIMEOUT} -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } local -i run_end_time=$(date +%s) comment="$comment run_time=$(($run_end_time-$build_end_time))" fi @@ -1296,6 +1288,9 @@ run_in_background() { if [[ "$compiler" == cuda* ]]; then num_jobs=1 fi + if [[ "$compiler" == rocm* ]]; then + num_jobs=1 + fi if [[ "$compiler" == clang ]]; then num_jobs=1 fi @@ -1405,8 +1400,7 @@ wait_summarize_and_exit() { # CM_ALL_SCRIPT=$0 -CM_ALL_SCRIPT_PATH=`pwd` -CM_ALL_SCRIPT_PATH=${CM_ALL_SCRIPT_PATH}/`dirname $CM_ALL_SCRIPT` +CM_ALL_SCRIPT_PATH=$(cd `dirname $CM_ALL_SCRIPT` && pwd) ROOT_DIR=$(get_test_root_dir) mkdir -p $ROOT_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh deleted file mode 100755 index 375b7f8712..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -export TRILINOS_DIR=${PWD}/../.. - -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-dbg - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ - -DTpetra_ENABLE_DEBUG=ON \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepOffdbg -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh deleted file mode 100755 index 9f35eeed3f..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-opt - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepCodeOFF -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Or submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh index 41160c938c..c6af962034 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh @@ -33,7 +33,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -DTpetra_ENABLE_DEBUG=ON \ diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh index 955821005f..9403741586 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh deleted file mode 100755 index da9017e388..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-rdc-release-debug-pt - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestKokkos-DepCodeOn-rdcpt -W 07:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh index 01e2def015..d508d4c77a 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh @@ -16,7 +16,6 @@ cmake \ -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ -DTrilinos_ENABLE_TESTS=ON \ -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh index 76e0391912..7be71edc1c 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index ce3693409c..822efa28b8 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,6 +1,30 @@ #!/bin/bash -if [ "$1" = bowman ]; then - export LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib/gcc/x86_64-pc-linux-gnu/6.2.0:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LIBRARY_PATH - export LD_LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib64:/home/projects/x86-64-knl/gcc/6.2.0/lib:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LD_LIBRARY_PATH +if [ "$1" = blake ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev-2 ]; then + 
ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + module list + fi +fi +if [ "$1" = sems ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module list + fi fi diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df360c69de..22c17b5247 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,285 +56,332 @@ SET(ETI_HEADERS) #Generate @X@ variables in the template X.hpp.in and X.cpp.in #files containing the list of all needed macros KOKKOSKERNELS_GENERATE_ETI(Blas1_abs abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_abs_mv abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal_mv scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby_mv axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_update update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_update_mv update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum_mv sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1 nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1_mv nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) 
KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w_mv nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf_mv nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax_mv iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2 nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2_mv nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult_mv mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal_mv reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas2_gemv gemv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trsm trsm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + 
TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_symbolic spgemm_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_numeric spiluk_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_symbolic sptrsv_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_trsv trsv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_symbolic gauss_seidel_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_numeric gauss_seidel_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp new file mode 100644 index 0000000000..5ecb959f7e --- /dev/null +++ b/src/KokkosKernels_Half.hpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSKERNELS_HALF_HPP
+#define KOKKOSKERNELS_HALF_HPP
+
+#include "Kokkos_Core.hpp"
+
+namespace KokkosKernels {
+  namespace Experimental {
+    ////////////// BEGIN FP16/binary16 limits //////////////
+    #define KOKKOSKERNELS_IMPL_FP16_MAX 65504.0F // Maximum normalized number
+    #define KOKKOSKERNELS_IMPL_FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number
+    #define KOKKOSKERNELS_IMPL_FP16_RADIX 2 // Value of the base of the exponent representation. TODO: Confirm this
+    #define KOKKOSKERNELS_IMPL_FP16_MANT_DIG 15 // Number of digits in the mantissa that can be represented without losing precision. TODO: Confirm this
+    #define KOKKOSKERNELS_IMPL_FP16_MIN_EXP -14 // This is the smallest possible exponent value
+    #define KOKKOSKERNELS_IMPL_FP16_MAX_EXP 15 // This is the largest possible exponent value
+    #define KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS 10
+    #define KOKKOSKERNELS_IMPL_FP16_EPSILON 0.0009765625F
+    #define KOKKOSKERNELS_IMPL_HUGE_VALH 0x7c00 // bits [10,14] set.
+    ////////////// END FP16/binary16 limits //////////////
+  } // Experimental
+} // KokkosKernels
+#endif // KOKKOSKERNELS_HALF_HPP
diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp
index 23e6f5e125..83c483a3d6 100644
--- a/src/Kokkos_ArithTraits.hpp
+++ b/src/Kokkos_ArithTraits.hpp
@@ -50,6 +50,8 @@
 #include
 #include
+#include
+#include
 #ifdef HAVE_KOKKOSKERNELS_QUADMATH
 # include
@@ -63,16 +65,6 @@
 #ifdef __CUDACC__
 # include
 #endif
-//
-// mfh 24 Dec 2013: Temporary measure for testing; will go away.
-// -#ifndef KOKKOS_FORCEINLINE_FUNCTION -# ifdef __CUDA_ARCH__ -# define KOKKOS_FORCEINLINE_FUNCTION inline __host__ __device__ -# else -# define KOKKOS_FORCEINLINE_FUNCTION -# endif // __CUDA_ARCH__ -#endif // KOKKOS_FORCEINLINE_FUNCTION namespace { // anonymous @@ -674,6 +666,179 @@ class ArithTraits { //@} }; +// Since Kokkos::Experimental::half_t falls back to float, only define +// ArithTraits if half_t is a backend specialization +#if defined(KOKKOS_HALF_T_IS_FLOAT) &&\ + !KOKKOS_HALF_T_IS_FLOAT +template <> +class ArithTraits { +public: + typedef Kokkos::Experimental::half_t val_type; + typedef val_type mag_type; + + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool is_complex = false; + + static constexpr bool has_infinity = true; + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half(HUGE_VALF); } + + static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isinf; + #endif + return isinf (Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isnan; + #endif + return isnan(Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { + return Kokkos::Experimental::cast_to_half(fabs(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero () { + return Kokkos::Experimental::cast_to_half(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one () { + return Kokkos::Experimental::cast_to_half(1.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min () { + return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag (const val_type) { + return Kokkos::Experimental::cast_to_half(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conj (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const val_type y) { + return Kokkos::Experimental::cast_to_half(::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sqrt (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cbrt (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { + return Kokkos::Experimental::cast_to_half(::exp (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { + return Kokkos::Experimental::cast_to_half(::log (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { + return Kokkos::Experimental::cast_to_half(::log10 (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sin 
(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cos (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { + return Kokkos::Experimental::cast_to_half(::tan (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sinh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cosh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::tanh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { + return Kokkos::Experimental::cast_to_half(::asin (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { + return Kokkos::Experimental::cast_to_half(::acos (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { + return Kokkos::Experimental::cast_to_half(::atan (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { + //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); + } + // Backwards compatibility with Teuchos::ScalarTraits. + typedef mag_type magnitudeType; + // C++ doesn't have a standard "half-float" type. + typedef val_type halfPrecision; + typedef double doublePrecision; + + static const bool isComplex = false; + static const bool isOrdinal = false; + static const bool isComparable = true; + static const bool hasMachineParameters = true; + static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf (const val_type x) { + return isNan (x) || isInf (x); + } + static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude (const val_type x) { + return abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate (const val_type x) { + return conj(x); + } + static std::string name () { + return "half"; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { + return sqrt(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { +#ifdef __CUDA_ARCH__ + return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); +#else + return Kokkos::Experimental::cast_to_half(std::numeric_limits::quiet_NaN()); +#endif // __CUDA_ARCH__ + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type eps () { + return epsilon (); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FORCEINLINE_FUNCTION int base () { + return KOKKOSKERNELS_IMPL_FP16_RADIX; + } + // Use float to allow running on both host and device + static KOKKOS_FORCEINLINE_FUNCTION float prec () { + float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; + float b = (float) base(); + float r = e * b; + return r; + } + static KOKKOS_FORCEINLINE_FUNCTION int t () { + return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd () { + return Kokkos::Experimental::cast_to_half(1.0); + } + static KOKKOS_FORCEINLINE_FUNCTION int emin () { + return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; + } + static 
KOKKOS_FORCEINLINE_FUNCTION mag_type rmin () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FORCEINLINE_FUNCTION int emax () { + return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } +}; +#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF template<> class ArithTraits { @@ -691,13 +856,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -801,9 +966,11 @@ class ArithTraits { return sqrt (x); } static KOKKOS_FORCEINLINE_FUNCTION float nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN_F; //return nan (); //this returns 0??? +#elif defined(__HIP_DEVICE_COMPILE__) + return ::nanf(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -840,7 +1007,6 @@ class ArithTraits { } }; - /// \brief Partial specialization for std::complex. /// /// The C++ Standard Library (with C++03 at least) only allows @@ -865,13 +1031,13 @@ class ArithTraits > { } static bool isInf (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (real (x)) || isinf (imag (x)); } static bool isNan (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (real (x)) || isnan (imag (x)); @@ -1045,13 +1211,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -1126,9 +1292,11 @@ class ArithTraits { return ::atan (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN; //return nan (); // this returns 0 ??? +#elif defined(__HIP_DEVICE_COMPILE__) + return ::nan(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -1140,8 +1308,10 @@ class ArithTraits { // Backwards compatibility with Teuchos::ScalarTraits. typedef mag_type magnitudeType; typedef float halfPrecision; -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) typedef double doublePrecision; // CUDA doesn't support long double, unfortunately +#elif defined(__HIP_DEVICE_COMPILE__) + typedef double doublePrecision; // HIP does not support long double unfortunately #else typedef long double doublePrecision; #endif // __CUDA_ARCH__ @@ -1197,9 +1367,10 @@ class ArithTraits { }; -// CUDA does not support long double in device functions, so none of -// the class methods in this specialization are marked as device -// functions. 
+// CUDA and HIP do not support long double in device functions, +// so none of the class methods in this specialization are marked +// as device functions. +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> class ArithTraits { public: @@ -1213,18 +1384,14 @@ class ArithTraits { static const bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION long double infinity() { return HUGE_VALL; } + static long double infinity() { return HUGE_VALL; } static bool isInf (const val_type& x) { - #ifndef __CUDA_ARCH__ using std::isinf; - #endif return isinf (x); } static bool isNan (const val_type& x) { - #ifndef __CUDA_ARCH__ using std::isnan; - #endif return isnan (x); } static mag_type abs (const val_type& x) { @@ -1359,7 +1526,8 @@ class ArithTraits { static mag_type rmax () { return LDBL_MAX; } -}; +}; // long double specialization +#endif // KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST #ifdef HAVE_KOKKOSKERNELS_QUADMATH @@ -2923,11 +3091,13 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (abs (x)))); + using std::sqrt; + using std::abs; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (abs (x)))); #else - return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { return static_cast ( ::log (static_cast (abs (x)))); @@ -3048,18 +3218,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); + using std::sqrt; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (x))); #else - return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); -#else +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ +#else + return static_cast ( ::cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -3184,7 +3356,15 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::sqrt; + using std::abs; + // IEEE 754 promises that long double has at least 64 significand + // bits, so we can use it to represent any signed or unsigned + // 64-bit integer type exactly. However, CUDA does not implement + // long double for device functions. + return static_cast ( sqrt (static_cast (abs (x)))); +#else // Casting from a 64-bit integer type to double does result in a // loss of accuracy. However, it gives us a good first // approximation. For very large numbers, we may lose some @@ -3196,20 +3376,16 @@ class ArithTraits { // correctness. It actually should suffice to check numbers // within 1 of the result. 
return static_cast ( ::sqrt (static_cast (abs (x)))); -#else - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. - return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (abs (x)))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; + using std::abs; + return static_cast ( cbrtl (static_cast (abs (x)))); #else - return static_cast ( ::cbrtl (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( ::cbrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -3334,18 +3510,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::sqrt; + return static_cast ( sqrt (static_cast (x))); #else - return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( ::sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; + return static_cast ( cbrtl (static_cast (x))); #else - return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( ::cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/Kokkos_InnerProductSpaceTraits.hpp index 65f3feaf8e..82cab6cc3b 100644 --- a/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/src/Kokkos_InnerProductSpaceTraits.hpp @@ -170,6 +170,7 @@ class InnerProductSpaceTraits { /// \brief Partial specialization for long double. /// /// \warning CUDA does not support long double in device functions. +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> struct InnerProductSpaceTraits { @@ -184,6 +185,7 @@ struct InnerProductSpaceTraits return x * y; } }; +#endif //! Partial specialization for Kokkos::complex. 
template diff --git a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp index afad371334..5875029dd1 100644 --- a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp @@ -45,7 +45,7 @@ namespace KokkosBatched { /**/ ValueType *__restrict__ C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - + const ScalarType one(1.0), zero(0.0); if (beta == zero) SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp index 4e1c4d9579..0b68727f0e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -138,7 +138,7 @@ namespace KokkosBatched { const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal:: + return TeamVectorGemmInternal:: invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp index 3b53e9a577..971389902e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -70,7 +70,7 @@ namespace KokkosBatched { const ValueType *__restrict__ pB = B+j*bs1; - ValueType c = 0; + ValueType c = ValueType(0); for (int p=0;p inner(as0, as1, bs0, bs1, cs0, cs1); auto gemm = [&](const int ib, const int jb, @@ -128,13 +129,16 @@ namespace KokkosBatched { Kokkos::parallel_for (Kokkos::TeamThreadRange(member, mq*nq ), [&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%mq*mb, j = ij/mq*nb; -#else - const int i = ij/nq*mb, j = ij%nq*nb; -#endif + int i, j; + //note: the condition is constexpr + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%mq*mb; + j = ij/mq*nb; + } + else { + i = ij/nq*mb; + j = ij%nq*nb; + } inner.serial_invoke(alpha, AA+i*as0, BB+j*bs1, (i+mb) > ib ? 
mp : mb, diff --git a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp index 7b3d8b293e..b63ca28fcf 100644 --- a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -68,6 +68,7 @@ namespace KokkosBatched { [&](const int &i) { x2[i*x2s] *= inv_chi1_minus_alpha; }); + member.team_barrier(); // later consider to use the following // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); diff --git a/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 11174eafb6..2e62c20f32 100644 --- a/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -469,10 +469,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), + a_3p, b_p3, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), c_33 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -516,10 +516,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, - a_3p, c_30 = 0, c_31 = 0, c_32 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), + a_3p, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -563,10 +563,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0, - a_3p, c_30 = 0, c_31 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0), + a_3p, c_30 = ValueType(0), c_31 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -610,10 +610,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0, - a_2p, c_20 = 0, - a_3p, c_30 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0), + a_2p, c_20 = ValueType(0), + a_3p, c_30 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -657,9 +657,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = 
ValueType(0), /**/ b_p3; const int @@ -702,8 +702,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), /**/ b_p2, /**/ b_p3; @@ -745,7 +745,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), /**/ b_p1, /**/ b_p2, /**/ b_p3; @@ -790,9 +790,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -833,9 +833,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -876,9 +876,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0, - a_2p, c_20 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0), + a_2p, c_20 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -919,8 +919,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), /**/ b_p2; const int @@ -959,7 +959,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), /**/ b_p1, /**/ b_p2; @@ -1002,8 +1002,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, @@ -1041,8 +1041,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, @@ -1080,7 +1080,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), /**/ b_p1; const int i0 = 0*_as0, @@ -1120,7 +1120,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0; + a_0p, b_p0, c_00 = ValueType(0); const int i0 = 0*_as0, diff --git a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index d1b59d652f..d443bad513 100644 --- a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ 
b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, + const int m, const int n, const int k, /* */ ValueType * A, const int as0, const int as1, /* */ ValueType * t, const int ts, @@ -44,12 +44,12 @@ namespace KokkosBatched { if (is_Q_zero) TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0+qs1); else - TeamVectorSetIdentityInternal::invoke(member, m, Q, qs0, qs1); + TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); return TeamVectorApplyQ_LeftForwardInternal ::invoke(member, - m, m, k, + m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, diff --git a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 2b0c1e4569..08439b0b28 100644 --- a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -96,6 +96,7 @@ namespace KokkosBatched { A, as0, as1, A, as0, as1, norm, 1); + member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -158,7 +159,7 @@ namespace KokkosBatched { if (m_atl == 0) max_diag = ats::abs(A[0]); const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(max_diag*ats::epsilon()); + threshold(10*max_diag*ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) @@ -171,6 +172,7 @@ namespace KokkosBatched { n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); + member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL (A_part3x3); t_part2x1.mergeToAT (t_part3x1); diff --git a/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/src/batched/KokkosBatched_SetIdentity_Impl.hpp index 4c0ea12348..0bf12243ee 100644 --- a/src/batched/KokkosBatched_SetIdentity_Impl.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Impl.hpp @@ -19,7 +19,7 @@ namespace KokkosBatched { SerialSetIdentity:: invoke(const AViewType &A) { return SerialSetIdentityInternal:: - invoke(A.extent(0), + invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } @@ -36,7 +36,7 @@ namespace KokkosBatched { const AViewType &A) { return TeamSetIdentityInternal:: invoke(member, - A.extent(0), + A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } diff --git a/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/src/batched/KokkosBatched_SetIdentity_Internal.hpp index 40d8bbbaaf..8f7f6cf3f9 100644 --- a/src/batched/KokkosBatched_SetIdentity_Internal.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Internal.hpp @@ -15,10 +15,10 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static int - invoke(const int m, + invoke(const int m, const int n, /* */ ValueType *__restrict__ A, const int as0, const int as1) { const ValueType one(1), zero(0); - for (int j=0;j ::invoke(member, @@ -133,6 +133,7 @@ namespace KokkosBatched { B, bs0, bs1, zero, W, ws0, ws1); + member.team_barrier(); /// W = T^{-1} W TeamVectorTrsmInternalLeftLower @@ -142,26 +143,31 @@ namespace KokkosBatched { one, T, ts0, ts1, W, ws0, ws1); + member.team_barrier(); /// X = V^T W TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, xs0, xs1); + member.team_barrier(); } else { + /// W = U^T B TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + matrix_rank, nrhs, m, one, 
U, us1, us0, B, bs0, bs1, zero, X, xs0, xs1); + member.team_barrier(); + /// X = T^{-1} X TeamVectorTrsmInternalLeftUpper ::invoke(member, false, @@ -169,12 +175,13 @@ namespace KokkosBatched { one, T, ts0, ts1, X, xs0, xs1); + member.team_barrier(); } /// X = P^T X TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, - nrhs, m, + nrhs, n, p, ps0, X, xs0, xs1); diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index 085bd9e293..64d8368f16 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -114,7 +115,7 @@ namespace KokkosBatched { /// case host: team size is small and blocksize (mb,nb) is large /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); @@ -195,7 +196,6 @@ namespace KokkosBatched { const ScalarType alpha, const ValueType *__restrict__ A, const int as0, const int as1, /**/ ValueType *__restrict__ B, const int bs0, const int bs1) { - const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); @@ -223,13 +223,15 @@ namespace KokkosBatched { } Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,iend*jend),[&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%iend, j = ij/iend; -#else - const int i = ij/jend, j = ij%jend; -#endif + int i, j; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%iend; + j = ij/iend; + } + else { + i = ij/jend; + j = ij%jend; + } B0[i*bs0+j*bs1] -= a01[i*as0] * b1t[j*bs1]; }); } diff --git a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp index 618f8dc614..5bf26f0865 100644 --- a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp @@ -99,7 +99,7 @@ namespace KokkosBatched { if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp index 20ee624006..7d72f01e15 100644 --- a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp @@ -115,7 +115,7 @@ namespace KokkosBatched { if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp index 69b958d22d..b06c76b02a 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp +++ 
b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -31,7 +31,7 @@ namespace KokkosBatched { int &matrix_rank) { return TeamVectorUTV_Internal:: invoke(member, - A.extent(0), //A.extent(1), + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp index 6f9a86e115..354dfa7c44 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, // m = NumRows(A) + const int m, const int n, // m = NumRows(A), n = NumCols(A) /* */ ValueType * A, const int as0, const int as1, /* */ IntType * p, const int ps0, /* */ ValueType * U, const int us0, const int us1, @@ -41,23 +41,24 @@ namespace KokkosBatched { matrix_rank = -1; TeamVectorQR_WithColumnPivotingInternal ::invoke(member, - m, m, + m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - + TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); + member.team_barrier(); /// for rank deficient matrix - if (matrix_rank < m) { + if (matrix_rank < n) { const value_type zero(0); TeamVectorSetLowerTriangularInternal ::invoke(member, @@ -67,14 +68,14 @@ namespace KokkosBatched { TeamVectorQR_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, A, as1, as0, t, ts0, work); TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 372b5e1753..3253b6ce12 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -270,12 +270,17 @@ namespace KokkosBatched { // regieter blocking (not about team parallelism). // this mb should vary according to // - team policy (smaller) or range policy (bigger) - // - space (cuda vs host) + // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. #if defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 2; } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -300,7 +305,7 @@ namespace KokkosBatched { using Gemm = Level3; using Trsm = Level3; using Trmm = Level3; - using Trtri = Level3; // TODO: Need new level for Trtri? 
+ using Trtri = Level3; using LU = Level3; using InverseLU = Level3; using SolveLU = Level3; @@ -320,6 +325,11 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 1; } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> diff --git a/src/batched/KokkosBatched_Vector.hpp b/src/batched/KokkosBatched_Vector.hpp index 8737d72850..28a537f885 100644 --- a/src/batched/KokkosBatched_Vector.hpp +++ b/src/batched/KokkosBatched_Vector.hpp @@ -104,6 +104,25 @@ namespace KokkosBatched { }; #endif +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; +#endif + template struct DefaultInternalVectorLength { enum : int { value = 1 }; @@ -147,6 +166,25 @@ namespace KokkosBatched { enum : int { value = 1 }; }; #endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultInternalVectorLength { + enum : int { value = 8 }; + }; + template<> + struct DefaultInternalVectorLength { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 2 }; + }; +#endif template struct MagnitudeScalarType; diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index d59f0f9be4..a950e5e41f 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -6,7 +6,7 @@ #include #include -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #undef __KOKKOSBATCHED_ENABLE_AVX__ #else // compiler bug with AVX in some architectures @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) namespace KokkosBatched { template<> @@ -143,7 +143,7 @@ namespace KokkosBatched { typedef float2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat2"; } + static const char* label() { return "GpuFloat2"; } template friend class Vector; @@ -224,7 +224,7 @@ namespace KokkosBatched { typedef double2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble2"; } + static const char* label() { return "GpuDouble2"; } template friend class Vector; @@ -305,7 +305,7 @@ namespace KokkosBatched { typedef float4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat4"; } + static const char* label() { return "GpuFloat4"; } template friend class Vector; @@ -400,7 +400,7 @@ namespace KokkosBatched { typedef double4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble4"; } + static const char* label() { return "GpuDouble4"; } template friend class Vector; diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 95ab97d882..49317ca9d4 
100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 74d15af1c3..db5bc9fbca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -139,8 +139,8 @@ struct SingleLevelNontransposeGEMV { // matrix A and the input vector x. The output vector y is the // reduction result. // -// WARNING: NOT RECOMMENDED FOR CUDA. Reduction result may have -// arbitrary length. This is bad on CUDA because the CUDA +// WARNING: NOT RECOMMENDED FOR GPU. Reduction result may have +// arbitrary length. This is bad on GPU because the GPU // implementation of Kokkos::parallel_reduce may use shared memory for // intermediate results. template struct impl_gemm_choose_copy_layout { - typedef LayoutAScratch type; + using type = LayoutAScratch; }; #ifdef KOKKOS_ENABLE_CUDA template struct impl_gemm_choose_copy_layout { - typedef LayoutA type; + using type = LayoutA; +}; +#endif + +#ifdef KOKKOS_ENABLE_HIP +template +struct impl_gemm_choose_copy_layout { + using type = LayoutA; }; #endif @@ -392,7 +399,7 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) { typedef typename ViewTypeC::non_const_value_type ScalarC; // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) || !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) int blockA0 = A.extent_int(0); int blockA1 = A.extent_int(1); int blockB1 = B.extent_int(1); @@ -510,7 +517,17 @@ struct GEMMImpl { ViewTypeBScratch::shmem_size() + ViewTypeCScratch::shmem_size(); +#if defined(KOKKOS_ENABLE_HIP) + // Note lbv, 10/29/20: The LaunchBounds<384,2> leads + // to an error with HIP as the heuristics on that platform + // yield an optimal_num_blocks=0 which means no resources + // are allocated... Switching to LaunchBounds<384,0> fixes + // that problem but I'm not sure if that is a good perf + // parameter or why it is set to 2 for Cuda?
+ Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#else Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#endif Kokkos::parallel_for(impl_gemm_label::label,policy.set_scratch_size(scratch_level,Kokkos::PerTeam(scratch_memory_size)),*this); } diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index 877d73c5fa..2a63c3736f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -157,6 +157,10 @@ struct GEMM { if(std::is_same::value) team_size = blockA0; #endif + #if defined(KOKKOS_ENABLE_HIP) + if(std::is_same::value) + team_size = blockA0; + #endif #if defined(KOKKOS_ENABLE_ROCM) if(std::is_same::value) team_size = blockA0; diff --git a/src/blas/impl/KokkosBlas3_trmm_spec.hpp b/src/blas/impl/KokkosBlas3_trmm_spec.hpp index 13c87a299e..3c0bd9df6f 100644 --- a/src/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -74,7 +74,7 @@ struct trmm_eti_spec_avail { > { enum : bool { value = true }; }; // -// This Macros provides the ETI specialization of trmm, currently not available. +// This Macros provides the ETI specialization of trmm // #define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL( SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE ) \ KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT( SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/src/blas/impl/KokkosBlas_gesv_spec.hpp b/src/blas/impl/KokkosBlas_gesv_spec.hpp index e1e0b77f67..8f2d171436 100644 --- a/src/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/src/blas/impl/KokkosBlas_gesv_spec.hpp @@ -118,6 +118,7 @@ struct GESV{ const IPIVV& IPIV) { //NOTE: Might add the implementation of KokkosBlas::gesv later + throw std::runtime_error("No fallback implementation of GESV (general LU factorization & solve) exists. 
Enable BLAS and/or MAGMA TPL."); } }; diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index b22d86a8bb..c845e37c53 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,7 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ return __popc(i); @@ -112,66 +112,95 @@ int pop_count( long long i ){ return _popcnt64(i); } -#elif defined( KOKKOS_COMPILER_IBM ) +#elif defined( __GNUC__ ) || defined( __GNUG__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __popcnt4(i); + return __builtin_popcount(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __popcnt8(i); + return __builtin_popcountl(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __popcnt8(i); + return __builtin_popcountll(i); } +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( int i ){ + return __builtin_popcount(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long i ){ + return __builtin_popcountl(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long long i ){ + return __builtin_popcountll(i); +} +#elif defined(__ibmxl_vrm__) +// See https://www.ibm.com/support/knowledgecenter/SSGH3R_16.1.0/com.ibm.xlcpp161.aix.doc/compiler_ref/compiler_builtins.html +// link gives info about builtin names for xlclang++ +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned i ){ + return __builtin_popcnt4(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long i ){ + return __builtin_popcnt8(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long long i ){ + return __builtin_popcnt8(i); +} KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __popcnt4(i); + return __builtin_popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } -#elif defined( __GNUC__ ) || defined( __GNUG__ ) +#elif defined(__IBMCPP__) || defined(__IBMC__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __builtin_popcount(i); + return __popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __builtin_popcountl(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __builtin_popcountll(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __builtin_popcount(i); + return __popcnt4(i); } + KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long i ){ - return __builtin_popcountl(i); +int pop_count( long i ){ + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long long i ){ - return __builtin_popcountll(i); +int pop_count( long long i ){ + return __popcnt8(i); } #else @@ -181,7 +210,7 @@ int pop_count( long long i ){ // least_set_bit function returns the position of right most set bit -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned i ){ return __ffs(i); @@ -189,7 +218,11 @@ int least_set_bit( unsigned i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } 
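For reference, the contract the pop_count / least_set_bit wrappers above keep uniform across compilers and backends: pop_count returns the number of set bits, and least_set_bit returns the 1-based position of the lowest set bit (0 when no bit is set), matching both the GCC builtins used on the host path and the __popc / __ffs / __ffsll intrinsics used in CUDA and HIP device code. A minimal host-only illustration using the raw GCC builtins (the values are examples, not taken from the patch):

#include <cassert>

int main() {
  unsigned x = 0x58u;                               // bits 3, 4 and 6 set
  assert(__builtin_popcount(x) == 3);               // pop_count(x) -> 3
  assert(__builtin_ffs(static_cast<int>(x)) == 4);  // least_set_bit(x): lowest set bit is bit 3, 1-based -> 4
  assert(__builtin_ffs(0) == 0);                    // no bit set -> 0
  // 64-bit variant; the cast mirrors the __ffsll(static_cast<long long>(i))
  // workaround added above for the HIP device path.
  unsigned long long y = 1ull << 40;
  assert(__builtin_ffsll(static_cast<long long>(y)) == 41);
  return 0;
}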
@@ -207,7 +240,11 @@ int least_set_bit( int i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } KOKKOS_FORCEINLINE_FUNCTION diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index c0ae6ce5eb..59bcf487fb 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,9 +53,9 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA}; +enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -inline ExecSpaceType kk_get_exec_space_type(){ +KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -81,6 +81,12 @@ inline ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + exec_space = Exec_HIP; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -90,6 +96,60 @@ inline ExecSpaceType kk_get_exec_space_type(){ } +template +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return false; +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +//Host function to determine free and total device memory. +//Will throw if execution space doesn't support this. 
+template +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + hipMemGetInfo(&free_mem, &total_mem); +} +#endif inline int kk_get_suggested_vector_size( const size_t nr, const size_t nnz, const ExecSpaceType exec_space){ @@ -103,7 +163,7 @@ inline int kk_get_suggested_vector_size( case Exec_QTHREADS: break; case Exec_CUDA: - + case Exec_HIP: if (nr > 0) suggested_vector_size_ = nnz / double (nr) + 0.5; if (suggested_vector_size_ < 3){ @@ -119,7 +179,14 @@ inline int kk_get_suggested_vector_size( suggested_vector_size_ = 16; } else { - suggested_vector_size_ = 32; + if(exec_space == Exec_CUDA || suggested_vector_size_ <= 48) { + //use full CUDA warp, or half a HIP wavefront + suggested_vector_size_ = 32; + } + else { + //use full HIP wavefront + suggested_vector_size_ = 64; + } } break; } @@ -129,7 +196,9 @@ inline int kk_get_suggested_vector_size( inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space){ - if (exec_space == Exec_CUDA){ + if (exec_space == Exec_CUDA || exec_space == Exec_HIP) { + //TODO: where this is used, tune the target value for + //threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; } else { @@ -171,6 +240,25 @@ struct SpaceInstance { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct SpaceInstance { + static Kokkos::Experimental::HIP create() { + hipStream_t stream; + hipStreamCreate(&stream); + return Kokkos::Experimental::HIP(stream); + } + static void destroy(Kokkos::Experimental::HIP& space) { + hipStream_t stream = space.hip_stream(); + hipStreamDestroy(stream); + } + static bool overlap() { + //TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? 
+ return true; + } +}; +#endif + } } diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 1713e7c460..2e335d4f04 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -371,7 +371,7 @@ class KokkosKernelsHandle return this->team_work_size; } else { - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space == KokkosKernels::Impl::Exec_CUDA || my_exec_space == KokkosKernels::Impl::Exec_HIP) { return team_size; } else { @@ -609,10 +609,10 @@ class KokkosKernelsHandle } } - void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t verts_per_cluster) { + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; - this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, verts_per_cluster); + this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, hint_verts_per_cluster); } void destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ diff --git a/src/common/KokkosKernels_Macros.hpp b/src/common/KokkosKernels_Macros.hpp index 84de9048c9..ced946fe4f 100644 --- a/src/common/KokkosKernels_Macros.hpp +++ b/src/common/KokkosKernels_Macros.hpp @@ -46,10 +46,10 @@ #define _KOKKOSKERNELS_MACROUTILS_HPP_ // If KOKKOSKERNELS_ENABLE_OMP_SIMD is defined, it's legal to place -// "#pragma omp simd" before a for loop. It's never defined if CUDA is enabled, +// "#pragma omp simd" before a for loop. It's never defined if a GPU-type device is enabled, // since in that case, Kokkos::ThreadVectorRange should be used instead for SIMD parallel loops. -#if !defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) #if defined(KOKKOS_COMPILER_GNU) // GCC 4.8.5 and older do not support #pragma omp simd #if (KOKKOS_COMPILER_GNU > 485 ) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index bdb93c71b1..be37765594 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -250,7 +250,7 @@ struct DefaultComparator //Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. //If n is not a power-of-two, will implicitly pretend @@ -277,7 +277,6 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -316,7 +315,7 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) //Sort "values", while applying the same swaps to "perm" template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem) +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. 
//If n is not a power-of-two, will implicitly pretend @@ -343,7 +342,6 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -389,19 +387,20 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember template struct BitonicSingleTeamFunctor { - BitonicSingleTeamFunctor(View& v_) : v(v_) {} + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - TeamBitonicSort(v.data(), v.extent(0), t); + TeamBitonicSort(v.data(), v.extent(0), t, comp); }; View v; + Comparator comp; }; //Functor that sorts equally sized chunks on each team template struct BitonicChunkFunctor { - BitonicChunkFunctor(View& v_, Ordinal chunkSize_) : v(v_), chunkSize(chunkSize_) {} + BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal chunk = t.league_rank(); @@ -409,9 +408,10 @@ struct BitonicChunkFunctor Ordinal n = chunkSize; if(chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - TeamBitonicSort(v.data() + chunkStart, n, t); + TeamBitonicSort(v.data() + chunkStart, n, t, comp); }; View v; + Comparator comp; Ordinal chunkSize; }; @@ -420,8 +420,8 @@ template> (logSubBoxSize - 1); @@ -519,6 +518,7 @@ struct BitonicPhase2Functor } }; View v; + Comparator comp; Ordinal boxSize; Ordinal teamsPerBox; }; @@ -531,16 +531,16 @@ struct BitonicPhase2Functor //and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) //If comparator is void, use operator< (which should only be used for primitives) template> -void bitonicSort(View v) +void bitonicSort(View v, const Comparator& comp = Comparator()) { typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; Ordinal n = v.extent(0); //If n is small, just sort on a single team - if(n <= Ordinal(1) << 16) + if(n <= Ordinal(1) << 12) { Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), - BitonicSingleTeamFunctor(v)); + BitonicSingleTeamFunctor(v, comp)); } else { @@ -552,16 +552,16 @@ void bitonicSort(View v) Ordinal numTeams = npot / chunkSize; //First, sort within teams Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicChunkFunctor(v, chunkSize)); + BitonicChunkFunctor(v, comp, chunkSize)); for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase1Functor(v, boxSize, teamsPerBox)); + BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) { Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase2Functor(v, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + BitonicPhase2Functor(v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); } } } diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 16a336f200..6979f15847 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -858,13 +858,84 @@ inline size_t 
kk_is_d1_coloring_valid( struct ColorChecker cc(num_rows, xadj, adj, v_colors, team_work_chunk_size); size_t num_conf = 0; - Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValie", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , + Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValid", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), cc, num_conf); MyExecSpace().fence(); return num_conf; } +template +struct MinMaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MinMaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lminmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg < lminmax.min_val) + lminmax.min_val = deg; + if(deg > lminmax.max_val) + lminmax.max_val = deg; + } + rowmap_t rowmap; +}; + +template +struct MaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg > lmax) + lmax = deg; + } + rowmap_t rowmap; +}; + +template +ordinal_t graph_max_degree(const rowmap_t& rowmap) +{ + using Reducer = Kokkos::Max; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + return 0; + ordinal_t val; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MaxDegreeFunctor(rowmap), + Reducer(val)); + return val; +} + +template +void graph_min_max_degree(const rowmap_t& rowmap, ordinal_t& min_degree, ordinal_t& max_degree) +{ + using Reducer = Kokkos::MinMax; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + { + min_degree = 0; + max_degree = 0; + return; + } + typename Reducer::value_type result; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MinMaxDegreeFunctor(rowmap), + Reducer(result)); + min_degree = result.min_val; + max_degree = result.max_val; +} + template struct SortCrsMatrixFunctor { @@ -970,12 +1041,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1023,12 +1089,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsGraphFunctor funct(useRadix, rowmap, entries); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; @@ -1097,14 +1158,14 @@ struct MergedRowmapFunctor }; template -struct MergedEntriesFunctor +struct MatrixMergedEntriesFunctor { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; using scalar_t = typename values_t::non_const_value_type; //Precondition: entries are sorted within each row - MergedEntriesFunctor( + MatrixMergedEntriesFunctor( const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) : rowmap(rowmap_), entries(entries_), values(values_), @@ -1154,6 +1215,52 @@ struct MergedEntriesFunctor values_t mergedValues; }; +template +struct GraphMergedEntriesFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + //Precondition: entries are sorted within each row + GraphMergedEntriesFunctor( + const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) + : rowmap(rowmap_), entries(entries_), + mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with, nothing to do + return; + } + //Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(accumCol != entries(j)) + { + //write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + //always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + //Sort the rows of matrix, and merge duplicate entries. 
template crsMat_t sort_and_merge_matrix(const crsMat_t& A) @@ -1177,7 +1284,7 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) values_t mergedValues("SortedMerged values", numCompressedEntries); //Compute merged entries and values Kokkos::parallel_for(range_t(0, A.numRows()), - MergedEntriesFunctor + MatrixMergedEntriesFunctor (A.graph.row_map, A.graph.entries, A.values, mergedRowmap, mergedEntries, mergedValues)); //Finally, construct the new compressed matrix @@ -1185,6 +1292,41 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) mergedValues, mergedRowmap, mergedEntries); } +template +void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if(numRows <= 1) + { + //Matrix has zero rows + rowmap_out = rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + //Sort in place + sort_crs_graph(rowmap_in, entries_in); + //Count entries per row into a new rowmap, in terms of merges that can be done + rowmap_out = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + MergedRowmapFunctor(rowmap_out, rowmap_in, entries_in), numCompressedEntries); + //Prefix sum to get rowmap + kk_exclusive_parallel_prefix_sum(numRows + 1, rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + //Compute merged entries and values + Kokkos::parallel_for(range_t(0, numRows), + GraphMergedEntriesFunctor + (rowmap_in, entries_in, + rowmap_out, entries_out)); +} + template (); - - if (exec == Exec_CUDA){ - typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); + // TODO BMK: can this function be deprecated? 
+ typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); + Kokkos::deep_copy (hr, in_xadj); + typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); + Kokkos::deep_copy (he, in_adj); + typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); + Kokkos::deep_copy (hv, in_vals); + MyExecSpace().fence(); + typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); + typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; + typedef typename lno_view_t::non_const_value_type size_type; + typedef typename lno_nnz_view_t::non_const_value_type lno_t; + typedef typename scalar_view_t::non_const_value_type scalar_t; - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); + lno_t nrows = in_xadj.extent(0) - 1; + std::vector > edges(in_adj.extent(0)); - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } + size_type row_size = 0; + for (lno_t i = 0; i < nrows; ++i){ + for (size_type j = hr(i); j < hr(i + 1); ++j){ + edges[row_size].src = i; + edges[row_size].dst = he(j); + edges[row_size++].ew = hv(j); } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; - } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); } - else { - - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = in_xadj(i); j < in_xadj(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = in_adj(j); - edges[row_size++].ew = in_vals(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - out_adj(i) = edges[i].dst; - out_vals(i) = edges[i].ew; - } - + std::sort (edges.begin(), edges.begin() + row_size); + size_type ne = in_adj.extent(0); + for(size_type i = 0; i < ne; ++i){ + heo(i) = edges[i].dst; + hvo(i) = edges[i].ew; + } - } + Kokkos::deep_copy (out_adj, heo); + Kokkos::deep_copy (out_vals, hvo); + MyExecSpace().fence(); } /* @@ -1562,47 +1672,46 @@ struct LowerTriangularMatrix{ const size_type write_end = t_xadj[row_index + 1]; const lno_t write_left_work = write_end - write_begin; - switch (exec_space){ - case Exec_CUDA: - //TODO: Write cuda version here. - /* + //TODO: Write GPU (vector-level) version here: + /* + if(kk_is_gpu_exec_space()) + { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), [&] (lno_t i) { const size_type adjind = i + col_begin; const lno_t colIndex = adj[adjind]; - }); - */ + } + else + ... 
+ */ - default: - for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ - const size_type adjind = r + col_begin; - const lno_t colIndex = adj[adjind]; - lno_t colperm = colIndex; - if (permutation != NULL){ - colperm = permutation[colIndex]; - } - if (is_lower){ - if (row_perm > colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ + const size_type adjind = r + col_begin; + const lno_t colIndex = adj[adjind]; + lno_t colperm = colIndex; + if (permutation != NULL){ + colperm = permutation[colIndex]; + } + if (is_lower){ + if (row_perm > colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } - else { - if (row_perm < colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + } + else { + if (row_perm < colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } + } - } - break; } }); } @@ -2188,7 +2297,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( bool use_dynamic_scheduling = false, bool chunksize = 4){ -#ifndef KOKKOS_ENABLE_CUDA //typedef typename row_map_view_t::const_type const_row_map_view_t; //typedef typename cols_view_t::const_type const_cols_view_t; @@ -2229,7 +2337,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( } }); -#endif } template (in_elements, in_view, histogram); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void get_suggested_vector_team_size( - int max_allowed_team_size, - int &suggested_vector_size_, - int &suggested_team_size_, - idx nr, idx nnz){ - - - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - - suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; - } -#else - (void)max_allowed_team_size; - (void)nr; - (void)nnz; -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -} - -#else template void get_suggested_vector_size( int &suggested_vector_size_, - idx nr, idx nnz){ - - suggested_vector_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - 
suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - + idx nr, idx nnz) { + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } //Get the best team size for the given functor. @@ -224,36 +103,28 @@ void get_suggested_vector_size( template int get_suggested_team_size(Functor& f, int vector_size) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } -#endif //ifdef KOKKOS_ENABLE_DEPRECATED_CODE ... else - template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp = team_policy_t(1, 1, vector_size). set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template ( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif //std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" << vector_size << " tsm:" << teamSizeMax<< std::endl; team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); @@ -1186,21 +1048,12 @@ void symmetrize_and_get_lower_diagonal_edge_list( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, FSCH); @@ -1261,21 +1114,12 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(fse); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif team_policy 
pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", @@ -1311,22 +1155,13 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp index a83752b282..0b8724b794 100644 --- a/src/common/KokkosKernels_default_types.hpp +++ b/src/common/KokkosKernels_default_types.hpp @@ -82,6 +82,8 @@ #if defined(KOKKOS_ENABLE_CUDA) typedef Kokkos::Cuda default_device; +#elif defined(KOKKOS_ENABLE_HIP) + typedef Kokkos::Experimental::HIP default_device; #elif defined(KOKKOS_ENABLE_OPENMP) typedef Kokkos::OpenMP default_device; #elif defined(KOKKOS_ENABLE_PTHREAD) || defined(KOKKOS_ENABLE_THREADS) diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp index 83070c6e66..2e9a4bc03d 100644 --- a/src/graph/KokkosGraph_Distance1Color.hpp +++ b/src/graph/KokkosGraph_Distance1Color.hpp @@ -44,8 +44,6 @@ #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP -#include - #include "KokkosGraph_Distance1ColorHandle.hpp" #include "KokkosGraph_Distance1Color_impl.hpp" #include "KokkosKernels_Utils.hpp" diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 49e20d5395..077104ef9f 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -109,7 +109,7 @@ class GraphColoringHandle typedef typename Kokkos::View nnz_lno_persistent_work_view_t; typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - typedef Kokkos::TeamPolicy team_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; typedef typename Kokkos::View non_const_1d_size_type_view_t; @@ -229,54 +229,34 @@ class GraphColoringHandle } - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. + /** \brief Chooses best algorithm based on the execution space. COLORING_SERIAL if serial, otherwise COLORING_VBBIT. 
+ * VBBIT is the fastest parallel algorithm (unless on GPU and the graph's maximum degree is very large, but + * we don't have information about the graph here) */ void choose_default_algorithm() { -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ + auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); + if(exec == KokkosKernels::Impl::Exec_SERIAL) + { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; #endif } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else + { this->coloring_algorithm_type = COLORING_VB; #ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VB\n"; #endif } -#endif } template @@ -357,7 +337,7 @@ class GraphColoringHandle } }, new_edge_count); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { lower_xadj_counts(ii + 1) = new_edge_count; }); } @@ -463,7 +443,7 @@ class GraphColoringHandle row_index_view_type xadj, nonzero_view_type adj){ KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list - + ( nv, xadj, @@ -496,13 +476,8 @@ class GraphColoringHandle size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type new_num_edge = 0; - typedef Kokkos::RangePolicy my_exec_space; - - if ( false -#if defined( KOKKOS_ENABLE_CUDA ) - || std::is_same::value -#endif - ) + typedef Kokkos::RangePolicy my_exec_space; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -510,30 +485,22 @@ class GraphColoringHandle int vector_size = 0; CountLowerTriangleTeam clt (nv, xadj, adj, lower_count); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy_t::team_size_max(clt); - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - nv, ne); -#else + KokkosKernels::Impl::get_suggested_vector_size( vector_size, nv, ne); teamSizeMax = KokkosKernels::Impl::get_suggested_team_size(clt, vector_size); -#endif Kokkos::parallel_for("KokkosGraph::CountLowerTriangleTeam", team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), clt//, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); 
//Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - HandleExecSpace().fence(); + ExecutionSpace().fence(); auto lower_total_count = Kokkos::subview(lower_count, nv); auto hlower = Kokkos::create_mirror_view (lower_total_count); Kokkos::deep_copy (hlower, lower_total_count); @@ -559,7 +526,7 @@ class GraphColoringHandle //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); nnz_lno_persistent_work_view_t half_src (Kokkos::ViewAllocateWithoutInitializing("HALF SRC"),new_num_edge); nnz_lno_persistent_work_view_t half_dst (Kokkos::ViewAllocateWithoutInitializing("HALF DST"),new_num_edge); diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp index dacf9c99db..53f2b4a26b 100644 --- a/src/graph/KokkosGraph_Distance2Color.hpp +++ b/src/graph/KokkosGraph_Distance2Color.hpp @@ -245,80 +245,6 @@ void bipartite_color_columns( gch_d2->set_coloring_time(timer.seconds()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -/** - * (DEPRECATED) Compute the left-side coloring of a bipartite matrix/graph. - * Equivalent to bipartite_color_rows(), except this interface requires the user - * to compute (col_map, col_entries) as the transpose of the graph (if nonsymmetric). - * - * This function is deprecated because it's not possible to support both undirected - * distance-2 coloring and bipartite one-sided coloring - * in a single interface. However, if the input graph has all diagonal entries present and - * is symmetric (which is generally the case for discretized PDE matrices), then this - * function is also equivalent to graph_color_distance2(). - * - * In any case, the graphs (row_map, row_entries) and (col_map, col_entries) must be transposes - * of each other. - * - * @param[in] handle The Kernel Handle - * @param[in] num_rows Number of rows in the matrix (number of vertices) - * @param[in] num_cols Number of columns in the matrix - * @param[in] row_map Row map - * @param[in] row_entries Row entries - * @param[in] col_map Column map - * @param[in] col_entries Column entries - */ -template -void graph_compute_distance2_color(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_cols, - lno_row_view_t_ row_map, - lno_nnz_view_t_ row_entries, - // If graph is symmetric, simply pass the same graph twice: - // row_map == col_map, row_entries == col_entries - lno_col_view_t_ col_map, - lno_colnnz_view_t_ col_entries) -{ - using lno_t = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using memory_space = typename KernelHandle::HandleTempMemorySpace; - static_assert(std::is_same::value, - "Row and col maps must have the same value type (size_type)."); - static_assert(std::is_same::value, - "Row and col entries must have the same value type (nnz_lno_t)."); - //Internally, coloring accesses the graph through unmanaged views - //These are explicitly nonconst so that copies of adj for edge-filtering - //(which must be mutable) can use the same type. - // - //The original input graphs will never be modified. 
- using InternalRowmap = Kokkos::View >; - using InternalColinds = Kokkos::View >; - if(row_entries.extent(0) != col_entries.extent(0)) - { - throw std::runtime_error("row_entries and col_entries must represent transposes of each other, but they have different lengths"); - } - Kokkos::Impl::Timer timer; - // Set our handle pointer to a GraphColoringHandleType. - auto *gch_d2 = handle->get_distance2_graph_coloring_handle(); - // Create a view to save the colors to. - using color_view_type = typename KernelHandle::GraphColorDistance2HandleType::color_view_type; - color_view_type colors_out("Graph Colors", num_rows); - InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); - InternalColinds rowentries_internal(row_entries.data(), row_entries.extent(0)); - InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); - InternalColinds colentries_internal(col_entries.data(), col_entries.extent(0)); - Impl::GraphColorDistance2 - gc(num_rows, num_cols, row_map, row_entries, col_map, col_entries, gch_d2); - gc.compute_distance2_color(); - gch_d2->add_to_overall_coloring_time(timer.seconds()); - gch_d2->set_coloring_time(timer.seconds()); -} -#endif - } // end namespace Experimental } // end namespace KokkosGraph diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index f4624f545b..39d66b744f 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -198,71 +198,27 @@ class GraphColorDistance2Handle * Chooses best algorithm based on the execution space. * * This chooses the best algorithm based on the execution space: - * - COLORING_D2_SERIAL if the execution space is SERIAL - * - COLORING_D2_NB_BIT otherwise + * - COLORING_D2_SERIAL if the execution space is SERIAL (more work efficient than NB_BIT) + * - COLORING_D2_NB_BIT otherwise (fastest parallel algorithm) * */ void choose_default_algorithm() { - bool found = false; -#if defined(KOKKOS_ENABLE_SERIAL) - if(std::is_same::value) + if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_D2_SERIAL; - found = true; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL" << std::endl; +#ifdef VERBOSE + std:cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_QTHREAD) - if(std::is_same::value) + else { this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; +#ifdef VERBOSE + std:cout << ExecutionSpace::name() << " Execution Space, Default 
Algorithm: COLORING_D2_NB_BIT\n"; #endif } -#endif - //Since this logic is based on checking every exec space, detect when a new one needs to be supported - if(!found) - throw std::logic_error("D2 coloring: default algorithm hasn't been chosen for the current execution space"); } diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp new file mode 100644 index 0000000000..212cb7c383 --- /dev/null +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -0,0 +1,117 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSGRAPH_EXPLICIT_COARSEN_HPP +#define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP + +#include "KokkosGraph_ExplicitCoarsening_impl.hpp" +#include "KokkosKernels_SparseUtils.hpp" + +namespace KokkosGraph { +namespace Experimental { + +//Given a CRS graph and coarse labels, produce a new CRS graph representing the coarsened graph. +//If A is nonsquare, entries in columns >= numVerts are discarded. +//The labels should be in the range [0, numCoarseVerts), and the output graph wil have numCoarseVerts. +// +//If compress, sort and merge entries in each row. +//An uncompressed graph will still work as input to some things like D1 graph coloring. 
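As a concrete illustration of the coarsening entry point declared just below, here is a minimal, hypothetical usage sketch. It is not part of the patch: the 4-vertex path graph, the label assignment, and the assumption that the device type is the leading template parameter (the view types then deduce from the arguments) are all illustrative.

```cpp
// Illustrative sketch, not part of the patch: coarsen a 4-vertex path graph
// 0-1-2-3 into 2 coarse vertices using an explicit label array.
#include <Kokkos_Core.hpp>
#include "KokkosGraph_ExplicitCoarsening.hpp"

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using device_t  = Kokkos::DefaultExecutionSpace::device_type;
    using rowmap_t  = Kokkos::View<size_t*, device_t>;
    using entries_t = Kokkos::View<int*, device_t>;

    // Symmetric path graph: row 0 = {1}, row 1 = {0,2}, row 2 = {1,3}, row 3 = {2}.
    rowmap_t fineRowmap("fine rowmap", 5);
    entries_t fineEntries("fine entries", 6);
    entries_t labels("labels", 4);
    auto hRowmap  = Kokkos::create_mirror_view(fineRowmap);
    auto hEntries = Kokkos::create_mirror_view(fineEntries);
    auto hLabels  = Kokkos::create_mirror_view(labels);
    const size_t rm[5] = {0, 1, 3, 5, 6};
    const int    en[6] = {1, 0, 2, 1, 3, 2};
    const int    lb[4] = {0, 0, 1, 1};  // vertices {0,1} -> coarse 0, {2,3} -> coarse 1
    for(int i = 0; i < 5; i++) hRowmap(i)  = rm[i];
    for(int i = 0; i < 6; i++) hEntries(i) = en[i];
    for(int i = 0; i < 4; i++) hLabels(i)  = lb[i];
    Kokkos::deep_copy(fineRowmap, hRowmap);
    Kokkos::deep_copy(fineEntries, hEntries);
    Kokkos::deep_copy(labels, hLabels);

    rowmap_t coarseRowmap;
    entries_t coarseEntries;
    // With compress left at its default (true), the outputs describe the
    // 2-vertex coarse graph with rows sorted and duplicate entries merged.
    KokkosGraph::Experimental::graph_explicit_coarsen<device_t>(
        fineRowmap, fineEntries, labels, 2, coarseRowmap, coarseEntries);
  }
  Kokkos::finalize();
  return 0;
}
```

In practice the labels would typically come from a clustering or aggregation step rather than being hand-written as above; the example only shows the shape of the call.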
+ +template +void graph_explicit_coarsen( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +//Same as above, but also produce the map from coarse vertices to fine vertices (inverse map of labels) +template +void graph_explicit_coarsen_with_inverse_map( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + inverseOffsets = egc.clusterOffsets; + inverseLabels = egc.clusterVerts; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +}} + +#endif diff --git a/src/graph/KokkosGraph_GraphColorHandle.hpp b/src/graph/KokkosGraph_GraphColorHandle.hpp index de9fd6d8f4..9526c34b0e 100644 --- a/src/graph/KokkosGraph_GraphColorHandle.hpp +++ b/src/graph/KokkosGraph_GraphColorHandle.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1ColorHandle.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_GraphColorHandle.hpp header is replaced by KokkosGraph_Distance1ColorHandle.hpp") - -#endif - - diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp new file mode 100644 index 0000000000..c578a97271 --- /dev/null +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -0,0 +1,108 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_HPP + +#include "KokkosGraph_Distance2MIS_impl.hpp" + +namespace KokkosGraph{ + +enum MIS2_Algorithm +{ + MIS2_QUALITY, + MIS2_FAST +}; + +namespace Experimental{ + +// Compute a distance-2 maximal independent set, given a symmetric CRS graph. +// Returns a list of the vertices in the set. +// +// Column indices >= num_verts are ignored. + +template +lno_view_t +graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //zero vertices means the MIS is empty. + return lno_view_t(); + } + switch(algo) + { + case MIS2_QUALITY: + { + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); + return mis.compute(); + } + case MIS2_FAST: + { + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); + return mis.compute(); + } + } + throw std::invalid_argument("graph_d2_mis: invalid algorithm"); +} + +template +labels_t +graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename colinds_t::non_const_value_type& numClusters, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //there are no vertices to label + return labels_t(); + } + labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); + numClusters = mis2.extent(0); + Impl::D2_MIS_Coarsening coarsening(rowmap, colinds, mis2); + return coarsening.compute(); +} + +} // end namespace Experimental +} // end namespace KokkosGraph + +#endif diff --git a/src/graph/KokkosGraph_RCM.hpp b/src/graph/KokkosGraph_RCM.hpp new file mode 100644 index 0000000000..8f1109aa63 --- /dev/null +++ b/src/graph/KokkosGraph_RCM.hpp @@ -0,0 +1,78 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_RCM_HPP +#define _KOKKOSGRAPH_RCM_HPP + +#include "KokkosGraph_BFS_impl.hpp" + +namespace KokkosGraph +{ +namespace Experimental +{ + +//Compute the reverse Cuthill-McKee ordering of a graph. +//The graph must be symmetric, but it may have any number of connected components. +//This function returns a list of vertices in RCM order. + +template +labels_t +graph_rcm(const rowmap_t& rowmap, const colinds_t& colinds) +{ + using lno_t = typename colinds_t::non_const_value_type; + if(rowmap.extent(0) <= 2) + { + //there are 0 or 1 vertices - return trivial ordering + lno_t numVerts = rowmap.extent(0); + if(numVerts) + numVerts--; + return labels_t("RCM Labels", numVerts); + } + Impl::SerialRCM algo(rowmap, colinds); + return algo.rcm(); +} + +}} //namespace KokkosGraph::Experimental + +#endif diff --git a/src/graph/KokkosGraph_graph_color.hpp b/src/graph/KokkosGraph_graph_color.hpp index 4494ecc509..9526c34b0e 100644 --- a/src/graph/KokkosGraph_graph_color.hpp +++ b/src/graph/KokkosGraph_graph_color.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. 
*/ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1Color.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_graph_color.hpp header will be replaced by KokkosGraph_Distance1Color.hpp") - -#endif - - diff --git a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp new file mode 100644 index 0000000000..df652902c0 --- /dev/null +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_BFS_IMPL_HPP +#define _KOKKOSGRAPH_BFS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Utils.hpp" +#include +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +template +struct SerialRCM +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using host_rowmap_t = Kokkos::View; + using host_lno_view_t = Kokkos::View; + + lno_t numVerts; + host_rowmap_t rowmap; + host_lno_view_t entries; + + SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) : + numVerts(rowmap_.extent(0) - 1), + rowmap(Kokkos::ViewAllocateWithoutInitializing("HostRowmap"), rowmap_.extent(0)), + entries(Kokkos::ViewAllocateWithoutInitializing("HostEntries"), entries_.extent(0)) + { + Kokkos::deep_copy(rowmap, rowmap_); + Kokkos::deep_copy(entries, entries_); + } + + lno_t findPseudoPeripheral() + { + //Choose vertex with smallest degree + lno_t periph = -1; + lno_t periphDeg = numVerts; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t deg = rowmap(i + 1) - rowmap(i); + if(deg < periphDeg) + { + periph = i; + periphDeg = deg; + if(deg == 0) + break; + } + } + return periph; + } + + lno_view_t rcm() + { + lno_t start = findPseudoPeripheral(); + host_lno_view_t q(Kokkos::ViewAllocateWithoutInitializing("Queue"), numVerts); + host_lno_view_t label(Kokkos::ViewAllocateWithoutInitializing("Permutation"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + label(i) = -1; + lno_t qhead = 0; + lno_t qtail = 0; + label(start) = qtail; + q(qtail++) = start; + std::vector neighbors; + lno_t outerQueue = 0; + while(true) + { + lno_t v = q(qhead++); + neighbors.clear(); + for(size_type j = rowmap(v); j < rowmap(v + 1); j++) + { + lno_t nei = entries(j); + if(nei == v || nei >= numVerts) + continue; + if(label(nei) == -1) + { + neighbors.push_back(nei); + } + } + std::sort(neighbors.begin(), neighbors.end(), + [&](lno_t n1, lno_t n2) -> bool + { + //return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); + //label and enqueue all unlabeled neighbors + for(lno_t nei : neighbors) + { + label(nei) = qtail; + q(qtail++) = nei; + } + if(qtail == numVerts) + { + //have labeled all vertices + break; + } + else if(qhead == qtail) + { + //have exhausted this connected component, but others remain unlabeled + while(label(outerQueue) != -1) + outerQueue++; + label(outerQueue) = qtail; + q(qtail++) = outerQueue; + } + } + lno_view_t labelOut(Kokkos::ViewAllocateWithoutInitializing("RCM Permutation"), numVerts); + //reverse the labels + for(lno_t i = 0; i < numVerts; i++) + label(i) = numVerts - label(i) - 1; + Kokkos::deep_copy(labelOut, label); + return labelOut; + } +}; + +}}} //namespace KokkosGraph::Experimental::Impl +#endif diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 4e6f322bce..110756a364 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -124,7 +124,13 @@ class GraphColor const_lno_nnz_view_t entries, HandleType *coloring_handle): nv (nv_), ne(ne_),xadj(row_map), adj (entries), - kok_src(), kok_dst(), cp(coloring_handle){} + kok_src(), kok_dst(), cp(coloring_handle) + { + static_assert(std::is_same::value, + "Row map element type does not match 
handle's size_type."); + static_assert(std::is_same::value, + "Entries element type does not match handle's nnz_lno_t."); + } /** \brief GraphColor destructor. */ diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp new file mode 100644 index 0000000000..866ad54daf --- /dev/null +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -0,0 +1,975 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "Kokkos_Bitset.hpp" +#include "KokkosKernels_Utils.hpp" +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +template +struct D2_MIS_RandomPriority +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + //The type of status/priority values. 
+ using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_mem = typename team_pol::member_type; + using all_worklists_t = Kokkos::View; + using worklist_t = Kokkos::View; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. + // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_RandomPriority(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + allWorklists = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); + } + + struct RefreshRowStatus + { + RefreshRowStatus(const status_view_t& rowStatus_, const worklist_t& worklist_, lno_t nvBits_, int round) + : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) + { + hashedRound = xorshiftHash(round); + } + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Combine vertex and round to get some pseudorandom priority bits that change each round + status_t priority = xorshiftHash(i + hashedRound); + //Generate unique status per row, with IN_SET < status < OUT_SET, + int priorityBits = sizeof(status_t) * 8 - nvBits; + status_t priorityMask = 1; + priorityMask <<= priorityBits; + priorityMask--; + status_t newStatus = (status_t) (i + 1) + ((priority & priorityMask) << nvBits); + if(newStatus == OUT_SET) + newStatus--; + rowStatus(i) = newStatus; + } + + status_view_t rowStatus; + worklist_t worklist; + int nvBits; + uint32_t hashedRound; + }; + + struct RefreshColStatus + { + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, lno_t worklistLen_) + : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklistLen(worklistLen_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = OUT_SET; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? 
i : entries(j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using MinReducer = Kokkos::Min; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, status_t& ls) + { + lno_t nei = (j == rowLen) ? i : entries(rowBegin + j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < ls) + ls = neiStat; + } + }, MinReducer(s)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + }); + } + + status_view_t colStatus; + worklist_t worklist; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_t worklistLen; + }; + + struct DecideSetFunctor + { + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_, lno_t worklistLen_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), worklistLen(worklistLen_) + {} + + //Enum values to be used as flags, so that the team policy version can + //express the neighbor checking as an OR-reduction + enum + { + NEI_OUT_SET = 1, + NEI_DIFFERENT_STATUS = 2 + }; + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using OrReducer = Kokkos::BOr; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + int flags = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, int& lflags) + { + lno_t nei = (j == rowLen) ? 
i : entries(rowBegin + j); + if(nei >= nv) + return; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + lflags |= NEI_OUT_SET; + else if(neiStat != s) + lflags |= NEI_DIFFERENT_STATUS; + }, OrReducer(flags)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(flags & NEI_OUT_SET) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. + rowStatus(i) = OUT_SET; + } + else if(!(flags & NEI_DIFFERENT_STATUS)) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + }); + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + worklist_t worklist; + lno_t worklistLen; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const worklist_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + worklist_t worklist; + }; + + struct CompactWorklistFunctor + { + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) + : src(src_), dst(dst_), status(status_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const + { + lno_t i = src(w); + status_t s = status(i); + if(s != IN_SET && s != OUT_SET) + { + //next worklist needs to contain i + if(finalPass) + dst(lNumInSet) = i; + lNumInSet++; + } + } + + worklist_t src; + worklist_t dst; + status_view_t status; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + worklist_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); + worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); + worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; + int refreshColTeamSize = 0; + int decideSetTeamSize = 0; + if(useTeams) + { + team_pol dummyPolicy(1, 1, vectorLength); + //Compute the recommended team size for RefreshColStatus and DecideSetFunctor (will be constant) + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + } + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, 
rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); + } + } + while(true) + { + //Compute new row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + //Compute new col statuses + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), refreshCol); + else + Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); + } + //Decide row statuses where enough information is available + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); + else + Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); + } + //Compact row worklist + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); + if(rowWorkLen == 0) + break; + std::swap(rowWorklist, thirdWorklist); + //Compact col worklist + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), colWorkLen); + std::swap(colWorklist, thirdWorklist); + round++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. + lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + all_worklists_t allWorklists; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; +}; + +// UNUSED CODE +// Version of RefreshRowStatus, which does linear interpolation between a degree-based score and a random score. +// By gradually increasing the interpolation coefficient in favor of random, the MIS can converge much faster than +// constant priorities. +// +// KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const +// { +// lno_t i = worklist(w); +// int degBits = sizeof(status_t) * 8 - nvBits; +// if(degBits == 0) +// { +// //no space to store degree information. Algorithm will still work but will +// //probably produce a lower quality MIS. 
+// rowStatus(i) = i + 1; +// return; +// } +// //Combine vertex and round to get some pseudorandom priority bits that change each round +// status_t maxDegRange = (((status_t) 1) << degBits) - 2; +// lno_t deg = rowmap(i + 1) - rowmap(i); +// //Compute degree-based score and random score +// float degScore = (float) (deg - minDeg) * invDegRange; +// float randScore = (xorshiftHash(i + hashedRound) & 0xFFFF) / 65536.f; +// //Then linearly interpolate using k +// float finalScore = k * randScore + (1.f - k) * degScore; +// rowStatus(i) = (status_t) (i + 1) + (((status_t) (finalScore * maxDegRange)) << nvBits); +// } +// */ + +template +struct D2_MIS_FixedPriority +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. + // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_FixedPriority(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), + worklist1(Kokkos::ViewAllocateWithoutInitializing("WL1"), numVerts), + worklist2(Kokkos::ViewAllocateWithoutInitializing("WL2"), numVerts) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); + //Compute row statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitRowStatus(rowStatus, rowmap, numVerts, nvBits, minDegree, maxDegree)); + //Compute col statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); + } + + struct InitRowStatus + { + InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, lno_t nv_, lno_t nvBits_, lno_t minDeg_, lno_t maxDeg_) + : rowStatus(rowStatus_), rowmap(rowmap_), nv(nv_), nvBits(nvBits_), minDeg(minDeg_), maxDeg(maxDeg_), invDegRange(1.f / (maxDeg - minDeg)) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //Generate unique status per row, with IN_SET < status < OUT_SET, + int degBits = sizeof(status_t) * 8 - nvBits; + if(degBits == 0) + { + //no space to store degree information. Algorithm will still work but will + //probably produce a lower quality MIS. 
+ rowStatus(i) = i + 1; + return; + } + status_t maxDegRange = (((status_t) 1) << degBits) - 2; + lno_t deg = rowmap(i + 1) - rowmap(i); + float degScore = (float) (deg - minDeg) * invDegRange; + rowStatus(i) = (status_t) (i + 1) + (((status_t) (degScore * maxDegRange)) << nvBits); + } + + status_view_t rowStatus; + rowmap_t rowmap; + lno_t nv; + int nvBits; + lno_t minDeg; + lno_t maxDeg; + float invDegRange; + }; + + struct InitColStatus + { + InitColStatus(const status_view_t& colStatus_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : colStatus(colStatus_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = rowStatus(i); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != i && nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + colStatus(i) = s; + } + + status_view_t colStatus; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct IterateStatusFunctor + { + IterateStatusFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_, const bitset_t& colUpdateBitset_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), colUpdateBitset(colUpdateBitset_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + bool statusChanged = neiOut || !neiMismatchS; + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i which have status s. + //This will increase the minimum to the next smallest row, + //so that another nearby vertex can be added to the set. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + rowStatus(i) = IN_SET; + } + if(statusChanged) + { + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei < nv && colStatus(nei) == s) + colUpdateBitset.set(nei); + } + } + //else: still undecided + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_view_t worklist; + bitset_t colUpdateBitset; + }; + + struct UpdateWorklistFunctor + { + UpdateWorklistFunctor(const status_view_t& rowStatus_, const lno_view_t& oldWorklist_, const lno_view_t& newWorklist_) + : rowStatus(rowStatus_), oldWorklist(oldWorklist_), newWorklist(newWorklist_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, bool finalPass) const + { + //processing row i + lno_t i = oldWorklist(w); + //Bit i will be set when it's decided IN_SET/OUT_SET. 
+ //If clear, vertex i needs to be processed still. + status_t s = rowStatus(i); + if(s != IN_SET && s != OUT_SET) + { + if(finalPass) + newWorklist(lcount) = i; + lcount++; + } + } + + status_view_t rowStatus; + lno_view_t oldWorklist; + lno_view_t newWorklist; + }; + + struct ColRefreshWorklist + { + ColRefreshWorklist(const bitset_t& colUpdateBitset_, const lno_view_t& refreshList_) + : colUpdateBitset(colUpdateBitset_), refreshList(refreshList_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, bool finalPass) const + { + if(colUpdateBitset.test(i)) + { + if(finalPass) + { + refreshList(lindex) = i; + colUpdateBitset.reset(i); + } + lindex++; + } + } + + bitset_t colUpdateBitset; + lno_view_t refreshList; + }; + + struct RefreshColStatus + { + RefreshColStatus(const lno_view_t& worklist_, const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : worklist(worklist_), rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t col = worklist(w); + status_t minNeiStat = OUT_SET; + size_type rowBegin = rowmap(col); + size_type rowEnd = rowmap(col + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? col : entries(j); + if(nei >= nv) + continue; + status_t neiStat = rowStatus(nei); + if(neiStat < minNeiStat) + minNeiStat = neiStat; + } + if(minNeiStat == IN_SET) + minNeiStat = OUT_SET; + colStatus(col) = minNeiStat; + } + + lno_view_t worklist; + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const lno_view_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + lno_view_t worklist; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); + lno_t workRemain = numVerts; + int numIter = 0; + while(workRemain) + { + //do another iteration + Kokkos::parallel_for(range_pol(0, workRemain), + IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, numVerts, worklist1, colUpdateBitset)); + //And refresh the column statuses using the other worklist. + lno_t colsToRefresh; + Kokkos::parallel_scan(range_pol(0, numVerts), + ColRefreshWorklist(colUpdateBitset, worklist2), colsToRefresh); + Kokkos::parallel_for(range_pol(0, colsToRefresh), + RefreshColStatus(worklist2, rowStatus, colStatus, rowmap, entries, numVerts)); + //then build the next worklist with a scan. Also get the length of the next worklist. 
+ lno_t newWorkRemain = 0; + Kokkos::parallel_scan(range_pol(0, workRemain), + UpdateWorklistFunctor(rowStatus, worklist1, worklist2), + newWorkRemain); + //Finally, flip the worklists + std::swap(worklist1, worklist2); + workRemain = newWorkRemain; + numIter++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. + lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; + lno_t minDegree; + lno_t maxDegree; + //Bitset representing columns whose status needs to be recomputed + //These bits are cleared after each refresh. + bitset_t colUpdateBitset; + lno_view_t worklist1; + lno_view_t worklist2; +}; + +template +struct D2_MIS_Coarsening +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), + numVerts(rowmap.extent(0) - 1), + labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts) + { + Kokkos::deep_copy(labels, (lno_t) -1); + } + + //Phase 1 (over 0...numClusters) labels roots and immediate neighbors of roots. 
+ struct Phase1Functor + { + Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + lno_t root = mis2(i); + size_type rowBegin = rowmap(root); + size_type rowEnd = rowmap(root + 1); + labels(root) = i; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != root && nei < numVerts) + { + labels(nei) = i; + } + } + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; + }; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + //Phase 2 (over 0...numVerts) joins unlabeled vertices to the smallest adjacent cluster + struct Phase2Functor + { + Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + if(labels(i) != (lno_t) -1) + return; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t cluster = -1; + uint32_t minScore = ~(uint32_t) 0; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei == i || nei >= numVerts) + continue; + lno_t neiCluster = labels(nei); + if(neiCluster != -1 && neiCluster != cluster) + { + //check if this cluster is smaller + uint32_t score = xorshiftHash(i + xorshiftHash(neiCluster)); + if(score < minScore) + { + cluster = neiCluster; + minScore = score; + } + } + } + labels(i) = cluster; + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; + }; + + labels_t compute() + { + lno_t numClusters = mis2.extent(0); + Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels)); + Kokkos::parallel_for(range_pol(0, numVerts), Phase2Functor(rowmap, entries, mis2, numVerts, labels)); + return labels; + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; +}; + +}}} + +#endif diff --git a/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp new file mode 100644 index 0000000000..51fa777c79 --- /dev/null +++ b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -0,0 +1,303 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP +#define KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP + +namespace KokkosGraph { +namespace Impl { + +template +struct ExplicitGraphCoarsening +{ + using exec_space = typename device_t::execution_space; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_member_t = typename team_pol::member_type; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + + struct ClusterSizeFunctor + { + ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) + : counts(counts_), vertClusters(vertClusters_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + Kokkos::atomic_increment(&counts(vertClusters(i))); + } + ordinal_view_t counts; + labels_t vertClusters; + }; + + struct FillClusterVertsFunctor + { + FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const ordinal_view_t& insertCounts_) + : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + lno_t cluster = vertClusters(i); + lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + clusterVerts(offset) = i; + } + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + ordinal_view_t insertCounts; + }; + + struct BuildCrossClusterMaskFunctor + { + BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& mask_) + : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) + {} + + //Used a fixed-size hash set in shared memory + KOKKOS_INLINE_FUNCTION constexpr int tableSize() const + { + //Should always be a power-of-two, so that X % tableSize() reduces to a bitwise and. + return 512; + } + + //Given a cluster index, get the hash table index. + //This is the 32-bit xorshift RNG, but it works as a hash function. 
+ KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(lno_t cluster) const + { + unsigned x = cluster; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + KOKKOS_INLINE_FUNCTION bool lookup(lno_t cluster, int* table) const + { + unsigned h = xorshiftHash(cluster); + for(unsigned i = h; i < h + 2; i++) + { + if(table[i % tableSize()] == cluster) + return true; + } + return false; + } + + //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) + //by inserting nei into the table. + KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const + { + unsigned h = xorshiftHash(nei); + for(unsigned i = h; i < h + 2; i++) + { + if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) + return true; + } + return false; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const + { + lno_t cluster = t.league_rank(); + lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); + //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. + //If it fills up (very unlikely) then just count every remaining edge going to another cluster + //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. + //each thread handles a cluster + int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); + //mark every entry as cluster (self-loop) to represent free/empty + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), + [&](const lno_t i) + { + table[i] = cluster; + }); + t.team_barrier(); + //now, for each row belonging to the cluster, iterate through the neighbors + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), + [&] (const lno_t i) + { + lno_t row = clusterVerts(clusterOffsets(cluster) + i); + lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), + [&] (const lno_t j) + { + lno_t nei = colinds(rowmap(row) + j); + //Remote neighbors are not included + if(nei >= numRows) + return; + lno_t neiCluster = vertClusters(nei); + if(neiCluster != cluster) + { + //Have a neighbor. Try to find it in the table. + if(!lookup(neiCluster, table)) + { + //Not in the table. Try to insert it. 
+ insert(cluster, neiCluster, table); + //Whether or not insertion succeeded, + //this is a cross-cluster edge possibly not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); + } + + size_t team_shmem_size(int teamSize) const + { + return tableSize() * sizeof(int); + } + + lno_t numRows; + fine_rowmap_t rowmap; + fine_entries_t colinds; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + bitset_t mask; + }; + + struct FillClusterEntriesFunctor + { + FillClusterEntriesFunctor( + const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const coarse_rowmap_t& clusterRowmap_, const coarse_entries_t& clusterEntries_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& edgeMask_) + : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) + {} + //Run this scan over entries in clusterVerts (reordered point rows) + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, const bool& finalPass) const + { + lno_t numRows = rowmap.extent(0) - 1; + lno_t row = clusterVerts(i); + size_type rowStart = rowmap(row); + size_type rowEnd = rowmap(row + 1); + lno_t cluster = vertClusters(row); + lno_t clusterStart = clusterOffsets(cluster); + //Count the number of entries in this row. + //This is how much lcount will be increased by, + //yielding the offset corresponding to + //these point entries in the cluster entries. + lno_t rowEntries = 0; + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + rowEntries++; + } + if(finalPass) + { + //if this is the last row in the cluster, update the upper bound in clusterRowmap + if(i == clusterStart) + { + clusterRowmap(cluster) = lcount; + } + lno_t clusterEdge = lcount; + //populate clusterEntries for these edges + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + { + clusterEntries(clusterEdge++) = vertClusters(colinds(j)); + } + } + } + //update the scan result at the end (exclusive) + lcount += rowEntries; + if(i == numRows - 1 && finalPass) + { + //on the very last row, set the last entry of the cluster rowmap + clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; + } + } + fine_rowmap_t rowmap; + fine_entries_t colinds; + coarse_rowmap_t clusterRowmap; + coarse_entries_t clusterEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + const_bitset_t edgeMask; + }; + + //Constructor just does the computation and outputs to coarseRowmap, coarseEntries. 
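// Illustrative usage sketch (editorial, not part of this patch): the intended call pattern,
// assuming the labels view comes from a clustering such as D2_MIS_Coarsening::compute() and
// that numCoarseVerts equals the number of distinct cluster labels. Template arguments are
// elided here as "<...>".
//
//   ExplicitGraphCoarsening<...> egc(fineRowmap, fineEntries, labels, numCoarseVerts);
//   auto coarseRowmap  = egc.coarseRowmap;   // CRS row offsets of the cluster graph
//   auto coarseEntries = egc.coarseEntries;  // cluster-to-cluster adjacency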
+ ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, lno_t numCoarseVerts) + { + lno_t numFineVerts = fineRowmap.extent(0); + if(numFineVerts <= 1) + { + coarseRowmap = coarse_rowmap_t(); + coarseEntries = coarse_entries_t(); + return; + } + numFineVerts--; + clusterOffsets = ordinal_view_t("Cluster offsets", numCoarseVerts + 1); + clusterVerts = ordinal_view_t(Kokkos::ViewAllocateWithoutInitializing("Cluster verts"), numFineVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); + { + ordinal_view_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); + } + //Determine the set of edges (in the point graph) that cross between two distinct clusters + int vectorSize = KokkosKernels::Impl::kk_get_suggested_vector_size(numFineVerts, fineEntries.extent(0), KokkosKernels::Impl::kk_get_exec_space_type()); + bitset_t crossClusterEdgeMask(fineEntries.extent(0)); + size_type numClusterEdges; + { + BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask); + int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. + int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); + Kokkos::parallel_for(team_pol(numCoarseVerts, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); + numClusterEdges = crossClusterEdgeMask.count(); + } + coarseRowmap = coarse_rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph rowmap"), numCoarseVerts + 1); + coarseEntries = coarse_entries_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph colinds"), numClusterEdges); + Kokkos::parallel_scan(range_pol(0, numFineVerts), FillClusterEntriesFunctor + (fineRowmap, fineEntries, coarseRowmap, coarseEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask)); + } + + coarse_rowmap_t coarseRowmap; + coarse_entries_t coarseEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; +}; + +}} + +#endif diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e9596fb772..ced3476539 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,6 +50,7 @@ #include "cusparse.h" #include "KokkosKernels_SparseUtils_cusparse.hpp" #include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_impl.hpp" namespace KokkosSparse { namespace Impl { @@ -64,8 +65,18 @@ namespace Impl { const YVector& y) { using KAT = Kokkos::Details::ArithTraits; - std::cout << "It is currently not possible to use the native SpMV implementation" - " when cuSPARSE is enabled" << std::endl; + if (beta == KAT::zero ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } } template @@ -84,9 +95,24 @@ namespace Impl { 
cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); /* Set the operation mode */ - cusparseOperation_t myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; - if(mode[0] == Transpose[0]) {myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE;} - else if(mode[0] == ConjugateTranspose[0]) {myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;} + cusparseOperation_t myCusparseOperation; + switch(toupper(mode[0])) + { + case 'N': + myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; + break; + case 'T': + myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; + break; + case 'H': + myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + break; + default: + { + std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; + throw std::invalid_argument("Invalid mode"); + } + } #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) @@ -239,8 +265,9 @@ namespace Impl { const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - if(controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + bool fallback = *mode == 'C' || ((*mode == 'T' || *mode == 'H') && 9000 <= CUDA_VERSION && CUDA_VERSION < 10000); \ + if((controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") || fallback) { \ + std::string label = "KokkosSparse::spmv[NATIVE," + Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_native(controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ @@ -252,6 +279,11 @@ namespace Impl { } \ } \ }; + +//BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate transpose modes. +//No version of cuSPARSE supports mode C (conjugate, non transpose). +//In those cases, fall back to KokkosKernels native spmv. 
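// Illustrative sketch (editorial, not part of this patch): how a caller would opt into the
// native SpMV path that the fallback above also selects automatically. This assumes the
// Controls object exposes setParameter(), mirroring the isParameter()/getParameter() calls
// used in the macro above.
//
//   KokkosKernels::Experimental::Controls controls;
//   controls.setParameter("algorithm", "native");   // skip cuSPARSE, use spmv_native
//   KokkosSparse::spmv(controls, "N", alpha, A, x, beta, y);
//
// Without the parameter, cuSPARSE handles mode 'N' (and 'T'/'H' on CUDA 10 or newer), while
// mode 'C', or 'T'/'H' under CUDA 9.x, is routed to the native implementation as described above.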
+ #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, Kokkos::CudaSpace, true) diff --git a/src/kokkoskernels_eti.cmake b/src/kokkoskernels_eti.cmake index 1179ec9c41..04a6f412c9 100644 --- a/src/kokkoskernels_eti.cmake +++ b/src/kokkoskernels_eti.cmake @@ -126,7 +126,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) CMAKE_PARSE_ARGUMENTS(ETI "" "HEADER_LIST;SOURCE_LIST" - "TYPE_LISTS" + "TYPE_LISTS;COMPONENTS" ${ARGN}) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) @@ -134,26 +134,38 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") - KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) - FOREACH(ETI ${${FUNCTION_NAME}_eti}) - SET(MACRO_STRING "(") - FOREACH(TYPE_NAME ${${ETI}}) - STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + # if this is tied to particular components + # see whether those components are enabled + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${ETI_COMPONENTS} + OUTPUT_VARIABLE ETI_COMP_IS_ENABLED + ) + + IF (ETI_COMP_IS_ENABLED) + MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") + KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) + FOREACH(ETI ${${FUNCTION_NAME}_eti}) + SET(MACRO_STRING "(") + FOREACH(TYPE_NAME ${${ETI}}) + STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + ENDFOREACH() + STRING(APPEND MACRO_STRING ")") + STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) + #Make a single header file for all instances + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + #Make a different source file for each instance + SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") + SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") + SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) + LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) ENDFOREACH() - STRING(APPEND MACRO_STRING ")") - STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) - #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - #Make a different source file for each instance - SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") - SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") - SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - ENDFOREACH() + ELSE() + MESSAGE(STATUS "Skipping ETI files for ${FUNCTION_NAME} because not all components are enabled") + ENDIF() SET(AVAIL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") 
SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") @@ -163,7 +175,6 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} diff --git a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp index 9cfd97afbb..61f3550275 100644 --- a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp +++ b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp @@ -415,17 +415,10 @@ class BlockCrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef BlockCrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index bba54c613c..d866a63601 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -104,6 +104,12 @@ inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif +#ifdef KOKKOS_ENABLE_HIP +template<> +inline int RowsPerThread(const int NNZPerRow) { + return 1; +} +#endif // A simple struct for storing a kernel launch configuration. // This is currently used by CrsMatrix to allow the user to have some control @@ -406,17 +412,10 @@ class CrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef CrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. 
diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index e4ded70d54..fd4a9b58d9 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -57,20 +57,16 @@ namespace KokkosSparse{ enum GSAlgorithm{GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER, GS_TWOSTAGE}; enum GSDirection{GS_FORWARD, GS_BACKWARD, GS_SYMMETRIC}; - enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_BALLOON, CLUSTER_CUTHILL_MCKEE, CLUSTER_DO_NOTHING, NUM_CLUSTERING_ALGORITHMS}; + enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_MIS2, CLUSTER_BALLOON, NUM_CLUSTERING_ALGORITHMS}; inline const char* getClusterAlgoName(ClusteringAlgorithm ca) { switch(ca) { - case CLUSTER_DEFAULT: - return "Default"; case CLUSTER_BALLOON: return "Balloon"; - case CLUSTER_CUTHILL_MCKEE: - return "Cuthill-McKee"; - case CLUSTER_DO_NOTHING: - return "No-op"; + case CLUSTER_MIS2: + return "MIS(2)"; default:; } return "INVALID CLUSTERING ALGORITHM"; @@ -192,12 +188,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; @@ -282,53 +274,11 @@ namespace KokkosSparse{ void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) this->algorithm_type = GS_TEAM; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: GS_TEAM" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif } ~PointGaussSeidelHandle() = default; @@ -449,13 +399,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; @@ -572,33 +517,7 @@ namespace KokkosSparse{ bool use_teams() const { - bool return_value = false; -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value) { - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - return_value = true; - } -#endif -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - return_value = false; - } -#endif - return return_value; + return KokkosKernels::Impl::kk_is_gpu_exec_space(); } ~ClusterGaussSeidelHandle() = default; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 820afbbaa3..1efae2c1a7 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -47,7 +47,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_Sorting.hpp" -#include +#include "Kokkos_ArithTraits.hpp" namespace KokkosSparse { namespace Experimental { @@ -86,10 +86,10 @@ struct SortedCountEntries { Bcolinds(Bcolinds_), Crowcounts(Crowcounts_) {} - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits::max(); - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow 
size_type numEntries = 0; size_type ai = 0; @@ -202,67 +202,6 @@ struct UnmergedSumFunctor { CcolindsT ABperm; }; -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - CcolindsAux("C colind aux", Ccolinds_.extent(0)), - ABperm(ABperm_), - ABpermAux("AB perm aux", ABperm_.extent(0)) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - ordinal_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - using lno_t = typename CcolindsT::non_const_value_type; - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*)Ccolinds.data() + rowStart, - (unsigned_lno_t*)CcolindsAux.data() + rowStart, - ABperm.data() + rowStart, ABpermAux.data() + rowStart, rowNum); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT CcolindsAux; - CcolindsT ABperm; - CcolindsT ABpermAux; -}; - -#ifdef KOKKOS_ENABLE_CUDA -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, CcolindsT& Ccolinds_, - CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), Ccolinds(Ccolinds_), ABperm(ABperm_) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - size_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2< - size_type, typename CcolindsT::non_const_value_type, - typename CcolindsT::non_const_value_type, TeamMember>( - Ccolinds.data() + rowStart, ABperm.data() + rowStart, rowNum, t); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; -#endif - template struct MergeEntriesFunctor { @@ -478,7 +417,6 @@ template struct SortedNumericSumFunctor { using CscalarT = typename CvaluesT::non_const_value_type; - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits::max(); SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, const BrowptrsT& Browptrs_, @@ -502,6 +440,8 @@ struct SortedNumericSumFunctor { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow size_type ai = 0; size_type bi = 0; diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index b34d349457..f517682d5e 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -504,8 +504,6 @@ class SPGEMMHandle{ return this->cuSPARSEHandle; } #endif - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -543,6 +541,15 @@ class SPGEMMHandle{ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + this->algorithm_type = SPGEMM_KK; +#ifdef VERBOSE + std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" << std::endl; +#endif + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ this->algorithm_type = SPGEMM_SERIAL; @@ -604,67 +611,20 @@ class SPGEMMHandle{ //suggested_vector_size_=this->suggested_vector_size = 1; //return; if (this->suggested_team_size && this->suggested_vector_size) { + //already set in the handle suggested_vector_size_ = this->suggested_vector_size; suggested_team_size_ = this->suggested_team_size; return; } -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - this->suggested_vector_size = nnz / double (nr) + 0.5; - - if (this->suggested_vector_size <= 3){ - this->suggested_vector_size = 2; - } - else if (this->suggested_vector_size <= 6){ - this->suggested_vector_size = 4; - } - else if (this->suggested_vector_size <= 12){ - this->suggested_vector_size = 8; - } - else if (this->suggested_vector_size <= 24){ - this->suggested_vector_size = 16; - } - else { - this->suggested_vector_size = 32; - } - - suggested_vector_size_ = this->suggested_vector_size; - this->suggested_team_size= suggested_team_size_ = max_allowed_team_size / this->suggested_vector_size; - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - + //otherwise, recompute team_size/vector_size based on heuristic and save them in the handle + suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size(nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; + else + suggested_team_size = max_allowed_team_size; + this->suggested_vector_size = suggested_vector_size_; + this->suggested_team_size = suggested_vector_size_; } void set_compression_steps(bool isCompressionSingleStep){ diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index e18bc4690f..4c26f5cd6e 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -151,7 +151,7 @@ spmv (KokkosKernels::Experimental::Controls controls, KokkosBlas::scal(y_i, beta, y_i); return; } - return Impl::SPMV< + Impl::SPMV< typename AMatrix_Internal::value_type, typename 
AMatrix_Internal::ordinal_type, typename AMatrix_Internal::device_type, diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 6f230780cc..e73837e3a4 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -57,8 +57,14 @@ #include "KokkosBlas3_trmm.hpp" #include "KokkosBlas_trtri.hpp" -#include "KokkosSparse_sptrsv.hpp" +#include "KokkosBatched_Trtri_Decl.hpp" +#include "KokkosBatched_Trtri_Serial_Impl.hpp" + +#include "KokkosBatched_Trmm_Decl.hpp" +#include "KokkosBatched_Trmm_Serial_Impl.hpp" + +#include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { namespace Experimental { @@ -900,6 +906,7 @@ void sptrsv_supernodal_symbolic( host_graph_t graphU_host, KernelHandle *kernelHandleU) { #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + int nrows = graphL_host.numRows (); double time_seconds = 0.0; Kokkos::Timer timer; Kokkos::Timer tic; @@ -946,7 +953,6 @@ void sptrsv_supernodal_symbolic( int nsuper_merged = nsuper; #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE tic.reset (); - int nrows = graphL_host.numRows (); check_supernode_sizes("Original L-structure", nrows, nsuper, supercols_merged, graphL_host); check_supernode_sizes("Original U-structure", nrows, nsuper, supercols_merged, graphU_host); #endif @@ -1082,6 +1088,89 @@ void sptrsv_supernodal_symbolic( /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ /* Auxiliary functions for numeric computation */ +/* ========================================================================================= */ + struct Tag_SupTrtriFunctor{}; + struct Tag_SupTrtriTrmmFunctor{}; + + template + struct TriSupernodalTrtriFunctor { + + integer_view_host_t supernode_ids; + const input_size_type *nb; + row_map_type hr; + index_type hc; + values_type hv; + + KOKKOS_INLINE_FUNCTION + TriSupernodalTrtriFunctor(integer_view_host_t supernode_ids_, const input_size_type *nb_, + row_map_type& hr_, index_type& hc_, values_type& hv_) : + supernode_ids(supernode_ids_), + nb(nb_), + hr(hr_), + hc(hc_), + hv(hv_) + {} + + // functor: just invert diagonal + KOKKOS_INLINE_FUNCTION + void operator() (const Tag_SupTrtriFunctor&, const int i) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + + int s = supernode_ids(i); + int j1 = nb[s]; + int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + // invert diagonal + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); + } + + // functor: invert diagonal + apply inverse to off-diagonal + KOKKOS_INLINE_FUNCTION + void operator() (const Tag_SupTrtriTrmmFunctor&, const int i) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + using Side = KokkosBatched::Side; + using Trans = KokkosBatched::Trans; + + int s = supernode_ids(i); + int j1 = nb[s]; + 
int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + // invert diagonal + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); + + // apply invse to off-diagonal + //if (nsrow > nscol && invert_offdiag) + { + const scalar_t one (1.0); + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } + } + }; /* ========================================================================================= */ template @@ -1090,10 +1179,11 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, row_map_type& hr, index_type& hc, values_type& hv) { using execution_space = typename values_type::execution_space; - using memory_space = typename execution_space::memory_space; - using values_view_t = typename values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; using range_type = Kokkos::pair; + using integer_view_host_t = Kokkos::View; const scalar_t one (1.0); @@ -1109,46 +1199,136 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, // quick return if (!invert_diag) return; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE Kokkos::Timer timer; double time1 = 0.0; double time2 = 0.0; + double time3 = 0.0; + #endif // ---------------------------------------------------------- // now let's invert some blocks + // > first go through all the supernode columns + // > use KokkosBlas on large blocks, and keep track of small blocks + // > to call batchedBlas on them + int num_batchs = 0; + int size_unblocked = handle->get_supernode_size_unblocked(); + integer_view_host_t supernode_ids ("supernode_batch", nsuper); for (int s2 = 0; s2 < nsuper; s2++) { - int j1 = nb[s2]; - int nsrow = hr(j1+1) - hr(j1); int nscol = nb[s2+1] - nb[s2]; - auto nnzD = hr (j1); - char uplo_char = (lower ? 'L' : 'U'); - char diag_char = (unit_diag ? 'U' : 'N'); + if (nscol >= size_unblocked) { + int j1 = nb[s2]; + int nsrow = hr(j1+1) - hr(j1); - Kokkos::View - viewL (&hv(nnzD), nsrow, nscol); - auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + auto nnzD = hr (j1); + char uplo_char = (lower ? 'L' : 'U'); + char diag_char = (unit_diag ? 
'U' : 'N'); - timer.reset (); - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); - time1 += timer.seconds (); - - if (nsrow > nscol && invert_offdiag) { - char side_char = 'R'; - char tran_char = 'N'; - auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE timer.reset (); - KokkosBlas::trmm (&side_char, &uplo_char, - &tran_char, &diag_char, - one, Ljj, Lij); - time2 += timer.seconds (); + #endif + KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time1 += timer.seconds (); + #endif + + if (nsrow > nscol && invert_offdiag) { + char side_char = 'R'; + char tran_char = 'N'; + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + KokkosBlas::trmm (&side_char, &uplo_char, + &tran_char, &diag_char, + one, Ljj, Lij); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time2 += timer.seconds (); + #endif + } + } + else { + supernode_ids (num_batchs) = s2; + num_batchs ++; } } + // now call batchedBLAS + if (num_batchs > 0) { + using Uplo = KokkosBatched::Uplo; + using Diag = KokkosBatched::Diag; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + if (lower) { + if (unit_diag) { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } else { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } + } else { + if (unit_diag) { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } else { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } + } + #ifdef 
KOKKOS_SPTRSV_SUPERNODE_PROFILE + time3 = timer.seconds (); + #endif + } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE std::cout << " invert_supernodes" << std::endl; + std::cout << " + num supernodes = " << nsuper << " num batchs = " << num_batchs << std::endl; std::cout << " > Time for inversion::trtri : " << time1 << std::endl; std::cout << " > Time for inversion::trmm : " << time2 << std::endl; + std::cout << " > Time for batchs : " << time3 << std::endl; #endif } diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index f2cdee87bb..bb1f96c4e3 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -56,6 +56,8 @@ #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosSparse_partitioning_impl.hpp" +#include "KokkosGraph_MIS2.hpp" +#include "KokkosGraph_ExplicitCoarsening.hpp" namespace KokkosSparse{ namespace Impl{ @@ -80,6 +82,10 @@ namespace KokkosSparse{ typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::nnz_scalar_t nnz_scalar_t; + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's size_type does not match input rowmap's element type."); + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's nnz_lno_t does not match input entries's element type."); typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; typedef typename in_lno_row_view_t::non_const_type non_const_lno_row_view_t; @@ -306,7 +312,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _inverse_diagonal(row); for(int i = 0; i < N; i++) @@ -494,208 +500,6 @@ namespace KokkosSparse{ nnz_lno_t clusterSize; }; - template - struct ClusterSizeFunctor - { - ClusterSizeFunctor(nnz_view_t& counts_, nnz_view_t& vertClusters_) - : counts(counts_), vertClusters(vertClusters_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - Kokkos::atomic_increment(&counts(vertClusters(i))); - } - nnz_view_t counts; - nnz_view_t vertClusters; - }; - - template - struct FillClusterVertsFunctor - { - FillClusterVertsFunctor(nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, nnz_view_t& insertCounts_) - : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - nnz_lno_t cluster = vertClusters(i); - nnz_lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); - clusterVerts(offset) = i; - } - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - nnz_view_t insertCounts; - }; - - template - struct BuildCrossClusterMaskFunctor - { - BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_) - : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) - {} - - //Used a fixed-size hash set in shared memory - KOKKOS_INLINE_FUNCTION constexpr int tableSize() const - { - //Should always be a power-of-two, so that X % 
tableSize() reduces to a bitwise and. - return 512; - } - - //Given a cluster index, get the hash table index. - //This is the 32-bit xorshift RNG, but it works as a hash function. - KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(nnz_lno_t cluster) const - { - unsigned x = cluster; - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - KOKKOS_INLINE_FUNCTION bool lookup(nnz_lno_t cluster, int* table) const - { - unsigned h = xorshiftHash(cluster); - for(unsigned i = h; i < h + 2; i++) - { - if(table[i % tableSize()] == cluster) - return true; - } - return false; - } - - //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) - //by inserting nei into the table. - KOKKOS_INLINE_FUNCTION bool insert(nnz_lno_t cluster, nnz_lno_t nei, int* table) const - { - unsigned h = xorshiftHash(nei); - for(unsigned i = h; i < h + 2; i++) - { - if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) - return true; - } - return false; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const - { - nnz_lno_t cluster = t.league_rank(); - nnz_lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); - //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. - //If it fills up (very unlikely) then just count every remaining edge going to another cluster - //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. - //each thread handles a cluster - int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); - //mark every entry as cluster (self-loop) to represent free/empty - Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), - [&](const nnz_lno_t i) - { - table[i] = cluster; - }); - t.team_barrier(); - //now, for each row belonging to the cluster, iterate through the neighbors - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), - [&] (const nnz_lno_t i) - { - nnz_lno_t row = clusterVerts(clusterOffsets(cluster) + i); - nnz_lno_t rowDeg = rowmap(row + 1) - rowmap(row); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), - [&] (const nnz_lno_t j) - { - nnz_lno_t nei = colinds(rowmap(row) + j); - //Remote neighbors are not included - if(nei >= numRows) - return; - nnz_lno_t neiCluster = vertClusters(nei); - if(neiCluster != cluster) - { - //Have a neighbor. Try to find it in the table. - if(!lookup(neiCluster, table)) - { - //Not in the table. Try to insert it. 
- insert(cluster, neiCluster, table); - //Whether or not insertion succeeded, - //this is a cross-cluster edge possibly not seen before - mask.set(rowmap(row) + j); - } - } - }); - }); - } - - size_t team_shmem_size(int teamSize) const - { - return tableSize() * sizeof(int); - } - - nnz_lno_t numRows; - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - bitset_t mask; - }; - - template - struct FillClusterEntriesFunctor - { - FillClusterEntriesFunctor( - Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterRowmap_, nnz_view_t& clusterEntries_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& edgeMask_) - : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) - {} - //Run this scan over entries in clusterVerts (reordered point rows) - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i, nnz_lno_t& lcount, const bool& finalPass) const - { - nnz_lno_t numRows = rowmap.extent(0) - 1; - nnz_lno_t row = clusterVerts(i); - size_type rowStart = rowmap(row); - size_type rowEnd = rowmap(row + 1); - nnz_lno_t cluster = vertClusters(row); - nnz_lno_t clusterStart = clusterOffsets(cluster); - //Count the number of entries in this row. - //This is how much lcount will be increased by, - //yielding the offset corresponding to - //these point entries in the cluster entries. - nnz_lno_t rowEntries = 0; - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - rowEntries++; - } - if(finalPass) - { - //if this is the last row in the cluster, update the upper bound in clusterRowmap - if(i == clusterStart) - { - clusterRowmap(cluster) = lcount; - } - nnz_lno_t clusterEdge = lcount; - //populate clusterEntries for these edges - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - { - clusterEntries(clusterEdge++) = vertClusters(colinds(j)); - } - } - } - //update the scan result at the end (exclusive) - lcount += rowEntries; - if(i == numRows - 1 && finalPass) - { - //on the very last row, set the last entry of the cluster rowmap - clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; - } - } - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterRowmap; - nnz_view_t clusterEntries; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - const_bitset_t edgeMask; - }; - //Assign cluster labels to vertices, given that the vertices are naturally //ordered so that contiguous groups of vertices form decent clusters. 
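For reference, a minimal standalone sketch of the hashing scheme used by the BuildCrossClusterMaskFunctor removed above: cluster ids are hashed with a 32-bit xorshift and probed in a small power-of-two table, so the modulo reduces to a bitwise AND. The function names below are illustrative only, not part of the library.

inline unsigned xorshift_hash(unsigned x) {
  // 32-bit xorshift, used here purely as a cheap hash function
  x ^= x << 13;
  x ^= x >> 17;
  x ^= x << 5;
  return x;
}

// Probe two consecutive slots of a power-of-two sized table.
// Because tableSize is a power of two, i % tableSize == (i & (tableSize - 1)).
inline bool table_contains(const int* table, unsigned tableSize, int key) {
  unsigned h = xorshift_hash(static_cast<unsigned>(key));
  for (unsigned i = h; i < h + 2; i++) {
    if (table[i & (tableSize - 1)] == key)
      return true;
  }
  return false;
}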
template @@ -740,9 +544,9 @@ namespace KokkosSparse{ using nnz_view_t = nnz_lno_persistent_work_view_t; using in_rowmap_t = const_lno_row_view_t; using in_colinds_t = const_lno_nnz_view_t; - using rowmap_t = Kokkos::View; + using rowmap_t = Kokkos::View; using colinds_t = Kokkos::View; - using raw_rowmap_t = Kokkos::View>; + using raw_rowmap_t = Kokkos::View>; using raw_colinds_t = Kokkos::View>; auto gsHandle = get_gs_handle(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE @@ -767,8 +571,6 @@ namespace KokkosSparse{ //Now that a symmetric graph is available, build the cluster graph (also symmetric) nnz_lno_t clusterSize = gsHandle->get_cluster_size(); nnz_lno_t numClusters = (num_rows + clusterSize - 1) / clusterSize; - nnz_view_t clusterOffsets("Cluster offsets", numClusters + 1); - nnz_view_t clusterVerts("Cluster -> vertices", num_rows); raw_rowmap_t raw_sym_xadj; raw_colinds_t raw_sym_adj; if(this->is_symmetric) @@ -784,15 +586,13 @@ namespace KokkosSparse{ nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); if(clusterAlgo == CLUSTER_DEFAULT) - clusterAlgo = CLUSTER_BALLOON; + clusterAlgo = CLUSTER_MIS2; switch(clusterAlgo) { - case CLUSTER_CUTHILL_MCKEE: + case CLUSTER_MIS2: { - RCM rcm(num_rows, raw_sym_xadj, raw_sym_adj); - nnz_view_t cmOrder = rcm.cuthill_mckee(); - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), ReorderedClusteringFunctor(vertClusters, cmOrder, clusterSize)); + vertClusters = KokkosGraph::Experimental::graph_mis2_coarsen + (raw_sym_xadj, raw_sym_adj, numClusters, KokkosGraph::MIS2_FAST); break; } case CLUSTER_BALLOON: @@ -801,12 +601,6 @@ namespace KokkosSparse{ vertClusters = balloon.run(clusterSize); break; } - case CLUSTER_DO_NOTHING: - { - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), NopVertClusteringFunctor(vertClusters, clusterSize)); - break; - } case CLUSTER_DEFAULT: { throw std::logic_error("Logic to choose default clustering algorithm is incorrect"); @@ -818,46 +612,12 @@ namespace KokkosSparse{ std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); #endif - //Construct the cluster offset and vertex array. These allow fast iteration over all vertices in a given cluster. 
- Kokkos::parallel_for(my_exec_space(0, num_rows), ClusterSizeFunctor(clusterOffsets, vertClusters)); - KokkosKernels::Impl::exclusive_parallel_prefix_sum(numClusters + 1, clusterOffsets); - { - nnz_view_t tempInsertCounts("Temporary cluster insert counts", numClusters); - Kokkos::parallel_for(my_exec_space(0, num_rows), FillClusterVertsFunctor(clusterOffsets, clusterVerts, vertClusters, tempInsertCounts)); - } -#if KOKKOSSPARSE_IMPL_PRINTDEBUG - { - auto clusterOffsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterOffsets); - auto clusterVertsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterVerts); - puts("Clusters (cluster #, and vertex #s):"); - for(nnz_lno_t i = 0; i < numClusters; i++) - { - printf("%d: ", (int) i); - for(nnz_lno_t j = clusterOffsetsHost(i); j < clusterOffsetsHost(i + 1); j++) - { - printf("%d ", (int) clusterVerts(j)); - } - putchar('\n'); - } - printf("\n\n\n"); - } -#endif - //Determine the set of edges (in the point graph) that cross between two distinct clusters - int vectorSize = this->handle->get_suggested_vector_size(num_rows, raw_sym_adj.extent(0)); - bitset_t crossClusterEdgeMask(raw_sym_adj.extent(0)); - size_type numClusterEdges; - { - BuildCrossClusterMaskFunctor - buildEdgeMask(raw_sym_xadj, raw_sym_adj, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask); - int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. - int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); - Kokkos::parallel_for(team_policy_t(numClusters, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); - numClusterEdges = crossClusterEdgeMask.count(); - } - nnz_view_t clusterRowmap = nnz_view_t("Cluster graph rowmap", numClusters + 1); - nnz_view_t clusterEntries = nnz_view_t("Cluster graph colinds", numClusterEdges); - Kokkos::parallel_scan(my_exec_space(0, num_rows), FillClusterEntriesFunctor - (raw_sym_xadj, raw_sym_adj, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask)); + rowmap_t clusterRowmap; + colinds_t clusterEntries; + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + KokkosGraph::Experimental::graph_explicit_coarsen_with_inverse_map, raw_rowmap_t, raw_colinds_t, nnz_view_t, rowmap_t, colinds_t, nnz_view_t> + (raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, false); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; timer.reset(); @@ -892,7 +652,7 @@ namespace KokkosSparse{ Kokkos::deep_copy(colors, h_colors); #else //Create a handle that uses nnz_lno_t as the size_type, since the cluster graph should never be larger than 2^31 entries. 
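For reference, a serial host-side sketch of what the explicit coarsening call above computes: given a vertex-to-cluster labeling, build the graph whose vertices are clusters and whose edges connect clusters joined by at least one cross-cluster fine edge. This is illustrative only and not the KokkosGraph implementation; whether self-loops are kept is controlled by the final boolean argument of the real routine.

#include <set>
#include <vector>

void coarsen(int numRows, int numClusters,
             const std::vector<int>& rowmap, const std::vector<int>& colinds,
             const std::vector<int>& vertClusters,
             std::vector<int>& clusterRowmap, std::vector<int>& clusterEntries) {
  // Collect, for each cluster, the set of distinct neighboring clusters.
  std::vector<std::set<int>> neigh(numClusters);
  for (int v = 0; v < numRows; v++) {
    for (int j = rowmap[v]; j < rowmap[v + 1]; j++) {
      int c = vertClusters[v];
      int nc = vertClusters[colinds[j]];
      if (c != nc)
        neigh[c].insert(nc);  // keep only cross-cluster edges
    }
  }
  // Flatten the neighbor sets into CRS form (clusterRowmap, clusterEntries).
  clusterRowmap.assign(numClusters + 1, 0);
  clusterEntries.clear();
  for (int c = 0; c < numClusters; c++) {
    for (int nc : neigh[c])
      clusterEntries.push_back(nc);
    clusterRowmap[c + 1] = (int)clusterEntries.size();
  }
}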
- KokkosKernels::Experimental::KokkosKernelsHandle kh; + HandleType kh; kh.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); KokkosGraph::Experimental::graph_color_symbolic(&kh, numClusters, numClusters, clusterRowmap, clusterEntries); //retrieve colors diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 5c50815f34..d5c111862f 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -276,7 +276,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(row); for(int i = 0; i < N; i++) @@ -420,7 +420,7 @@ namespace KokkosSparse{ product += product2; //update the new vector entries. - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); _Xvector(block_row_index, vec) += omega * (_Yvector(block_row_index, vec) - product) * invDiagonalVal; @@ -484,7 +484,7 @@ namespace KokkosSparse{ Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&] (const nnz_lno_t& ii) { #if KOKKOSSPARSE_IMPL_PRINTDEBUG - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { for(nnz_lno_t i = 0; i < block_size; diagonal_positions[i++] = -1); }); #endif @@ -542,7 +542,7 @@ namespace KokkosSparse{ valueToUpdate += all_shared_memory[colind] * _adj_vals(current_row_begin + colind); }, product); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); @@ -554,9 +554,8 @@ namespace KokkosSparse{ } }); -#if !defined(__CUDA_ARCH__) #if KOKKOSSPARSE_IMPL_PRINTDEBUG - if (/*i == 0 && ii == 1*/ ii == 0 || (block_size == 1 && ii < 2) ){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space() && (ii == 0 || (block_size == 1 && ii < 2))){ std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; for (nnz_lno_t z = 0; z < block_row_size; ++z){ @@ -573,7 +572,6 @@ namespace KokkosSparse{ std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i, vec) << std::endl << std::endl<< std::endl; } -#endif #endif //row_begin += row_size * block_size; } @@ -737,31 +735,16 @@ namespace KokkosSparse{ timer.reset(); #endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - for (nnz_lno_t i = 0; i < numColors; ++i){ - nnz_lno_t color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - - if (color_index_begin + 1 >= color_index_end ) continue; - auto colorsubset = - subview(color_adj, Kokkos::pair (color_index_begin, color_index_end)); - MyExecSpace().fence(); - Kokkos::sort (colorsubset); - //TODO: MD 08/2017: If I remove the below fence, code fails on cuda. - //I do not see any reason yet it to fail. - MyExecSpace().fence(); - } - } -#endif - - MyExecSpace().fence(); + // TODO BMK: Why are the vertices in each color set only being sorted on GPU? + // Wouldn't it have a locality benefit on CPU too? 
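For reference, a short sketch of the effect of the per-color sort discussed in the TODO above and performed by the sort_crs_graph call below: the vertices of each color set, stored in CRS form, are put into ascending order. This host-side version is illustrative only, not the KokkosKernels implementation.

#include <algorithm>
#include <cstddef>
#include <vector>

// Sort the entries of each "row" (here, the vertices of each color set)
// into ascending order, given CRS-style offsets xadj.
void sort_each_row(const std::vector<int>& xadj, std::vector<int>& adj) {
  for (std::size_t row = 0; row + 1 < xadj.size(); ++row)
    std::sort(adj.begin() + xadj[row], adj.begin() + xadj[row + 1]);
}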
+ if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + KokkosKernels::Impl::sort_crs_graph(color_xadj, color_adj); + MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - std::cout << "SORT_TIME:" << timer.seconds() << std::endl; - timer.reset(); - //std::cout << "sort" << std::endl; + std::cout << "SORT_TIME:" << timer.seconds() << std::endl; + timer.reset(); #endif + } row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1); nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows ); @@ -843,8 +826,7 @@ namespace KokkosSparse{ nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); - if (ex_sp != KokkosKernels::Impl::Exec_CUDA){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; while(l1mem < level_1_mem){ @@ -882,12 +864,11 @@ namespace KokkosSparse{ num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size)); //std::cout << "num_big_rows:" << num_big_rows << std::endl; -#if defined( KOKKOS_ENABLE_CUDA ) - if (ex_sp == KokkosKernels::Impl::Exec_CUDA) { + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //check if we have enough memory for this. lower the concurrency if we do not have enugh memory. size_t free_byte ; size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte){ num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem; @@ -900,7 +881,6 @@ namespace KokkosSparse{ num_big_rows = min_chunk_size; } } -#endif } } @@ -1165,7 +1145,7 @@ namespace KokkosSparse{ // change fill_matrix_numeric so that they store the internal matrix as above. // the rest will wok fine. 
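For reference, a minimal sketch of the kind of compile-time check that kk_is_gpu_exec_space<ExecSpace>() performs in these hunks, which lets call sites drop backend-specific preprocessor guards. The trait name and specializations below are assumptions for illustration, not the KokkosKernels implementation.

#include <Kokkos_Core.hpp>

// Default: not a GPU execution space.
template <typename ExecSpace>
constexpr bool is_gpu_exec_space() { return false; }

#ifdef KOKKOS_ENABLE_CUDA
template <>
constexpr bool is_gpu_exec_space<Kokkos::Cuda>() { return true; }
#endif

#ifdef KOKKOS_ENABLE_HIP
template <>
constexpr bool is_gpu_exec_space<Kokkos::Experimental::HIP>() { return true; }
#endif

// At a call site the branch is a compile-time constant, so no backend #ifdefs are needed:
//   if (is_gpu_exec_space<MyExecSpace>()) { /* GPU path */ } else { /* CPU path */ }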
- if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), fill_matrix_numeric( @@ -1209,7 +1189,7 @@ namespace KokkosSparse{ block_size, block_matrix_size); - if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA || block_size > 1){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1){ Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd ); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index ddfcb70f92..af10787c46 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -74,555 +74,6 @@ struct IotaFunctor View v; }; -template -struct RCM -{ - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef typename HandleType::size_type size_type; - typedef typename HandleType::nnz_lno_t nnz_lno_t; - - typedef typename lno_row_view_t::const_type const_lno_row_view_t; - typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; - typedef typename non_const_lno_row_view_t::value_type offset_t; - - typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; - typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; - typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - - typedef nnz_lno_persistent_work_view_t nnz_view_t; - typedef Kokkos::View> single_view_t; - typedef Kokkos::View> single_view_host_t; - - typedef Kokkos::RangePolicy my_exec_space; - - typedef Kokkos::RangePolicy range_policy_t ; - typedef Kokkos::TeamPolicy team_policy_t ; - typedef typename team_policy_t::member_type team_member_t ; - - typedef nnz_lno_t LO; - - RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) - : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) - {} - - nnz_lno_t numRows; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - - template - struct MaxDegreeFunctor - { - typedef typename std::remove_cv::type size_type; - MaxDegreeFunctor(Rowmap& rowmap_) : r(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, size_type& lmax) const - { - size_type ideg = r(i + 1) - r(i); - if(ideg > lmax) - lmax = ideg; - } - Rowmap r; - }; - - //simple parallel reduction to find max degree in graph - size_type find_max_degree() - { - size_type maxDeg = 0; - Kokkos::parallel_reduce(range_policy_t(0, numRows), MaxDegreeFunctor(rowmap), Kokkos::Max(maxDeg)); - //max degree should be computed as 
an offset_t, - //but must fit in a nnz_lno_t - return maxDeg; - } - - //radix sort keys according to their corresponding values ascending. - //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting - template - KOKKOS_INLINE_FUNCTION static void - radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) - { - if(n <= 1) - return; - //sort 4 bits at a time - KeyType mask = 0xF; - bool inAux = false; - //maskPos counts the low bit index of mask (0, 4, 8, ...) - IndexType maskPos = 0; - IndexType sortBits = 0; - KeyType minKey = Kokkos::ArithTraits::max(); - KeyType maxKey = 0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lminkey) - { - if(keys[i] < lminkey) - lminkey = keys[i]; - }, Kokkos::Min(minKey)); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lmaxkey) - { - if(keys[i] > lmaxkey) - lmaxkey = keys[i]; - }, Kokkos::Max(maxKey)); - //apply a bias so that key range always starts at 0 - //also invert key values here for a descending sort - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - keys[i] -= minKey; - }); - KeyType upperBound = maxKey - minKey; - while(upperBound) - { - upperBound >>= 1; - sortBits++; - } - for(IndexType s = 0; s < (sortBits + 3) / 4; s++) - { - //Count the number of elements in each bucket - IndexType count[16] = {0}; - IndexType offset[17]; - if(!inAux) - { - for(IndexType i = 0; i < n; i++) - { - count[(keys[i] & mask) >> maskPos]++; - } - } - else - { - for(IndexType i = 0; i < n; i++) - { - count[(keysAux[i] & mask) >> maskPos]++; - } - } - offset[0] = 0; - //get offset as the prefix sum for count - for(IndexType i = 0; i < 16; i++) - { - offset[i + 1] = offset[i] + count[i]; - } - //now for each element in [lo, hi), move it to its offset in the other buffer - //this branch should be ok because whichBuf is the same on all threads - if(!inAux) - { - //copy from *Over to *Aux - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keys[i] & mask) >> maskPos; - keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; - valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; - count[bucket]--; - } - } - else - { - //copy from *Aux to *Over - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keysAux[i] & mask) >> maskPos; - keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; - values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; - count[bucket]--; - } - } - inAux = !inAux; - mask = mask << 4; - maskPos += 4; - } - //move keys/values back from aux if they are currently in aux, - //and remove bias - if(inAux) - { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - //TODO: when everything works, is safe to remove next line - //since keys (BFS visit scores) will never be needed again - keys[i] = keysAux[i]; - values[i] = valuesAux[i]; - }); - } - } - - //Functor that does breadth-first search on a sparse graph. 
- struct BfsFunctor - { - typedef Kokkos::View> WorkView; - - BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) - : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - const nnz_lno_t QUEUED = NOT_VISITED - 1; - int nthreads = mem.team_size(); - nnz_lno_t tid = mem.team_rank(); - auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); - //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively - //active, next and visitCounter are thread-local, but always kept consistent across threads - int active = 0; - int next = 1; - nnz_lno_t visitCounter = 0; - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - workQueue(active, 0) = start; - visit(start) = QUEUED; - }); - nnz_lno_t activeQSize = 1; - nnz_lno_t nextQSize = 0; - //KK create_reverse_map() expects incoming values to start at 1 - nnz_lno_t level = 1; - //do this until all nodes have been visited and added to a level - while(visitCounter < numRows) - { - mem.team_barrier(); - //each thread works on a contiguous block of nodes in queue (for locality) - //compute in size_t to avoid possible 32-bit overflow - nnz_lno_t workStart = tid * activeQSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; - //the maximum work batch size (among all threads) - //the following loop contains barriers so all threads must iterate same # of times - nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; - for(nnz_lno_t loop = 0; loop < maxBatch; loop++) - { - //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) - bool busy = loop < workEnd - workStart; - nnz_lno_t neiCount = 0; - nnz_lno_t process = LNO_MAX; - if(busy) - { - process = workQueue(active, workStart + loop); - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - //build a list of all non-visited neighbors - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t col = colinds(j); - //use atomic here to guarantee neighbors are added to neighborList exactly once - if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) - { - //this thread is the first to see that col needs to be queued - neighborList(neiCount) = col; - neiCount++; - } - } - } - threadNeighborCounts(tid) = neiCount; - mem.team_barrier(); - size_type queueUpdateOffset = 0; - for(nnz_lno_t i = 0; i < tid; i++) - { - queueUpdateOffset += threadNeighborCounts(i); - } - //write out all updates to next queue in parallel - if(busy) - { - nnz_lno_t nextQueueIter = 0; - for(nnz_lno_t i = 0; i < neiCount; i++) - { - nnz_lno_t toQueue = neighborList(i); - visit(toQueue) = QUEUED; - workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; - nextQueueIter++; - } - //assign level to to process - visit(process) = level; - } - nnz_lno_t totalAdded = 0; - for(nnz_lno_t i = 0; i < nthreads; i++) - { - totalAdded += threadNeighborCounts(i); - } - nextQSize += totalAdded; - mem.team_barrier(); - } - 
//swap queue buffers - active = next; - next = 1 - next; - //all threads have a consistent value of qSize here. - //update visitCounter in preparation for next frontier - visitCounter += activeQSize; - activeQSize = nextQSize; - nextQSize = 0; - if(visitCounter < numRows && activeQSize == 0) - { - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - //Some nodes are unreachable from start (graph not connected) - //Find an unvisited node to resume BFS - for(nnz_lno_t search = numRows - 1; search >= 0; search--) - { - if(visit(search) == NOT_VISITED) - { - workQueue(active, 0) = search; - visit(search) = QUEUED; - break; - } - } - }); - activeQSize = 1; - } - level++; - } - Kokkos::single(Kokkos::PerTeam(mem), - [=] - { - numLevels() = level - 1; - }); - } - - WorkView workQueue; - WorkView scratch; - nnz_view_t visit; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - single_view_t numLevels; - nnz_view_t threadNeighborCounts; - nnz_lno_t start; - nnz_lno_t numRows; - }; - - //Parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins. - //Returns the total number of levels, and sets xadj, adj and maxDeg. - nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) - { - //need to know maximum degree to allocate scratch space for threads - maxDeg = find_max_degree(); - //view for storing the visit timestamps - nnz_view_t visit("BFS visited nodes", numRows); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - KokkosBlas::fill(visit, NOT_VISITED); - //the visit queue - //one of q1,q2 is active at a time and holds the nodes to process in next BFS level - //elements which are LNO_MAX are just placeholders (nothing to process) - Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); - nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); - single_view_t numLevels("# of BFS levels"); - single_view_host_t numLevelsHost("# of BFS levels"); - Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); - Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); - Kokkos::deep_copy(numLevelsHost, numLevels); - //now that level structure has been computed, construct xadj/adj - KokkosKernels::Impl::create_reverse_map - (numRows, numLevelsHost(), visit, xadj, adj); - return numLevelsHost(); - } - - struct CuthillMcKeeFunctor - { - typedef Kokkos::View> ScoreView; - - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) - : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - { - numRows = rowmap.extent(0) - 1; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - int tid = mem.team_rank(); - int nthreads = mem.team_size(); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - nnz_lno_t visitCounter = 0; - for(nnz_lno_t level = 0; level < numLevels; level++) - { - //iterate over vertices in this level and compute - //min predecessors (minimum-labeled vertices from 
previous level) - nnz_lno_t levelOffset = xadj(level); - nnz_lno_t levelSize = xadj(level + 1) - levelOffset; - //compute as offset_t to avoid overflow, but the upper bound on - //the scores is approx. numRows * maxDegree, which should be representable - nnz_lno_t workStart = tid * levelSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - nnz_lno_t minNeighbor = LNO_MAX; - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t neighbor = colinds(j); - if(neighbor < numRows) - { - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; - } - } - scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); - } - mem.team_barrier(); - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - radixSortKeysAndValues - (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); - }); - mem.team_barrier(); - //label all vertices (which are now in label order within their level) - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - //visit counter increases with levels, so flip the range for the "reverse" in RCM - visit(process) = visitCounter + i; - } - visitCounter += levelSize; - } - } - - nnz_lno_t numRows; - nnz_lno_t numLevels; - nnz_lno_t maxDegree; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - ScoreView scores; - ScoreView scoresAux; - nnz_view_t visit; - //The levels, stored in CRS format. - //xadj stores offsets for each level, and adj stores the rows in each level. - nnz_view_t xadj; - nnz_view_t adj; - nnz_view_t adjAux; - }; - - //Does the reversing in "reverse Cuthill-McKee") - struct OrderReverseFunctor - { - OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) - : visit(visit_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - visit(i) = numRows - visit(i) - 1; - } - nnz_view_t visit; - nnz_lno_t numRows; - }; - - //breadth-first search, producing a reverse Cuthill-McKee ordering - nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) - { - size_type nthreads = MyExecSpace::concurrency(); - if(nthreads > 64) - nthreads = 64; - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - { - nthreads = 256; - } - #endif - nnz_view_t xadj, adj; - nnz_lno_t maxDegree = 0; - //parallel_bfs will compute maxDegree - auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - nnz_lno_t maxLevelSize = 0; - Kokkos::parallel_reduce(range_policy_t(0, numLevels), MaxDegreeFunctor(xadj), Kokkos::Max(maxLevelSize)); - //visit (to be returned) contains the RCM numberings of each row - nnz_view_t visit("RCM labels", numRows); - //Populate visit wth LNO_MAX so that the "min-labeled neighbor" - //is always a node in the previous level - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - KokkosBlas::fill(visit, LNO_MAX); - //the "score" of a node is a single value that provides an ordering equivalent - //to sorting by min predecessor and then by min degree - //reduce nthreads to be a power of 2 - Kokkos::View> scores("RCM scores for sorting", maxLevelSize); - Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - nnz_view_t adjAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - Kokkos::parallel_for(team_policy_t(1, 
nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(visit, numRows)); - return visit; - } - - template - struct MinDegreeRowFunctor - { - typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const - { - size_type ideg = rowmap(i + 1) - rowmap(i); - if(ideg < lval.val) - { - lval.val = ideg; - lval.loc = i; - } - } - const_lno_row_view_t rowmap; - }; - - //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) - struct OrderToClusterFunctor - { - OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) - : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - vertClusters(i) = ordering(i) / clusterSize; - } - - const nnz_view_t ordering; - nnz_view_t vertClusters; - nnz_lno_t clusterSize; - }; - - //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS - nnz_lno_t find_peripheral() - { - typedef Kokkos::MinLoc MinLocReducer; - typedef typename MinLocReducer::value_type MinLocVal; - MinLocVal v; - Kokkos::parallel_reduce(range_policy_t(0, numRows), - MinDegreeRowFunctor(rowmap), MinLocReducer(v)); - return v.loc; - } - - nnz_view_t cuthill_mckee() - { - nnz_lno_t periph = find_peripheral(); - //run Cuthill-McKee BFS from periph - auto ordering = parallel_cuthill_mckee(periph); - return ordering; - } - - nnz_view_t rcm() - { - nnz_view_t cm = cuthill_mckee(); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(cm, numRows)); - return cm; - } - - nnz_view_t cm_cluster(nnz_lno_t clusterSize) - { - nnz_view_t cm = cuthill_mckee(); - nnz_view_t vertClusters("Vert to cluster", numRows); - OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); - Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); - return vertClusters; - } -}; - template struct BalloonClustering { diff --git a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp index 6ed2d1be38..420e622c8f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp @@ -47,6 +47,7 @@ //#define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "KokkosKernels_Controls.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" #endif @@ -78,10 +79,10 @@ namespace Impl{ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - typedef typename ain_row_index_view_type::device_type device1; - typedef typename ain_nonzero_index_view_type::device_type device2; - - typedef typename KernelHandle::nnz_lno_t idx; + using device1 = typename ain_row_index_view_type::device_type; + using device2 = typename ain_nonzero_index_view_type::device_type; + using idx = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; //TODO this is not correct, check memory space. 
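For reference, a small Kokkos sketch of the minimum-degree search pattern used by the find_peripheral() routine removed above: a parallel_reduce with a MinLoc reducer returns both the smallest row degree and a row that attains it. Types are simplified for illustration.

#include <Kokkos_Core.hpp>

using Reducer   = Kokkos::MinLoc<size_t, int>;
using MinLocVal = Reducer::value_type;  // has members .val (degree) and .loc (row)

// Return the index of a row with minimal degree, given a CRS rowmap of length numRows + 1.
int min_degree_row(const Kokkos::View<const size_t*>& rowmap, int numRows) {
  MinLocVal result;
  Kokkos::parallel_reduce(
      "find_min_degree_row", numRows,
      KOKKOS_LAMBDA(const int i, MinLocVal& lval) {
        const size_t deg = rowmap(i + 1) - rowmap(i);
        if (deg < lval.val) {
          lval.val = deg;
          lval.loc = i;
        }
      },
      Reducer(result));
  return result.loc;
}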
@@ -94,11 +95,14 @@ namespace Impl{ //return; } - if (std::is_same::value){ +#if defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION) + throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); +#else - const idx *a_xadj = (int *)row_mapA.data(); - const idx *b_xadj = (int *)row_mapB.data(); - idx *c_xadj = (int *)row_mapC.data(); + if (std::is_same::value && std::is_same::value){ + const idx *a_xadj = (const idx*) row_mapA.data(); + const idx *b_xadj = (const idx*) row_mapB.data(); + idx *c_xadj = (idx*) row_mapC.data(); const idx *a_adj = entriesA.data(); const idx *b_adj = entriesB.data(); @@ -143,6 +147,7 @@ namespace Impl{ throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; @@ -186,6 +191,9 @@ namespace Impl{ cin_nonzero_value_view_type valuesC){ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION) + throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); +#else typedef typename KernelHandle::nnz_lno_t idx; typedef typename KernelHandle::nnz_scalar_t value_type; @@ -289,6 +297,7 @@ namespace Impl{ throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index a8a539ef10..06a3153ad9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -787,9 +787,35 @@ class KokkosSPGEMM{ typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType my_exec_space); + //Utility to compute the number of pool chunks for L2 hashmap accumulators. + //Uses free memory query for accelerators/GPUs but assumes infinite available host memory. 
+ // + //chunk_bytes: bytes in each chunk + //ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) + { + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + size_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; + } }; - } } #include "KokkosSparse_spgemm_imp_outer.hpp" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 5d98e28b98..35f00201a2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -227,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } @@ -769,6 +765,7 @@ bool KokkosSPGEMM { //get the execution space type. KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. @@ -799,7 +796,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -812,8 +809,9 @@ bool KokkosSPGEMM } //if compressing in single step, allocate the memory as upperbound. - //TODO: two step is not there for cuda. - if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + //TODO: two step is not there for GPU. 
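For reference, the sizing policy of the compute_num_pool_chunks helper introduced above, restated as a standalone sketch: on a GPU, if the ideal pool does not fit, cap it at half of the free device memory and round the chunk count down to a power of two. The free-memory value is assumed to come from a query such as kk_get_free_total_memory; the function name below is illustrative.

#include <cstddef>

// chunk_bytes: size of one chunk; ideal_num_chunks: one chunk per thread/team.
size_t pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks,
                   size_t free_bytes, bool on_gpu) {
  if (!on_gpu)
    return ideal_num_chunks;  // assume ample host memory
  size_t num_chunks = ideal_num_chunks;
  if (num_chunks * chunk_bytes > free_bytes / 2)
    num_chunks = (free_bytes / 2) / chunk_bytes;  // use at most half the free memory
  size_t po2 = 1;
  while (po2 * 2 < num_chunks)
    po2 *= 2;  // largest power of two below num_chunks
  return po2;
}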
+ + if (compress_in_single_step || exec_gpu) { out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -842,7 +840,8 @@ bool KokkosSPGEMM timer1.reset(); //bool compression_applied = false; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -861,30 +860,9 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - size_t num_chunks = concurrency / suggested_vector_size; + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 595e216700..8fdf276e61 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -99,7 +99,7 @@ void KokkosSPGEMM Kokkos::Impl::Timer timer1; auto new_row_mapB_begin = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(0), b_row_cnt)); auto new_row_mapB_end = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(1), b_row_cnt + 1)); - row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("origianal row flops"), a_row_cnt); + row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("original row flops"), a_row_cnt); //get maximum row flops. maxNumRoughZeros = this->getMaxRoughRowNNZ(a_row_cnt, row_mapA, entriesA, @@ -121,13 +121,11 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); - //compress in single step if it is cuda execution space. - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { + //compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; - } //compressed B fields. 
row_lno_temp_work_view_t new_row_mapB(Kokkos::ViewAllocateWithoutInitializing("new row map"), n+1); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 5303a46c40..a5fc298e2c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -221,19 +221,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -242,6 +234,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -662,7 +658,7 @@ struct KokkosSPGEMM if (c_row_size > max_first_level_hash_size){ { while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } @@ -1252,7 +1248,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //then chose the best method and parameters. 
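For reference, a sketch of the chunk-acquisition pattern appearing in the hash-map accumulator kernels above: one thread per team takes a chunk from the shared memory pool and Kokkos::single broadcasts the pointer to the rest of the team. pool_t and member_t are placeholders for this illustration; pool_t is assumed to return int* from allocate_chunk(index), in the spirit of the UniformMemoryPool used here.

#include <Kokkos_Core.hpp>

template <typename member_t, typename pool_t>
KOKKOS_INLINE_FUNCTION int* acquire_chunk(const member_t& team, const pool_t& pool, int owner) {
  int* chunk = nullptr;
  while (chunk == nullptr) {  // retry until the pool hands out a chunk
    // One thread allocates; the third argument is broadcast to all team members.
    Kokkos::single(
        Kokkos::PerTeam(team),
        [&](int*& ptr) { ptr = pool.allocate_chunk(owner); },
        chunk);
  }
  return chunk;
}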
size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1382,7 +1378,7 @@ void //required memory for L2 - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1425,29 +1421,9 @@ void chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL @@ -1463,7 +1439,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1513,7 +1489,7 @@ void } timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; @@ -1625,7 +1601,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1675,7 +1651,7 @@ void } timer1.reset(); - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 3ba3d4e443..e3a4f492a6 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -130,19 +130,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - 
#endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -151,6 +143,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -489,7 +485,7 @@ struct KokkosSPGEMM // // Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp // -// if Cuda enabled : +// if GPU: // "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. GPUTag // // else : @@ -527,7 +523,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 36afa46eef..f6f4e8e3a8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -197,19 +197,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -218,6 +210,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -780,19 +776,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -801,6 +789,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1509,13 +1501,14 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; @@ 
-1527,7 +1520,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1538,7 +1531,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu){ //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1649,33 +1642,13 @@ void KokkosSPGEMM } //initizalize value for the mem pool - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1721,8 +1694,8 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); + if (exec_gpu) { + Kokkos::parallel_for("StructureC_NC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { if (current_spgemm_algorithm == SPGEMM_KK_DENSE){ @@ -1807,8 +1780,9 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1816,7 +1790,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1826,7 +1800,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1837,7 +1811,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { //then chose the best method and parameters. 
current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1967,7 +1941,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && !exec_gpu) { nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1979,34 +1953,13 @@ void KokkosSPGEMM std::cout << "\tDense Acc - COLS:" << col_size << " max_row_size:" << max_row_size << std::endl; } } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; - KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -2051,7 +2004,7 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2587,19 +2540,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -2608,6 +2553,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index c53f8b461c..c06d4c4cb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -227,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1330,17 +1326,17 @@ void KokkosSPGEMM ){ bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; - int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -1420,31 +1416,14 @@ void KokkosSPGEMM } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << @@ -1494,8 +1473,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1690,6 +1668,7 @@ void KokkosSPGEMM b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>:: KokkosSPGEMM_symbolic_triangle_setup(){ + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); @@ -1741,7 +1720,7 @@ void KokkosSPGEMM } size_type bnnz = set_index_entries.extent(0); - if (this->MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kkp_reduce_diff_view (this->b_row_cnt, p_rowmapB_begins, p_rowmapB_ends, bnnz); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index 119e6cddc6..6a9b67c0b2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -202,19 +202,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -223,6 +215,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case 
KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -900,12 +896,13 @@ void KokkosSPGEMM const int num_left_side_nnz_per_row = 2; const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -966,31 +963,13 @@ void KokkosSPGEMM pool_init_val = 0; } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1040,9 +1019,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 70b1d05391..d4c2c98a6f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -219,6 +219,10 @@ namespace KokkosSparse{ #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -816,7 +820,7 @@ namespace KokkosSparse{ // Initialize hashmaps if (c_row_size > max_first_level_hash_size){ while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } @@ -1181,6 +1185,8 @@ namespace KokkosSparse{ dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + using pool_memory_space = KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t>; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPARSE ACC MODE" << std::endl; } @@ -1238,7 +1244,7 @@ namespace KokkosSparse{ // Choose the SpGEMM algorithm and corresponding parameters if (this->spgemm_algorithm == SPGEMM_KK || this->spgemm_algorithm == SPGEMM_KK_LP){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1310,7 +1316,7 @@ namespace KokkosSparse{ } } } - // If CUDA is not enabled, we decide whether we want to use a sparse or a dense acumulator + // If non-GPU, we decide whether we want to use a sparse or a dense acumulator else { bool run_dense = false; @@ -1364,7 +1370,7 @@ namespace KokkosSparse{ // Compute the memory pool size - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; } @@ -1395,26 +1401,9 @@ namespace KokkosSparse{ chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + 
(chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz @@ -1428,11 +1417,10 @@ namespace KokkosSparse{ // Allocate the memory pool KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; Kokkos::Impl::Timer timer; pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); MyExecSpace().fence(); @@ -1470,7 +1458,7 @@ namespace KokkosSparse{ } timer.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_jacobi_sparseacc SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3ea7d150b6..7b91f95e09 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -48,6 +48,7 @@ #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" @@ -80,7 +81,6 @@ struct GetCoeffView,DeviceType> { template struct SPMV_Transpose_Functor { typedef typename AMatrix::execution_space execution_space; @@ -95,55 +95,57 @@ struct SPMV_Transpose_Functor { const coefficient_type alpha; AMatrix m_A; XVector m_x; - const coefficient_type beta; YVector m_y; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), m_A (m_A_), m_x (m_x_), - beta (beta_), m_y (m_y_), - rows_per_thread (rows_per_thread_) + const YVector& m_y_) : + alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type iRow) const + { + const auto row = m_A.rowConst (iRow); + const ordinal_type row_length = row.length; + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; - -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -184,10 +186,38 @@ struct SPMV_Functor { "YVector must be a rank 1 View."); } + KOKKOS_INLINE_FUNCTION + void operator() (const ordinal_type iRow) const + { + using y_value_type = typename YVector::non_const_value_type; + if (iRow >= m_A.numRows ()) { + return; + } + const KokkosSparse::SparseRowViewConst row = m_A.rowConst(iRow); + const ordinal_type row_length = static_cast (row.length); + y_value_type sum = 0; + + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry)); + } + + sum *= alpha; + + if (dobeta == 0) { + m_y(iRow) = sum ; + } else { + m_y(iRow) = beta * m_y(iRow) + sum; + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - typedef typename YVector::non_const_value_type y_value_type; + using y_value_type = typename YVector::non_const_value_type; Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& loop) { @@ -226,19 +256,27 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th if(nnz_per_row < 1) nnz_per_row = 1; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(vector_length < 1) { vector_length = 1; - while(vector_length<32 && vector_length*6 < nnz_per_row) + while(vector_length < max_vector_length && vector_length * 6 < nnz_per_row) vector_length*=2; } // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && nnz > 5000000 ) { rows_per_thread = 256; @@ -247,14 +285,12 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 256/vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -297,21 +333,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, ((int) A.graph.row_block_offsets.extent(0) == (int) omp_get_max_threads()+1) && (((uintptr_t)(const void*)(x.data())%64)==0) && (((uintptr_t)(const void*)(y.data())%64)==0) ) { + //Note BMK: this case is typically not called in practice even for OpenMP, since + //it requires row_block_offsets to have been computed in the graph. 
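// Illustrative sketch, not part of the patch: spmv_launch_parameters above now caps
// the automatically chosen vector length at the SIMD width of the backend: 32 lanes
// for a CUDA warp, 64 for a HIP wavefront, 1 otherwise. The same selection written as
// a compile-time helper (the name is illustrative) would be:
#include <Kokkos_Core.hpp>
#include <type_traits>

template <typename ExecSpace>
constexpr int max_vector_length_sketch() {
  return
#if defined(KOKKOS_ENABLE_CUDA)
      std::is_same<ExecSpace, Kokkos::Cuda>::value ? 32 :
#endif
#if defined(KOKKOS_ENABLE_HIP)
      std::is_same<ExecSpace, Kokkos::Experimental::HIP>::value ? 64 :
#endif
      1;
}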
spmv_raw_openmp_no_transpose(alpha,A,x,beta,y); return; } #endif - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. - if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} - if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} - if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule if(controls.isParameter("schedule")) { @@ -321,26 +350,45 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, use_static_schedule = true; } } - - int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); - int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; - - SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); - - if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); - } else { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + if(use_teams) { + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + // Note on 03/24/20, lbv: We can use the controls + // here to allow the user to pass in some tunning + // parameters. 
+ if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} + if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} + if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + + int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); + int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; + + SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); + + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } else { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } + } + else { + SPMV_Functor func (alpha,A,x,beta,y,1); + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); } } @@ -356,7 +404,9 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using execution_space = typename AMatrix::execution_space; if (A.numRows () <= static_cast (0)) { return; @@ -368,33 +418,46 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. 
- const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - const int rows_per_thread = RowsPerThread (NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy::team_size_recommended (op, vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, y); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > + ( 0 , nrow ) , op ); + } } template::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + + if (doalpha != 1) { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (alpha * val * m_x(iRow, k))); + } + } else { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (val * m_x(iRow, k))); + } + } + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -486,15 +581,8 @@ struct SPMV_MV_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -518,8 +606,8 @@ struct SPMV_MV_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; @@ -531,7 +619,7 @@ template struct SPMV_MV_LayoutLeft_Functor { typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -546,21 +634,23 @@ struct SPMV_MV_LayoutLeft_Functor { //! The number of columns in the input and output MultiVectors. ordinal_type n; ordinal_type rows_per_thread; + int vector_length; SPMV_MV_LayoutLeft_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const ordinal_type rows_per_thread_, + int vector_length_) : alpha (alpha_), m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + rows_per_thread (rows_per_thread_), vector_length(vector_length_) {} template KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& /* dev */, const ordinal_type& iRow, const ordinal_type& kk) const + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const { y_value_type sum[UNROLL]; @@ -581,142 +671,137 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { sum[k] += val * m_x(ind, kk + k); } - } + }); if (doalpha == -1) { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } } else { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) + sum[ii] = sumt; + else + sum[ii] = sumt * alpha; } } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + } + + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } + for (int k = 0; k < UNROLL; ++k) { + if(doalpha == 1) + sum[k] += val * m_x(ind, kk + k); + else if(doalpha == -1) + sum[k] -= val * m_x(ind, kk + k); + else + sum[k] += alpha * val * m_x(ind, kk + k); } } + + if (dobeta == 0) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = sum[k]; + } else if (dobeta == 1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + } else if (dobeta == -1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + } else { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + } } KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& /* dev */, const ordinal_type& iRow) const + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - const auto row = m_A.rowConst (iRow); // The correct type of iEntry is ordinal_type, the type of the @@ -724,48 +809,17 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + Kokkos::single(Kokkos::PerThread(dev), + [&]() { if (doalpha == -1) { sum = -sum; @@ -782,9 +836,144 @@ struct SPMV_MV_LayoutLeft_Functor { } else { m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } + }); + } + + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. 
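// Illustrative sketch, not part of the patch: strip_mine_1 above trades the manual
// __CUDA_ARCH__ / shfl_down warp reduction for Kokkos::parallel_reduce over a
// ThreadVectorRange, followed by Kokkos::single(PerThread) for the write-back, which
// is portable across backends. In isolation (names illustrative, device-accessible
// pointers assumed), the pattern is:
#include <Kokkos_Core.hpp>

using team_member_sketch = Kokkos::TeamPolicy<>::member_type;

KOKKOS_INLINE_FUNCTION
void row_dot_and_store_sketch(const team_member_sketch& dev, const int row_length,
                              const double* vals, const double* x, double& y_out) {
  double sum;
  // Each vector lane accumulates a partial sum; the reduction combines them and
  // broadcasts the result, so 'sum' is complete on every lane of this thread.
  Kokkos::parallel_reduce(
      Kokkos::ThreadVectorRange(dev, row_length),
      [&](const int i, double& lsum) { lsum += vals[i] * x[i]; }, sum);
  // Only one lane per thread writes the result back.
  Kokkos::single(Kokkos::PerThread(dev), [&]() { y_out = sum; });
}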
+ + y_value_type sum = y_value_type(); + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry),0); + } + if (doalpha == -1) { + sum = -sum; + } else if (doalpha != 1) { + sum *= alpha; + } + + if (dobeta == 0) { + m_y(iRow, 0) = sum ; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum ; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } } + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type& iRow) const + { + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. + ordinal_type kk = 0; + +#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } +#else +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(iRow, kk); + break; + + case 14: + strip_mine<14>(iRow, kk); + break; + + case 13: + strip_mine<13>(iRow, kk); + break; + + case 12: + strip_mine<12>(iRow, kk); + break; + + case 11: + strip_mine<11>(iRow, kk); + break; + + case 10: + strip_mine<10>(iRow, kk); + break; + + case 9: + strip_mine<9>(iRow, kk); + break; + + case 8: + strip_mine<8>(iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(iRow, kk); + break; + + case 6: + strip_mine<6>(iRow, kk); + break; + + case 5: + strip_mine<5>(iRow, kk); + break; + + case 4: + strip_mine<4>(iRow, kk); + break; + + case 3: + strip_mine<3>(iRow, kk); + break; + + case 2: + strip_mine<2>(iRow, kk); + break; + + case 1: + strip_mine_1(iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const @@ -812,91 +1001,92 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; } for(; kk + 8 <= n; kk += 8) strip_mine<8>(dev, iRow, kk); - if(kk < n) + if(kk < n) { switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; 
- - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; } + } +#endif // KOKKOS_FAST_COMPILE } - }; + } +}; template (0)) { return; @@ -924,39 +1115,38 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a return; } else { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + ordinal_type vector_length = 1; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length *= 2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space >( 0, nrow ), op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -965,24 +1155,20 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } } @@ -1000,7 +1186,8 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -1013,39 +1200,40 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph } if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + ordinal_type vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + //Transpose functor uses atomics which can't be vectorized on CPU + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length*=2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - typename AMatrix::const_ordinal_type nrow = A.numRows(); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, beta, y); + + const ordinal_type nrow = A.numRows(); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::RangePolicy < typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1053,24 +1241,21 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph 2, 2, conjugate, SizeType> OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + OpType op (alpha, A, x, beta, y); + + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } @@ -1135,7 +1320,6 @@ spmv_alpha_mv (const char mode[], } } -} -} +}} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index a4f1c07258..72c8a969fe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -47,7 +47,6 @@ namespace Impl { #ifdef KOKKOS_ENABLE_OPENMP template void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatrix A, XVector x, typename YVector::const_value_type& s_b, YVector y) { - typedef typename YVector::non_const_value_type value_type; typedef typename AMatrix::ordinal_type ordinal_type; typedef typename AMatrix::non_const_size_type size_type; diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp index 9d1f44bd2a..b678142dbe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -257,8 +257,6 @@ struct SPMV < AT, AO, AD, AM, AS, { typedef Kokkos::Details::ArithTraits KAT; - typedef Kokkos::Details::ArithTraits KAT; - if (alpha == KAT::zero ()) { if (beta != KAT::one ()) { KokkosBlas::scal (y, beta, y); diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 51d2189c5c..3179a0cc31 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #include "Kokkos_InnerProductSpaceTraits.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -91,12 +92,13 @@ struct SPMV_Struct_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } @@ -104,15 +106,8 @@ struct SPMV_Struct_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : @@ -120,8 +115,8 @@ struct SPMV_Struct_Transpose_Functor { const ordinal_type ind = row.colidx(iEntry); Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -302,7 +297,7 @@ struct SPMV_Struct_Functor { }); dev.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&] (const ordinal_type& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),[&] (const ordinal_type& loop) { const ordinal_type interiorIdx = static_cast ( dev.league_rank() ) * rows_per_team + loop; if(interiorIdx >= numInterior) { return; } @@ -665,11 +660,9 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && numInterior*nnz_per_row > 5000000 ) { rows_per_thread = 256; @@ -678,14 +671,12 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 128 / vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -903,27 +894,19 @@ struct SPMV_MV_Struct_Transpose_Functor { operator() (const team_member& dev) const { // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow represents a row of the matrix, so its correct type is - // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { + const ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } - const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -947,429 +930,334 @@ struct SPMV_MV_Struct_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; - template - struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::non_const_value_type A_value_type; - typedef typename YVector::non_const_value_type y_value_type; - typedef typename Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - typedef typename YVector::non_const_value_type coefficient_type; - - const coefficient_type alpha; - AMatrix m_A; - XVector m_x; - const coefficient_type beta; - YVector m_y; - //! The number of columns in the input and output MultiVectors. - ordinal_type n; - ordinal_type rows_per_thread; - - SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, - const AMatrix& m_A_, - const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) - {} - - template - KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const - { - y_value_type sum[UNROLL]; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero (); - } +template +struct SPMV_MV_Struct_LayoutLeft_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_value_type A_value_type; + typedef typename YVector::non_const_value_type y_value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef typename YVector::non_const_value_type coefficient_type; - const auto row = m_A.rowConst (iRow); + const coefficient_type alpha; + AMatrix m_A; + XVector m_x; + const coefficient_type beta; + YVector m_y; + //! The number of columns in the input and output MultiVectors. + ordinal_type n; + ordinal_type rows_per_thread; + int vector_length; + + SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, + const AMatrix& m_A_, + const XVector& m_x_, + const coefficient_type& beta_, + const YVector& m_y_, + const ordinal_type rows_per_thread_, + int vector_length_) : + alpha (alpha_), + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), + rows_per_thread (rows_per_thread_), vector_length(vector_length_) + {} - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. 
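// [Editor's note] The hunks around this point all make the same change: raw
// CUDA constructs (threadIdx.x / blockDim.x strided loops and shfl_down
// reductions guarded by __CUDA_ARCH__) are replaced by Kokkos hierarchical
// parallelism, which is what lets these functors run on HIP as well as CUDA.
// A minimal stand-alone sketch of that pattern follows; the view names and
// the row-sum computation are hypothetical and not part of the patch.
#include <Kokkos_Core.hpp>

void team_vector_sketch(Kokkos::View<const double**> a,
                        Kokkos::View<double*> row_sums,
                        int rows_per_team) {
  using policy_type = Kokkos::TeamPolicy<>;
  using member_type = policy_type::member_type;
  const int nrows  = static_cast<int>(a.extent(0));
  const int ncols  = static_cast<int>(a.extent(1));
  const int nteams = (nrows + rows_per_team - 1) / rows_per_team;
  Kokkos::parallel_for("row_sums_sketch", policy_type(nteams, Kokkos::AUTO),
    KOKKOS_LAMBDA(const member_type& dev) {
      // Threads of a team split the team's block of rows...
      Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team),
        [&](const int loop) {
          const int iRow = dev.league_rank() * rows_per_team + loop;
          if (iRow >= nrows) return;
          double sum = 0.0;
          // ...and the vector lanes of each thread cooperate on one row;
          // the parallel_reduce replaces the explicit shfl_down reductions
          // removed in the hunks above.
          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, ncols),
            [&](const int j, double& lsum) { lsum += a(iRow, j); }, sum);
          Kokkos::single(Kokkos::PerThread(dev),
            [&]() { row_sums(iRow) = sum; });
        });
    });
}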
+ template + KOKKOS_INLINE_FUNCTION void + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - const ordinal_type ind = row.colidx(iEntry); + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] += val * m_x(ind, kk + k); - } - } - - if (doalpha == -1) { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; - } + for (int k = 0; k < UNROLL; ++k) { + sum[k] += val * m_x(ind, kk + k); } - else { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + }); + + if (doalpha == -1) { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type , y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane + } + } + else { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) sum[ii] = sumt; - } + else + sum[ii] = sumt * alpha; } - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } - } } - KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + Kokkos::single(Kokkos::PerThread(dev), + [&]() { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - - const auto row = m_A.rowConst (iRow); + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) += sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + }); + } - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? 
- Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha == -1) { - sum = -sum; - } else if (doalpha * doalpha != 1) { - sum *= alpha; - } + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + + Kokkos::single(Kokkos::PerThread(dev), + [&]() + { + if (doalpha == -1) { + sum = -sum; + } else if (doalpha * doalpha != 1) { + sum *= alpha; + } - if (dobeta == 0) { - m_y(iRow, 0) = sum ; - } else if (dobeta == 1) { - m_y(iRow, 0) += sum ; - } else if (dobeta == -1) { - m_y(iRow, 0) = -m_y(iRow, 0) + sum; - } else { - m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; - } - } - } + if (dobeta == 0) { + m_y(iRow, 0) = sum; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + }); + } - KOKKOS_INLINE_FUNCTION void - operator() (const team_member& dev) const - { - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + KOKKOS_INLINE_FUNCTION void + operator() (const team_member& dev) const + { + for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow indexes over (local) rows of the matrix, so its correct - // type is ordinal_type. + // iRow indexes over (local) rows of the matrix, so its correct + // type is ordinal_type. - const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) - * rows_per_thread + loop; - if (iRow >= m_A.numRows ()) { - return; - } + const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) + * rows_per_thread + loop; + if (iRow >= m_A.numRows ()) { + return; + } - // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it - // needs to have the same type as n. - ordinal_type kk = 0; + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
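// [Editor's note] strip_mine<UNROLL> above processes UNROLL columns of the
// multivector at once, so each matrix entry is loaded once per block of
// columns rather than once per column. The code just below walks the column
// index kk in blocks (4 under KOKKOS_FAST_COMPILE, 8 or 16 otherwise) and
// falls back to smaller blocks for the remainder. A reduced host-only sketch
// of that dispatch; process_block is a hypothetical stand-in for strip_mine.
#include <cstdio>

template <int UNROLL>
void process_block(int row, int first_col) {
  // Would update columns [first_col, first_col + UNROLL) of row 'row'.
  std::printf("row %d: columns [%d, %d)\n", row, first_col, first_col + UNROLL);
}

void process_all_columns(int row, int n) {
  int kk = 0;
  for (; kk + 4 <= n; kk += 4) process_block<4>(row, kk);  // full blocks of 4
  for (; kk < n; ++kk)         process_block<1>(row, kk);  // remaining columns
}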
+ ordinal_type kk = 0; #ifdef KOKKOS_FAST_COMPILE - for (; kk + 4 <= n; kk += 4) { - strip_mine<4>(dev, iRow, kk); - } - for( ; kk < n; ++kk) { - strip_mine<1>(dev, iRow, kk); - } + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } #else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { # else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; } - }; - - - template - static void - spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); } - if (doalpha == 0) { - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; } - return; } - else { - typedef typename AMatrix::size_type size_type; +#endif // KOKKOS_FAST_COMPILE + } + } +}; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. 
Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + template + static void + spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; + + if (A.numRows () <= static_cast (0)) { + return; + } + if (doalpha == 0) { + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } + return; + } + else { + typedef typename AMatrix::size_type size_type; + + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1377,11 +1265,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1389,12 +1273,12 @@ struct SPMV_MV_Struct_Transpose_Functor { #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1402,63 +1286,58 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. 
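// [Editor's note] A compact restatement of the launch heuristic used here:
// vector_length is doubled while 2*3*vector_length <= NNZPerRow (capped at 8),
// and the number of teams is the ceiling of
// nrow / (rows_per_thread * team_size). Stand-alone integer sketch; the
// function names are illustrative only.
#include <cstdint>

inline int pick_vector_length(std::int64_t nnz_per_row) {
  int vector_length = 1;
  while (std::int64_t(vector_length) * 2 * 3 <= nnz_per_row && vector_length < 8)
    vector_length *= 2;
  return vector_length;  // e.g. nnz_per_row = 20 gives 4, nnz_per_row = 100 gives 8
}

inline std::int64_t pick_team_count(std::int64_t nrow, int rows_per_thread, int team_size) {
  const std::int64_t rows_per_team = std::int64_t(rows_per_thread) * team_size;
  return (nrow + rows_per_team - 1) / rows_per_team;  // ceiling division
}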
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); - #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } + if (A.numRows () <= static_cast (0)) { + return; + } - // We need to scale y first ("scaling" by zero just means filling - // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } - if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; + if (doalpha != 0) { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_Transpose_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1466,11 +1345,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. 
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1478,12 +1353,12 @@ struct SPMV_MV_Struct_Transpose_Functor { #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor OpType; + typedef SPMV_MV_Struct_Transpose_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1491,80 +1366,74 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else { - Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); - } + template + static void + spmv_alpha_beta_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + if (mode[0] == NoTranspose[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } - - template - void - spmv_alpha_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; - - if (beta == KAT::zero ()) { - 
spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } + else if (mode[0] == Conjugate[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == Transpose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == ConjugateTranspose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else { + Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); } + } + template + void + spmv_alpha_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename YVector::non_const_value_type coefficient_type; + typedef Kokkos::Details::ArithTraits KAT; + if (beta == KAT::zero ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + } } } diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 25e9844940..623df284ea 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -95,7 +95,7 @@ namespace Impl{ if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); const int* rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); if (std::is_same::value) { @@ -297,7 +297,7 @@ namespace Impl{ int nnz = entries.extent_int(0); const int* rm = !std::is_same::value ? 
sptrsv_handle->get_int_rowmap_ptr() : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); const scalar_type* bv = rhs.data(); scalar_type* xv = lhs.data(); diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a9ffcd282a..271d8b2396 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2464,6 +2464,23 @@ struct ReturnRangePolicyType { } }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 295b46df9b..c069c618e6 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -72,6 +72,7 @@ struct Parameters{ int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; @@ -121,6 +122,7 @@ struct Parameters{ use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8a9306325f..bf86768d16 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -105,6 +105,92 @@ namespace Test { EXPECT_NEAR_KK(h_v1(i), h_v2(i), tol); } } -} + #if defined(KOKKOS_HALF_T_IS_FLOAT) + using halfScalarType = Kokkos::Experimental::half_t; + #endif // KOKKOS_HALF_T_IS_FLOAT + + template + struct SharedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + int C_rows, C_cols, A_cols; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + typedef typename ViewTypeA::value_type ScalarA; + typedef typename ViewTypeB::value_type ScalarB; + typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::Details::ArithTraits APT; + typedef typename APT::mag_type mag_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { + // Give each kokkos thread a vector of A + auto a_vec = A_t ? Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + + // Have all vector lanes perform the dot product + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { + auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + ScalarC ab = ScalarC(0); + for (int k = 0; k < A_cols; k++) { + auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); + auto b = B_c ? 
APT::conj(b_vec(k)) : b_vec(k); + ab += a * b; + } + C(i,j) = beta * C(i,j) + alpha * ab; + }); + }); + } + }; + // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) + template + struct Functor_BatchedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using ScalarC = typename ViewTypeC::value_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + int i = team.league_rank(); + + auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + using SubviewTypeA = decltype(_A); + using SubviewTypeB = decltype(_B); + using SubviewTypeC = decltype(_C); + struct SharedVanillaGEMM vgemm; + vgemm.A_t = A_t; vgemm.B_t = B_t; + vgemm.A_c = A_c; vgemm.B_c = B_c; + vgemm.C_rows = C.extent(1); + vgemm.C_cols = C.extent(2); + vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.A = _A; + vgemm.B = _B; + vgemm.C = _C; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm(team); + } + + inline + void run() { + Kokkos::parallel_for( + "Test::VanillaGEMM", + Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + *this); + } + }; +} #endif diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 5e253a1820..bba54ff6f0 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -63,6 +63,13 @@ #include // typeid (T) #include +#define FAILURE() {printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0;} + +#if 0 +#define TRACE() printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#else +#define TRACE() +#endif namespace { // Whether Kokkos::Details::ArithTraits implements @@ -183,6 +190,7 @@ class ArithTraitsTesterBase { KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // not using this argument int success = 1; @@ -203,7 +211,7 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { printf ("! AT::is_specialized\n"); - success = 0; + FAILURE(); } // It's OK to refer to std::numeric_limits constants in a device @@ -211,11 +219,11 @@ class ArithTraitsTesterBase { // as device functions). if (AT::is_integer != std::numeric_limits::is_integer) { printf ("AT::is_integer not same as numeric_limits\n"); - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { printf ("AT::is_exact not same as numeric_limits\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -224,34 +232,34 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. if (zero + zero != zero) { printf ("0 + 0 != 0\n"); - success = 0; + FAILURE(); } if (zero + one != one) { printf ("0 + 1 != 1\n"); - success = 0; + FAILURE(); } if (one - one != zero) { printf ("1 - 1 != 0\n"); - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). 
if ((one + one) - one != one) { printf ("(1 + 1) - 1 != 1\n"); - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { printf ("AT::abs(0) != 0\n"); - success = 0; + FAILURE(); } if (AT::abs (one) != one) { printf ("AT::abs(1) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_signed && AT::abs (-one) != one) { printf ("AT::is_signed and AT::abs(-1) != 1\n"); - success = 0; + FAILURE(); } // Need enable_if to test whether T can be compared using <=. // However, mag_type should always be comparable using <=. @@ -260,7 +268,7 @@ class ArithTraitsTesterBase { // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { printf ("AT::abs(0) > AT::abs (AT::max ())\n"); - success = 0; + FAILURE(); } dst = dst && success; @@ -312,17 +320,17 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { out << "ArithTraits is not specialized for T" << endl; - success = 0; + FAILURE(); } if (AT::is_integer != std::numeric_limits::is_integer) { out << "AT::is_integer != std::numeric_limits::is_integer" << endl; - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { out << "AT::is_exact != std::numeric_limits::is_exact" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -331,35 +339,35 @@ class ArithTraitsTesterBase { if (zero + zero != zero) { out << "zero + zero != zero" << endl; - success = 0; + FAILURE(); } if (zero + one != one) { out << "zero + one != one" << endl; - success = 0; + FAILURE(); } if (one - one != zero) { out << "one - one != zero" << endl; - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). if ((one + one) - one != one) { out << "(one + one) - one != one" << endl; - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { out << "AT::abs (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::abs (one) != one) { out << "AT::abs (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::is_signed) { if (AT::abs (-one) != one) { out << "AT::abs (-one) != one" << endl; - success = 0; + FAILURE(); } } // Need enable_if to test whether T can be compared using <=. @@ -369,19 +377,19 @@ class ArithTraitsTesterBase { // // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { out << "AT::abs (zero) > AT::abs (AT::max ())" << endl; - success = 0; + FAILURE(); } if (AT::has_infinity) { if (! AT::isInf (AT::infinity())) { out << "AT::isInf (inf) != true" << endl; - success = 0; + FAILURE(); } } if ( ! std::is_same< ScalarType, decltype(AT::infinity()) >::value ) { std::cout << "AT::infinity() return value has wrong type" << endl; - success = 0; + FAILURE(); } // Run the parent class' remaining tests, if any. @@ -462,12 +470,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); //typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (HasTranscendentals::value) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -488,7 +497,7 @@ class ArithTraitsTesterTranscendentalBase : if (HasTranscendentals::value) { out << "HasTranscendentals::value is true" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' @@ -542,12 +551,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! HasTranscendentals::value) { - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -576,20 +586,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (!equal(result,eight)) { printf ("AT::pow(2,3) != 8\n"); - success = 0; + FAILURE(); } } if (!equal(AT::pow (three, zero) , one)) { printf ("AT::pow(3,0) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, one) , three)) { printf ("AT::pow(3,1) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, two) , nine)) { printf ("AT::pow(3,2) != 9\n"); - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. @@ -597,7 +607,7 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (three, three); if (!equal(result , twentySeven)) { printf ("AT::pow(3,3) != 27\n"); - success = 0; + FAILURE(); } } @@ -606,93 +616,93 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (-three, one); if (!equal(result , -three)) { printf ("AT::pow(-3,1) != -3\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (!equal(result , nine)) { printf ("AT::pow(-3,2) != 9\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (!equal(result , -twentySeven)) { printf ("AT::pow(-3,3) != 27\n"); - success = 0; + FAILURE(); } } if (!equal(AT::sqrt (zero) , zero)) { printf ("AT::sqrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (one) , one)) { printf ("AT::sqrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (thirtySix) , six)) { printf ("AT::sqrt(36) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (sixtyFour) , eight)) { printf ("AT::sqrt(64) != 8\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt (fortyTwo) , six)) { printf ("AT:sqrt(42) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (oneTwentySeven) , eleven)) { printf ("AT::sqrt(127) != 11\n"); - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); - success = 0; + FAILURE(); } } if (!equal(AT::log (one) , zero)) { printf ("AT::log(1) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::log10 (one) , zero)) { printf ("AT::log10(1) != 0\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -701,11 +711,11 @@ class ArithTraitsTesterTranscendentalBase : 
const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { ScalarType val = three; @@ -713,25 +723,25 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (one)), one)) { printf ("AT::asin(sin(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (one)), one)) { printf ("AT::acos(cos(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (one)), one)) { printf ("AT::atan(tan(1)) != 1\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -752,7 +762,7 @@ class ArithTraitsTesterTranscendentalBase : if (! HasTranscendentals::value) { out << "HasTranscendentals::value is false" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -781,20 +791,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (result != eight) { out << "AT::pow (two, three) != eight" << endl; - success = 0; + FAILURE(); } } if (AT::pow (three, zero) != one) { out << "AT::pow (three, zero) != one" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, one) != three) { out << "AT::pow (three, one) != three" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, two) != nine) { out << "AT::pow (three, two) != nine" << endl; - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. 
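// [Editor's note] Every "- success = 0;" / "+ FAILURE();" hunk in
// Test_Common_ArithTraits.hpp uses the FAILURE() macro added near the top of
// that file, so a failing check now reports the file, function, and line
// instead of silently clearing the flag. Reduced stand-alone sketch of the
// same idiom; the identity checks shown are hypothetical.
#include <cstdio>

#define FAILURE() { std::printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0; }

int check_basic_identities() {
  int success = 1;
  const double zero = 0.0, one = 1.0;
  if (zero + one != one) FAILURE();  // prints the location, keeps running
  if (one - one != zero) FAILURE();
  return success;
}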
@@ -803,7 +813,7 @@ class ArithTraitsTesterTranscendentalBase : if (result != twentySeven) { out << "AT::pow (three, three) = " << result << " != twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } @@ -813,95 +823,95 @@ class ArithTraitsTesterTranscendentalBase : if (result != -three) { out << "AT::pow (-three, one) = " << result << " != -three = " << -three << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (result != nine) { out << "AT::pow (-three, two) = " << result << " != nine = " << nine << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (result != -twentySeven) { out << "AT::pow (-three, three) = " << result << " != -twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } if (AT::sqrt (zero) != zero) { out << "AT::sqrt (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (one) != one) { out << "AT::sqrt (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (thirtySix) != six) { out << "AT::sqrt (thirtySix) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (sixtyFour) != eight) { out << "AT::sqrt (sixtyFour) != eight" << endl; - success = 0; + FAILURE(); } if (AT::is_integer) { if (AT::sqrt (fortyTwo) != six) { out << "AT::sqrt (fortyTwo) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (oneTwentySeven) != eleven) { out << "AT::sqrt (oneTwentySeven) != eleven" << endl; - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); - success = 0; + FAILURE(); } } if (AT::log (one) != zero) { out << "AT::log (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::log10 (one) != zero) { out << "AT::log10 (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -910,11 +920,11 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { const ScalarType val = three; @@ -922,25 +932,25 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (three)), 
three)) { printf ("AT::asin(sin(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (three)), three)) { printf ("AT::acos(cos(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (three)), three)) { printf ("AT::atan(tan(3)) != 3\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1020,17 +1030,32 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; // Apparently, std::numeric_limits::is_signed is 1 // only for real numbers. - if (AT::is_signed != std::numeric_limits::is_signed) { - success = 0; +#if defined(KOKKOS_HALF_T_IS_FLOAT) + if (std::is_same::value) { + if (AT::is_signed != 0x1) + FAILURE(); + } else +#else + { + if (AT::is_signed != std::numeric_limits::is_signed) { + printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); + FAILURE(); + } } +#endif // KOKKOS_HALF_T_IS_FLOAT + if (AT::is_complex) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1052,11 +1077,11 @@ class ArithTraitsTesterComplexBase : // Apparently, std::numeric_limits::is_signed is 1 only for real numbers. if (AT::is_signed != std::numeric_limits::is_signed) { out << "ArithTraits::is_signed != std::numeric_limits::is_signed" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1090,12 +1115,13 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! AT::is_complex) { - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1108,7 +1134,7 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne || AT::conj (onePlusOne) != oneMinusOne) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1129,7 +1155,7 @@ class ArithTraitsTesterComplexBase : if (! AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1142,11 +1168,11 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne) { out << "AT::conj ((1, -1)) != (1, 1)" << endl; - success = 0; + FAILURE(); } if (AT::conj (onePlusOne) != oneMinusOne) { out << "AT::conj ((1, 1)) != (1, -1)" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1232,17 +1258,19 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (AT::is_exact) { printf ("AT::is_exact is 1\n"); - success = 0; + FAILURE(); } + if (! 
AT::isNan (AT::nan ())) { printf ("NaN is not NaN\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -1250,19 +1278,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { printf ("0 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isInf (one)) { printf ("1 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isNan (zero)) { printf ("0 is NaN\n"); - success = 0; + FAILURE(); } if (AT::isNan (one)) { printf ("1 is NaN\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1283,14 +1311,14 @@ class ArithTraitsTesterFloatingPointBase : if (AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } //if (std::numeric_limits::is_iec559) { //success = success && AT::isInf (AT::inf ()); if (! AT::isNan (AT::nan ())) { out << "isNan or nan failed" << endl; - success = 0; + FAILURE(); } //} @@ -1299,19 +1327,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { out << "isInf(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isInf (one)) { out << "isInf(one) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (zero)) { out << "isNan(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (one)) { out << "isNan(one) is 1" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1351,13 +1379,14 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! AT::is_exact) { printf ("! AT:is_exact\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1378,7 +1407,7 @@ class ArithTraitsTesterFloatingPointBase : if (! AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1532,6 +1561,13 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) // Built-in real floating-point types // +#if defined(KOKKOS_HALF_T_IS_FLOAT) + TRACE(); + success = success && curSuccess; + curSuccess = + testArithTraitsOnDevice( + out, verbose); +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); @@ -1542,7 +1578,7 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); - return success; + return success && curSuccess; } @@ -1598,7 +1634,7 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#ifndef KOKKOS_ENABLE_CUDA +#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); @@ -1609,11 +1645,17 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) // Kokkos' complex floating-point types // +#if defined(KOKKOS_HALF_T_IS_FLOAT) + success = success && curSuccess; + TRACE(); + curSuccess = testArithTraitsOnHost( + out, verbose); +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); //success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); - return success; + return success && curSuccess; } template @@ -1627,8 +1669,8 @@ void test_ArithTraits () int overflow(int c) { return c; } }; NullBuffer null_buffer; - //std::ostream &out = std::cout; - std::ostream out(&null_buffer); + std::ostream &out = std::cerr; + //std::ostream out(&null_buffer); bool success = true; diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index e610ded3f9..534782e590 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -39,19 +39,21 @@ IF (KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/cuda) APPEND_GLOB(CUDA_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_cuda SOURCES Test_Main.cpp ${CUDA_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(CUDA_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_cuda SOURCES Test_Main.cpp ${CUDA_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(CUDA_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse*.cpp) @@ -66,27 +68,29 @@ IF (KOKKOS_ENABLE_CUDA) "${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse_Utils_cusparse.cpp") ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + 
KOKKOSKERNELS_ADD_UNIT_TEST( sparse_cuda SOURCES Test_Main.cpp ${CUDA_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(CUDA_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_cuda SOURCES Test_Main.cpp ${CUDA_GRAPH_SOURCES} + COMPONENTS graph ) #currently float 128 test is not working. So common tests are explicitly added. APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_cuda SOURCES Test_Main.cpp @@ -94,6 +98,64 @@ IF (KOKKOS_ENABLE_CUDA) ) ENDIF () +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/hip) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/hip) + + APPEND_GLOB(HIP_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Blas*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_hip + SOURCES + Test_Main.cpp + ${HIP_BLAS_SOURCES} + COMPONENTS blas + ) + + # APPEND_GLOB(HIP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Batched*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # batched_dla_hip + # SOURCES + # Test_Main.cpp + # ${HIP_BATCHED_DLA_SOURCES} + # COMPONENTS batched + # ) + + # APPEND_GLOB(HIP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse*.cpp) + # # HIP does not provide UVM, these two tests are henced remove permanently + # # IF (NOT KOKKOS_ENABLE_CUDA_UVM) + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_findRelOffset.cpp") + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_trsv.cpp") + # # ENDIF() + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # sparse_hip + # SOURCES + # Test_Main.cpp + # ${HIP_SPARSE_SOURCES} + # COMPONENTS sparse + # ) + + # APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # graph_hip + # SOURCES + # Test_Main.cpp + # ${HIP_GRAPH_SOURCES} + # COMPONENTS graph + # ) + + #currently float 128 test is not working. So common tests are explicitly added. 
+ APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + common_hip + SOURCES + Test_Main.cpp + ${HIP_COMMON_SOURCES} + ) +ENDIF () + IF (KOKKOS_ENABLE_OPENMP) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/openmp) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/openmp) @@ -104,42 +166,46 @@ IF (KOKKOS_ENABLE_OPENMP) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-openmp.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_openmp SOURCES Test_Main.cpp ${OPENMP_BLAS_SOURCES} - ) + COMPONENTS blas + ) APPEND_GLOB(OPENMP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_openmp SOURCES Test_Main.cpp ${OPENMP_BATCHED_DLA_SOURCES} - ) + COMPONENTS batched + ) APPEND_GLOB(OPENMP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_openmp SOURCES Test_Main.cpp ${OPENMP_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(OPENMP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_openmp SOURCES Test_Main.cpp ${OPENMP_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_openmp SOURCES Test_Main.cpp @@ -157,43 +223,47 @@ IF (KOKKOS_ENABLE_SERIAL) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-serial.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_serial SOURCES Test_Main.cpp ${SERIAL_BLAS_SOURCES} # ARGS ${DISABLE_SLOW_DGEMM_DOUBLE_TEST} + COMPONENTS blas ) APPEND_GLOB(SERIAL_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_serial SOURCES Test_Main.cpp ${SERIAL_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(SERIAL_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_serial SOURCES Test_Main.cpp ${SERIAL_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(SERIAL_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_serial SOURCES Test_Main.cpp ${SERIAL_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_serial SOURCES Test_Main.cpp @@ -207,35 +277,38 @@ IF (KOKKOS_ENABLE_PTHREAD) APPEND_GLOB(THREADS_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_threads SOURCES Test_Main.cpp ${THREADS_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(THREADS_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_threads SOURCES 
Test_Main.cpp ${THREADS_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(THREADS_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_threads SOURCES Test_Main.cpp ${THREADS_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_threads SOURCES Test_Main.cpp diff --git a/unit_test/batched/Test_Batched_SerialGemm.hpp b/unit_test/batched/Test_Batched_SerialGemm.hpp index 791c22d054..6b6109de47 100644 --- a/unit_test/batched/Test_Batched_SerialGemm.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm.hpp @@ -66,6 +66,97 @@ namespace Test { Kokkos::Profiling::popRegion(); } }; + +template + void impl_test_batched_gemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5); + ScalarType beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + Functor_TestBatchedSerialGemm(alpha, a1, b1, beta, c1).run(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
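+ // A rough sanity note on the tolerance used further below: assuming
+ // KOKKOSKERNELS_IMPL_FP16_EPSILON is the IEEE binary16 machine epsilon
+ // (2^-10 ~= 9.8e-4), the bound (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON
+ // is about 2^-9 ~= 2.0e-3. It is applied to the accumulated relative error
+ // (sum of |c1 - c_expected| over sum of |c_expected|); sum is seeded with 1,
+ // presumably to keep the denominator nonzero in the empty N = 0 case.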
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + Kokkos::fence(); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k ViewType; Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); @@ -187,3 +278,65 @@ int test_batched_gemm() { return 0; } + +template +int test_batched_gemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=1;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp index 24222cba2f..087c94f997 100644 --- a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp @@ -1,3 +1,30 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, 
batched_scalar_serial_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT + #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; diff --git a/unit_test/batched/Test_Batched_SerialTrmm.hpp b/unit_test/batched/Test_Batched_SerialTrmm.hpp index 8f8fd48758..3301f3cd42 100644 --- a/unit_test/batched/Test_Batched_SerialTrmm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrmm.hpp @@ -54,7 +54,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_SerialTrtri.hpp b/unit_test/batched/Test_Batched_SerialTrtri.hpp index c50e26ae35..f4f74d6b7c 100644 --- a/unit_test/batched/Test_Batched_SerialTrtri.hpp +++ b/unit_test/batched/Test_Batched_SerialTrtri.hpp @@ -56,7 +56,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_TeamGemm.hpp b/unit_test/batched/Test_Batched_TeamGemm.hpp index 7418361809..10f11d686d 100644 --- a/unit_test/batched/Test_Batched_TeamGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm.hpp @@ -78,7 +78,7 @@ namespace Test { typename ScalarType, typename ParamTagType, typename AlgoTagType> - void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + void impl_test_batched_teamgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { typedef typename ViewType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -130,63 +130,155 @@ namespace Test { } EXPECT_NEAR_KK( diff/sum, 0, eps); } + + template + void impl_test_batched_teamgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), 
b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamGemm(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k -int test_batched_gemm() { +int test_batched_teamgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, 
dimM, dimN, dimK, dimM, dimN); } } } #endif @@ -194,3 +286,64 @@ int test_batched_gemm() { return 0; } +template +int test_batched_teamgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp index abf7983966..2f66860ff4 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp @@ -6,32 +6,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - 
test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -39,32 +39,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp index 065fb68c97..327b1bcc21 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp @@ -1,24 +1,50 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_t_half_half ) { + typedef 
::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif @@ -26,22 +52,22 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp new file mode 100644 index 0000000000..09b2dfa89c --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp @@ -0,0 +1,346 @@ +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { + + template + struct ParamTag { + typedef TA transA; + typedef TB transB; + }; + + template + struct Functor_TestBatchedTeamVector { + ViewType _a, _b, _c; + + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVector(const ScalarType alpha, + const ViewType &a, + const ViewType &b, + const ScalarType beta, + const ViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int k = member.league_rank(); + + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamVectorGemm:: + invoke(member, _alpha, aa, bb, _beta, cc); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVector"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? 
"::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _c.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a0("a0", N, matAdim1,matAdim2), a1("a1", N, matAdim1,matAdim2), + b0("b0", N, matBdim1,matBdim2), b1("b1", N, matBdim1,matBdim2), + c0("c0", N, matCdim1,matCdim2), c1("c1", N, matCdim1,matCdim2); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(a0, random, value_type(1.0)); + Kokkos::fill_random(b0, random, value_type(1.0)); + Kokkos::fill_random(c0, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a1, a0); + Kokkos::deep_copy(b1, b0); + Kokkos::deep_copy(c1, c0); + + /// test body + Functor_TestBatchedTeamVector(alpha, a0, b0, beta, c0).run(); + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror c0_host = Kokkos::create_mirror_view(c0); + typename ViewType::HostMirror c1_host = Kokkos::create_mirror_view(c1); + + Kokkos::deep_copy(c0_host, c0); + Kokkos::deep_copy(c1_host, c1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int k=0;k + void impl_test_batched_teamvectorgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + 
Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + //Functor_TestBatchedTeamVector(alpha, a_expected, b_expected, beta, c_expected).run(); + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. + // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k +int test_batched_teamvectorgemm() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + 
Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} + +template +int test_batched_teamvectorgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp new file mode 100644 index 0000000000..4926d20670 --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp @@ -0,0 +1,53 @@ +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, 
batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp new file mode 100644 index 0000000000..de7748bd65 --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp @@ -0,0 +1,80 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + 
//test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 55c71231f6..451b7fedac 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -25,7 +25,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp index 74fd49b988..9f72bd5e63 100644 --- a/unit_test/blas/Test_Blas3_trmm.hpp +++ b/unit_test/blas/Test_Blas3_trmm.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp index e6e98723c2..8fec44b637 100644 --- a/unit_test/blas/Test_Blas3_trsm.hpp +++ b/unit_test/blas/Test_Blas3_trsm.hpp @@ -49,7 +49,8 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas_trtri.hpp b/unit_test/blas/Test_Blas_trtri.hpp index f939b87b31..bcc6b842c8 100644 --- a/unit_test/blas/Test_Blas_trtri.hpp +++ b/unit_test/blas/Test_Blas_trtri.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git 
a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..8ac5c834bc --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..27e7b3b565 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/cuda/Test_Cuda_Graph_mis2.cpp b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp new file mode 100644 index 0000000000..00148fd653 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/cuda/Test_Cuda_Graph_rcm.cpp b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp new file mode 100644 index 0000000000..e7fb84820d --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 7dac558bff..6f60fc9d62 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_Distance2Color.hpp" +#include "KokkosGraph_MIS2.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" @@ -322,72 +323,12 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void test_old_d2(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth, lno_t row_size_variance) -{ - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using crsMat = KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename graph_type::row_map_type::non_const_type; - using entries_t = typename graph_type::entries_type::non_const_type; - using KernelHandle = KokkosKernelsHandle< - size_type, lno_t, double, - execution_space, memory_space, memory_space>; - //Generate graph - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); - auto G = A.graph; - rowmap_t t_rowmap("rowmap^T", numCols + 1); - entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosKernels::Impl::transpose_graph - - (numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); - auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); - auto t_rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); - auto t_entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); - std::vector algos = - {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; - for(auto algo : algos) - { - KernelHandle kh; - 
kh.create_distance2_graph_coloring_handle(algo); - // Compute the one-sided bipartite coloring. - graph_compute_distance2_color - (&kh, numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - execution_space().fence(); - auto coloring_handle = kh.get_distance2_graph_coloring_handle(); - auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); - EXPECT_LE(numColors, numRows); - bool success = Test::verifyBipartitePartialColoring - - (numRows, numCols, rowmapHost, entriesHost, t_rowmapHost, t_entriesHost, colorsHost); - EXPECT_TRUE(success) << "Old dist-2 coloring: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; - kh.destroy_distance2_graph_coloring_handle(); - } -} -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - test_old_d2(2000, 4000, 3000 * 20, 800, 10); \ - test_old_d2(4000, 2000, 3000 * 20, 800, 10); -#else -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) -#endif - #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_dist2_coloring(5000, 5000 * 20, 1000, 10); \ test_dist2_coloring(50, 50 * 10, 40, 10); \ } \ - TEST_F(TestCategory, graph##_##graph_color_deprecated_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ - { \ - DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - } \ TEST_F(TestCategory, graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_bipartite_symmetric(50, 50 * 5, 30, 1); \ diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp new file mode 100644 index 0000000000..30d32fb2dc --- /dev/null +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -0,0 +1,234 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +#include "KokkosGraph_MIS2.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" + +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +using namespace KokkosGraph; +using namespace KokkosGraph::Experimental; + +namespace Test { + +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. + bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} +} + +template +void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each 
algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_QUALITY}; + for(auto algo : algos) + { + auto mis = graph_d2_mis(symRowmap, symEntries, algo); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + bool success = Test::verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost); + EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int) algo << ") produced invalid set."; + } +} + +template +void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_QUALITY}; + for(auto algo : algos) + { + lno_t numClusters = 0; + auto labels = graph_mis2_coarsen(symRowmap, symEntries, numClusters, algo); + auto labelsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); + //Not a strong test, but sanity check the number of clusters returned + EXPECT_TRUE(numClusters >= 1 && numClusters <= numVerts); + //Check that every label is in the range [0, numClusters) + for(lno_t i = 0; i < numVerts; i++) + EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + } +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_mis2(5000, 5000 * 20, 1000, 10); \ + test_mis2(50, 50 * 10, 40, 10); \ + test_mis2(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ + test_mis2_coarsening(50, 50 * 10, 40, 10); \ + test_mis2_coarsening(5, 5 * 3, 5, 0); \ + } + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif +#endif diff --git a/unit_test/graph/Test_Graph_rcm.hpp b/unit_test/graph/Test_Graph_rcm.hpp new file mode 100644 index 0000000000..eb3cd45a37 --- /dev/null +++ b/unit_test/graph/Test_Graph_rcm.hpp @@ -0,0 +1,197 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosGraph_RCM.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include + +//Generates a graph from 3D 7-pt stencil. Slices grid into 2 connected components near the middle of X dimension. 
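(Editorial aside, not part of the patch: the generate7pt helper declared just below maps grid point (x, y, z) to the row-major index x + y*gridX + z*gridX*gridY and connects each vertex to its at-most-six axis-aligned neighbours; the patch additionally drops the edges crossing a plane near the middle of the X dimension so the resulting graph has exactly two connected components. A minimal standalone sketch of just the indexing and neighbour-counting logic — names flatIndex and degree are illustrative only and do not appear in the patch:)

```cpp
// Standalone illustration of the 7-point-stencil indexing convention.
#include <cstdio>
#include <vector>

int main() {
  const int gridX = 3, gridY = 3, gridZ = 3;
  // Row-major linear index of grid point (x, y, z).
  auto flatIndex = [&](int x, int y, int z) {
    return x + y * gridX + z * gridX * gridY;
  };
  // Count each vertex's neighbours: interior vertices have 6, faces 5,
  // edges 4, corners 3 (the real generator also cuts a slice in X).
  std::vector<int> degree(gridX * gridY * gridZ, 0);
  for (int z = 0; z < gridZ; z++)
    for (int y = 0; y < gridY; y++)
      for (int x = 0; x < gridX; x++) {
        int v = flatIndex(x, y, z);
        if (x > 0)         degree[v]++;
        if (x < gridX - 1) degree[v]++;
        if (y > 0)         degree[v]++;
        if (y < gridY - 1) degree[v]++;
        if (z > 0)         degree[v]++;
        if (z < gridZ - 1) degree[v]++;
      }
  // The center of a 3x3x3 grid has all six neighbours.
  printf("center vertex %d has degree %d\n",
         flatIndex(1, 1, 1), degree[flatIndex(1, 1, 1)]);
  return 0;
}
```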
+template +void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, int gridY, int gridZ) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto getVertexID = + [=](lno_t x, lno_t y, lno_t z) -> lno_t + { + return x + y * gridX + z * gridX * gridY; + }; + lno_t numVertices = gridX * gridY * gridZ; + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector entries; + rowmap[0] = 0; + lno_t xslice = gridX / 2; + for(lno_t k = 0; k < gridZ; k++) + { + for(lno_t j = 0; j < gridY; j++) + { + for(lno_t i = 0; i < gridX; i++) + { + lno_t v = getVertexID(i, j, k); + if(i != 0 && i != xslice + 1) + entries.push_back(getVertexID(i - 1, j, k)); + if(i != gridX - 1 && i != xslice) + entries.push_back(getVertexID(i + 1, j, k)); + if(j != 0) + entries.push_back(getVertexID(i, j - 1, k)); + if(j != gridY - 1) + entries.push_back(getVertexID(i, j + 1, k)); + if(k != 0) + entries.push_back(getVertexID(i, j, k - 1)); + if(k != gridZ - 1) + entries.push_back(getVertexID(i, j, k + 1)); + rowmap[v + 1] = entries.size(); + } + } + } + size_type numEdges = entries.size(); + //Now that the graph is formed, copy rowmap and entries to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> entriesHost(entries.data(), numEdges); + //Allocate owning views on device with the correct size. + rowmapView = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Rowmap"), numVertices + 1); + entriesView = entries_t(Kokkos::ViewAllocateWithoutInitializing("Colinds"), numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapView, rowmapHost); + Kokkos::deep_copy(entriesView, entriesHost); +} + +template +int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + lno_t numVerts = rowmap.extent(0) - 1; + int bw = 0; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + if(nei > i) + { + lno_t thisBW = nei - i; + if(thisBW > bw) + bw = thisBW; + } + } + } + return bw; +} + +template +void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) +{ + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type rowmap_t; + typedef typename graph_t::entries_type entries_t; + lno_t numVerts = gridX * gridY * gridZ; + typename rowmap_t::non_const_type rowmap; + typename entries_t::non_const_type entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + auto rcm = KokkosGraph::Experimental::graph_rcm(rowmap, entries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + decltype(rcmHost) rcmPermHost(Kokkos::ViewAllocateWithoutInitializing("RCMPerm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + rcmPermHost(rcmHost(i)) = i; + //make sure each row index shows up exactly once + { + std::vector counts(numVerts); + 
for(lno_t i = 0; i < numVerts; i++) + { + lno_t orig = rcmHost(i); + ASSERT_GE(orig, 0); + ASSERT_LT(orig, numVerts); + counts[orig]++; + } + for(lno_t i = 0; i < numVerts; i++) + ASSERT_EQ(counts[i], 1); + } + Kokkos::View identityOrder(Kokkos::ViewAllocateWithoutInitializing("Identity"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + identityOrder(i) = i; + size_t origBW = maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +TEST_F( TestCategory, graph ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + test_rcm(6, 3, 3); \ + test_rcm(20, 20, 20); \ + test_rcm(100, 100, 1); \ +} + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif diff --git a/unit_test/hip/Test_HIP.hpp b/unit_test/hip/Test_HIP.hpp new file mode 100644 index 0000000000..cd4c49f16d --- /dev/null +++ b/unit_test/hip/Test_HIP.hpp @@ -0,0 +1,21 @@ +#include +#include +#include + +#if defined(KOKKOSKERNELS_TEST_ETI_ONLY) && !defined(KOKKOSKERNELS_ETI_ONLY) +#define KOKKOSKERNELS_ETI_ONLY +#endif + +class hip : public ::testing::Test { +protected: + static void SetUpTestCase() + { + } + + static void TearDownTestCase() + { + } +}; + +#define TestCategory hip +#define TestExecSpace Kokkos::Experimental::HIP diff --git a/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp new file mode 100644 index 0000000000..1aceff3e62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialEigendecomposition.hpp" +#include "Test_Batched_SerialEigendecomposition_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp new file mode 100644 index 0000000000..280d12eb89 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp new file mode 100644 index 0000000000..0a3425962a --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Real.hpp" diff --git 
a/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp new file mode 100644 index 0000000000..1f405f4caa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp new file mode 100644 index 0000000000..98e69da8e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp new file mode 100644 index 0000000000..7d0f3bcdea --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp new file mode 100644 index 0000000000..c147695515 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp new file mode 100644 index 0000000000..ac11b50956 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp new file mode 100644 index 0000000000..b9bdbfb95a --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp new file mode 100644 index 0000000000..d7070fd0b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp new file mode 100644 index 0000000000..65674e04b9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..059877ff2d --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 
0000000000..d09271a0e6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp new file mode 100644 index 0000000000..e10cb11259 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp new file mode 100644 index 0000000000..95b412a3a8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp new file mode 100644 index 0000000000..b12b6fc203 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp new file mode 100644 index 0000000000..660293cfd2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp new file mode 100644 index 0000000000..f82c94e5e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp new file mode 100644 index 0000000000..34c80371e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp new file mode 100644 index 0000000000..387aee1cc2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp new file mode 100644 index 0000000000..1f996ca4e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp new file mode 100644 index 0000000000..49b75ee6fa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include 
"Test_Batched_TeamGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp new file mode 100644 index 0000000000..52cacfa3c8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include "Test_Batched_TeamGemm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp new file mode 100644 index 0000000000..fed2bad261 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp new file mode 100644 index 0000000000..2d589ba4ef --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp new file mode 100644 index 0000000000..fa4ab4b3a1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp new file mode 100644 index 0000000000..9877053d34 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp new file mode 100644 index 0000000000..068f2faa3f --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp new file mode 100644 index 0000000000..0e09a25fb2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp new file mode 100644 index 0000000000..8a2b9d4c44 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp new file mode 100644 index 0000000000..8262c3c2eb --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..b5474a3a24 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..469fce62a9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp new file mode 100644 index 0000000000..e48617a7b6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp new file mode 100644 index 0000000000..83ce8988d0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp new file mode 100644 index 0000000000..ff75837fca --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp new file mode 100644 index 0000000000..5fba12911e --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp new file mode 100644 index 0000000000..e8ee97ffc7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR.hpp" +#include "Test_Batched_TeamVectorQR_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp new file mode 100644 index 0000000000..a55667f9d4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp new file mode 100644 index 0000000000..aaa8ad4f91 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp @@ -0,0 +1,6 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorSolveUTV.hpp" +#include "Test_Batched_TeamVectorSolveUTV_Real.hpp" + +#include "Test_Batched_TeamVectorSolveUTV2.hpp" +#include "Test_Batched_TeamVectorSolveUTV2_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp new file mode 100644 index 0000000000..f60705ae07 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorUTV.hpp" +#include "Test_Batched_TeamVectorUTV_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Blas1_abs.cpp b/unit_test/hip/Test_HIP_Blas1_abs.cpp new file mode 100644 index 0000000000..e175c8970e --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_abs.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_asum.cpp b/unit_test/hip/Test_HIP_Blas1_asum.cpp new file mode 100644 index 0000000000..c93f5f32fd --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_asum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_axpby.cpp new file mode 100644 index 0000000000..2814ecc583 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpby.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_axpy.cpp new file mode 100644 index 0000000000..8c7170d275 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpy.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_dot.cpp b/unit_test/hip/Test_HIP_Blas1_dot.cpp new file mode 100644 index 0000000000..2892b1e7e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_dot.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_iamax.cpp b/unit_test/hip/Test_HIP_Blas1_iamax.cpp new file mode 100644 index 0000000000..8fb34c05db --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_iamax.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_mult.cpp b/unit_test/hip/Test_HIP_Blas1_mult.cpp new file mode 100644 index 0000000000..e124061c58 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_mult.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm1.cpp b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp new file mode 100644 index 0000000000..fb292630e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp new file mode 100644 index 0000000000..cf2f9e7237 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp new file mode 100644 index 0000000000..4d91e62f85 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrminf.cpp b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp new file mode 100644 index 0000000000..67a07902f0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp new file mode 100644 index 0000000000..892469cb7c --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_scal.cpp b/unit_test/hip/Test_HIP_Blas1_scal.cpp new file mode 100644 index 0000000000..11df7e89b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_scal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_sum.cpp b/unit_test/hip/Test_HIP_Blas1_sum.cpp new file mode 100644 index 0000000000..3be74c5d9a --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_sum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/hip/Test_HIP_Blas1_team_abs.cpp b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp new file mode 100644 index 0000000000..d59b6a61de --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp new file mode 100644 index 0000000000..0f3a2a5fec --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp new file mode 100644 index 0000000000..823154d5af --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_dot.cpp b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp new file mode 100644 index 0000000000..05987c8dd4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_mult.cpp b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp new file mode 100644 index 0000000000..ca54d031f1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp new file mode 100644 index 0000000000..9994255a31 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_scal.cpp b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp new file mode 100644 index 0000000000..2f804c4dc5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_update.cpp b/unit_test/hip/Test_HIP_Blas1_team_update.cpp new file mode 100644 index 0000000000..99cc8746ed --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_update.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_update.cpp b/unit_test/hip/Test_HIP_Blas1_update.cpp new file mode 100644 index 0000000000..f2388dbc9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_update.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_gemv.cpp new file mode 100644 index 0000000000..9df86cde64 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_gemv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp new file mode 100644 index 0000000000..da40621400 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_gemm.cpp b/unit_test/hip/Test_HIP_Blas3_gemm.cpp new file mode 100644 index 0000000000..9fdd5004a4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_gemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trmm.cpp b/unit_test/hip/Test_HIP_Blas3_trmm.cpp new file mode 100644 index 0000000000..baaf52d8a5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trmm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trsm.cpp b/unit_test/hip/Test_HIP_Blas3_trsm.cpp new file mode 100644 index 0000000000..fa4ce5e728 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trsm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas_gesv.cpp b/unit_test/hip/Test_HIP_Blas_gesv.cpp new file mode 100644 index 
0000000000..7d4a4bb0c4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas_gesv.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas_trtri.cpp b/unit_test/hip/Test_HIP_Blas_trtri.cpp new file mode 100644 index 0000000000..e5b58ad470 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas_trtri.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_ArithTraits.cpp b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp new file mode 100644 index 0000000000..6482ba2dba --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Sorting.cpp b/unit_test/hip/Test_HIP_Common_Sorting.cpp new file mode 100644 index 0000000000..f01730e654 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Transpose.cpp b/unit_test/hip/Test_HIP_Common_Transpose.cpp new file mode 100644 index 0000000000..d81855df62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Transpose.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_set_bit_count.cpp b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp new file mode 100644 index 0000000000..bd2fd76423 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color.cpp b/unit_test/hip/Test_HIP_Graph_graph_color.cpp new file mode 100644 index 0000000000..01343e32c5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp new file mode 100644 index 0000000000..5ca8df65dc --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp new file mode 100644 index 0000000000..b24e4bf4b4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_rcm.cpp b/unit_test/hip/Test_HIP_Graph_rcm.cpp new file mode 100644 index 0000000000..652eb9ade5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp new file mode 100644 index 0000000000..782e8152a2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp @@ -0,0 +1,3 @@ +#include +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp new file mode 100644 index 0000000000..986460a37b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp new file mode 100644 index 0000000000..0d82182e9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp new file mode 100644 index 0000000000..b63fee6a94 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp new file mode 100644 index 0000000000..72bf132cf0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp new file mode 100644 index 0000000000..daf96e433d --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spadd.cpp b/unit_test/hip/Test_HIP_Sparse_spadd.cpp new file mode 100644 index 0000000000..98736daebf --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spadd.cpp @@ -0,0 +1,3 @@ +#include +#include + diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp new file mode 100644 index 0000000000..2402f7596e --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp new file mode 100644 index 0000000000..6ab09e6743 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spiluk.cpp b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp new file mode 100644 index 0000000000..83f2a59192 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spmv.cpp b/unit_test/hip/Test_HIP_Sparse_spmv.cpp new file mode 100644 index 0000000000..18edf035e8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spmv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp new file mode 100644 index 0000000000..cb18ff3ed8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_trsv.cpp b/unit_test/hip/Test_HIP_Sparse_trsv.cpp new file mode 100644 index 0000000000..c371d334e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_trsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..9adfd61517 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..e841dea6a5 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp new file mode 100644 index 0000000000..8622411b0f --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp new file mode 100644 index 0000000000..eb8164cb30 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp @@ 
-0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..991031d817 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..cc2041cefc --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/serial/Test_Serial_Graph_mis2.cpp b/unit_test/serial/Test_Serial_Graph_mis2.cpp new file mode 100644 index 0000000000..38db82cfc9 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Graph_rcm.cpp b/unit_test/serial/Test_Serial_Graph_rcm.cpp new file mode 100644 index 0000000000..ac225ba858 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 5645f1d2ce..9993d46e22 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -81,8 +81,8 @@ int run_gauss_seidel( bool is_symmetric_graph, int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. int cluster_size = 1, - ClusteringAlgorithm cluster_algorithm = CLUSTER_DEFAULT, - bool classic = false) // only with two-stage, true for sptrsv instead of richardson + bool classic = false, // only with two-stage, true for sptrsv instead of richardson + ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; @@ -101,7 +101,7 @@ int run_gauss_seidel( kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); if(gs_algorithm == GS_CLUSTER) - kh.create_gs_handle(cluster_algorithm, cluster_size); + kh.create_gs_handle(clusterAlgo, cluster_size); else if(gs_algorithm == GS_TWOSTAGE) { // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); @@ -282,9 +282,10 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; + std::vector clusteringAlgos = {CLUSTER_MIS2, CLUSTER_BALLOON}; for(int csize = 0; csize < 3; csize++) { - for(int algo = 0; algo < (int) NUM_CLUSTERING_ALGORITHMS; algo++) + for(auto clusterAlgo : clusteringAlgos) { for(int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -292,7 +293,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ //Zero out X before solving Kokkos::deep_copy(x_vector, zero); run_gauss_seidel( - input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], (ClusteringAlgorithm) algo); + input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, clusterAlgo); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -312,10 +313,9 @@ void 
test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ //*** Two-stage version (classic) **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -435,10 +435,9 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ for(int apply_type = 0; apply_type < apply_count; ++apply_type) { //Zero out X before solving - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -455,43 +454,6 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } } -template -void test_rcm(lno_t numRows, size_type nnzPerRow, lno_t bandwidth) -{ - using namespace Test; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_row_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef KokkosKernelsHandle - KernelHandle; - srand(245); - size_type nnzTotal = nnzPerRow * numRows; - lno_t nnzVariance = nnzPerRow / 4; - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numRows, nnzTotal, nnzVariance, bandwidth); - lno_row_view_t symRowmap; - lno_nnz_view_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numRows, A.graph.row_map, A.graph.entries, symRowmap, symEntries); - typedef KokkosSparse::Impl::RCM rcm_t; - rcm_t rcm(numRows, symRowmap, symEntries); - lno_nnz_view_t rcmOrder = rcm.rcm(); - //perm(i) = the node with timestamp i - //make sure that perm is in fact a permutation matrix (contains each row exactly once) - Kokkos::View rcmHost("RCM row ordering", numRows); - Kokkos::deep_copy(rcmHost, rcmOrder); - std::set rowSet; - for(lno_t i = 0; i < numRows; i++) - rowSet.insert(rcmHost(i)); - if((lno_t) rowSet.size() != numRows) - { - std::cerr << "Only got back " << rowSet.size() << " unique row IDs!\n"; - return; - } -} - template void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { const scalar_t zero = Kokkos::Details::ArithTraits::zero(); @@ -659,9 +621,6 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR TEST_F( TestCategory, sparse ## _ ## gauss_seidel_zero_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_sgs_zero_rows(); \ } \ -TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_rcm(10000, 50, 2000); \ -} \ TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_balloon_clustering(5000, 100, 2000); \ } \ diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index a7b42fa697..5a033fdf34 100644 --- 
a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -42,7 +42,7 @@ struct fSPMV { if(error > eps) { err++; - printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); + //printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); } } }; @@ -51,7 +51,8 @@ struct fSPMV { template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta){ + typename y_vector_type::non_const_value_type beta, + char mode = 'N'){ using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; @@ -61,7 +62,9 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using size_type = typename size_type_view_t::non_const_value_type; using lno_t = typename lno_view_t::non_const_value_type; using scalar_t = typename scalar_view_t::non_const_value_type; + using KAT = Kokkos::ArithTraits; + mode = toupper(mode); typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values,input_mat.values); @@ -84,15 +87,24 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, lno_t nr = input_mat.numRows(); - for (lno_t i = 0; i < nr; ++i){ - scalar_t result = 0; - for (size_type j = h_rowmap(i); j < h_rowmap(i+1); ++j){ + //first, scale y by beta + for(size_t i = 0; i < h_y.extent(0); i++) + h_y(i) *= beta; + + //then go through the matrix and accumulate the matrix-vector product + for (lno_t row = 0; row < nr; ++row) { + for (size_type j = h_rowmap(row); j < h_rowmap(row+1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - scalar_t vector_val = h_x(col); - result += val * vector_val; + if(mode == 'N') + h_y(row) += alpha * val * h_x(col); + else if(mode == 'C') + h_y(row) += alpha * KAT::conj(val) * h_x(col); + else if(mode == 'T') + h_y(col) += alpha * val * h_x(row); + else if(mode == 'H') + h_y(col) += alpha * KAT::conj(val) * h_x(row); } - h_y(i) = beta * h_y(i) + alpha * result; } KokkosKernels::Impl::safe_host_to_device_deep_copy (y.extent(0), h_y, y); Kokkos::fence(); @@ -102,7 +114,7 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { + typename y_vector_type::non_const_value_type beta, char mode) { //typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -114,15 +126,25 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // so let us use y_value_type to determine // the appropriate tolerance precision. const y_value_mag_type eps = std::is_same::value ? 2*1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); - y_vector_type expected_y("expected", nr); + bool transposed = (mode == 'T') || (mode == 'H'); + y_vector_type expected_y("expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); - sequential_spmv(input_mat, x, expected_y, alpha, beta); - //KokkosKernels::Impl::print_1Dview(expected_y); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); - //KokkosKernels::Impl::print_1Dview(y); + sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 1D, mode " << mode << ": threw exception:\n" << msg << '\n'; int num_errors = 0; Kokkos::parallel_reduce("KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), @@ -137,7 +159,7 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV) { + typename y_vector_type::non_const_value_type beta, int numMV, char mode) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -153,8 +175,19 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto Kokkos::fence(); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); - + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 2D, mode " << mode << ": threw exception:\n" << msg << '\n'; for (int i = 0; i < numMV; ++i){ auto x_i = Kokkos::subview (x, Kokkos::ALL (), i); @@ -162,7 +195,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto auto y_i = Kokkos::subview (expected_y, Kokkos::ALL (), i); Kokkos::fence(); - sequential_spmv(input_mat, x_i, y_i, alpha, beta); + sequential_spmv(input_mat, x_i, y_i, alpha, beta, mode); auto y_spmv = Kokkos::subview (y, Kokkos::ALL (), i); int num_errors = 0; @@ -170,8 +203,9 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto my_exec_space(0,y_i.extent(0)), fSPMV(y_i, y_spmv, eps), num_errors); - if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", - num_errors, y_i.extent_int(0), i); + if(num_errors>0) + std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) + << " for mv " << i << " (alpha=" << alpha << ", beta=" << beta << ", mode = " << mode << ")\n"; EXPECT_TRUE(num_errors==0); } } @@ -306,6 +340,23 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls, } // namespace Test +template +scalar_t randomUpperBound(int mag) +{ + return (scalar_t) mag; +} + +template <> +Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} + +template <> +Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} template void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance){ @@ -326,18 +377,33 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari x_vector_type input_x ("x", nc); y_vector_type output_y ("y", nr); + x_vector_type input_xt ("x", nr); + y_vector_type output_yt ("y", nc); 
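(Editorial aside, not part of the patch: the mode-aware checks in this test compare KokkosSparse::spmv against the sequential reference above, which computes y = beta*y + alpha*op(A)*x with op(A) = A, conj(A), A^T, or A^H for modes 'N', 'C', 'T', 'H'. For 'T'/'H' the input vector has numRows entries and the output has numCols, which is why input_xt/output_yt are sized oppositely to input_x/output_y. A self-contained sketch of that reference semantics on std::vector-based CRS data — refSpmv and the tiny example matrix are illustrative only and do not appear in the patch:)

```cpp
// Reference SpMV semantics for the four modes, on plain host containers.
#include <complex>
#include <cstdio>
#include <vector>

using Scalar = std::complex<double>;

void refSpmv(char mode, Scalar alpha, Scalar beta,
             const std::vector<int>& rowmap, const std::vector<int>& entries,
             const std::vector<Scalar>& values,
             const std::vector<Scalar>& x, std::vector<Scalar>& y) {
  for (auto& yi : y) yi *= beta;  // scale y by beta first
  const int numRows = (int)rowmap.size() - 1;
  for (int row = 0; row < numRows; ++row)
    for (int j = rowmap[row]; j < rowmap[row + 1]; ++j) {
      const int col = entries[j];
      // 'C' and 'H' conjugate the matrix values.
      const Scalar val =
          (mode == 'C' || mode == 'H') ? std::conj(values[j]) : values[j];
      if (mode == 'N' || mode == 'C')
        y[row] += alpha * val * x[col];  // y := y + alpha*op(A)*x
      else
        y[col] += alpha * val * x[row];  // 'T'/'H': scatter into y[col]
    }
}

int main() {
  // 2x3 matrix [[1, 0, 2i], [0, 3, 0]] stored in CRS form.
  std::vector<int> rowmap{0, 2, 3}, entries{0, 2, 1};
  std::vector<Scalar> values{{1, 0}, {0, 2}, {3, 0}};
  std::vector<Scalar> x{{1, 0}, {1, 0}, {1, 0}}, y(2, Scalar{0, 0});
  refSpmv('N', Scalar{1, 0}, Scalar{0, 0}, rowmap, entries, values, x, y);
  printf("y[0] = (%g, %g)\n", y[0].real(), y[0].imag());  // expect (1, 2)
  return 0;
}
```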
Kokkos::Random_XorShift64_Pool rand_pool(13718); typedef typename x_vector_type::value_type ScalarX; typedef typename y_vector_type::value_type ScalarY; - Kokkos::fill_random(input_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(output_y,rand_pool,ScalarY(10)); + Kokkos::fill_random(input_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(input_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_yt,rand_pool,randomUpperBound(10)); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0); - Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0); + std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0, mode); + } + for(auto mode : transModes) + { + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 1.0, mode); + } } template @@ -353,21 +419,66 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v ViewTypeY b_y("B",numCols,numMV); ViewTypeY b_y_copy("B",numCols,numMV); + ViewTypeX b_xt("A",numCols,numMV); + ViewTypeY b_yt("B",numRows,numMV); + ViewTypeY b_yt_copy("B",numRows,numMV); + Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); - Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_yt,rand_pool,randomUpperBound(10)); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); Kokkos::deep_copy(b_y_copy, b_y); + Kokkos::deep_copy(b_yt_copy, b_yt); + std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV, mode); + } + for(auto mode : transModes) + { + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 1.0, numMV, mode); + } +} - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV); +template +void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef Kokkos::View ViewTypeX; + typedef Kokkos::View ViewTypeY; + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numRows,nnz,row_size_variance, bandwidth); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + for(int nv = 1; nv <= numMV; nv++) { + ViewTypeX b_x("A",numRows,nv); + ViewTypeY b_y("B",numRows,nv); + ViewTypeY b_y_copy("B",numRows,nv); + + 
Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + + Kokkos::deep_copy(b_y_copy, b_y); + + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + } } template @@ -736,6 +847,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ diff --git a/unit_test/standalone/main.cpp b/unit_test/standalone/main.cpp index 68d336805e..259a572c7a 100644 --- a/unit_test/standalone/main.cpp +++ b/unit_test/standalone/main.cpp @@ -3,6 +3,9 @@ #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA #include #endif +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP +#include +#endif #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #include #endif diff --git a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..02b4d3681f --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..5c17d8df16 --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/threads/Test_Threads_Graph_mis2.cpp b/unit_test/threads/Test_Threads_Graph_mis2.cpp new file mode 100644 index 0000000000..cbf15a7662 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_mis2.cpp @@ -0,0 +1,3 @@ +#include +#include + diff --git a/unit_test/threads/Test_Threads_Graph_rcm.cpp b/unit_test/threads/Test_Threads_Graph_rcm.cpp new file mode 100644 index 0000000000..37184bb806 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_rcm.cpp @@ -0,0 +1,3 @@ +#include +#include +
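
(Editorial aside, not part of the patch: the new Test_Graph_mis2.hpp earlier in this diff verifies the distance-2 MIS property — no two set members lie within two hops of each other, and every non-member has a set member within two hops, otherwise the set would not be maximal. A host-only sketch of that check on a plain adjacency list — isValidD2MIS and adj are illustrative names that do not appear in the patch:)

```cpp
// Host-only check of the distance-2 maximal-independent-set property.
#include <cstdio>
#include <set>
#include <vector>

bool isValidD2MIS(const std::vector<std::vector<int>>& adj,
                  const std::set<int>& S) {
  const int n = (int)adj.size();
  for (int v = 0; v < n; ++v) {
    // Is some member of S within two hops of v (excluding v itself)?
    bool memberWithin2Hops = false;
    for (int n1 : adj[v]) {
      if (n1 != v && S.count(n1)) { memberWithin2Hops = true; break; }
      for (int n2 : adj[n1]) {
        if (n2 != v && S.count(n2)) { memberWithin2Hops = true; break; }
      }
      if (memberWithin2Hops) break;
    }
    if (S.count(v) && memberWithin2Hops) return false;    // independence violated
    if (!S.count(v) && !memberWithin2Hops) return false;  // set is not maximal
  }
  return true;
}

int main() {
  // Path graph 0-1-2-3-4: {0, 3} is a valid distance-2 MIS, {0, 2} is not
  // (0 and 2 are only two hops apart).
  std::vector<std::vector<int>> adj{{1}, {0, 2}, {1, 3}, {2, 4}, {3}};
  printf("{0,3}: %d  {0,2}: %d\n",
         isValidD2MIS(adj, {0, 3}), isValidD2MIS(adj, {0, 2}));
  return 0;
}
```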