diff --git a/CHANGELOG.md b/CHANGELOG.md index 934eb67143..28e43dff49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,34 @@ # Change Log +## [3.3.00](https://github.com/kokkos/kokkos-kernels/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.01...3.3.00) + +**Implemented enhancements:** +- Add permanent RCM reordering interface, and a basic serial implementation [\#854](https://github.com/kokkos/kokkos-kernels/pull/854) +- Half\_t explicit conversions [\#849](https://github.com/kokkos/kokkos-kernels/pull/849) +- Add batched gemm performance tests [\#838](https://github.com/kokkos/kokkos-kernels/pull/838) +- Add HIP support to src and perf\_test [\#828](https://github.com/kokkos/kokkos-kernels/pull/828) +- Factor out coarsening [\#827](https://github.com/kokkos/kokkos-kernels/pull/827) +- Allow enabling/disabling components at configuration time [\#823](https://github.com/kokkos/kokkos-kernels/pull/823) +- HIP: CMake work on tests and ETI [\#820](https://github.com/kokkos/kokkos-kernels/pull/820) +- HIP: KokkosBatched - hip specialization [\#812](https://github.com/kokkos/kokkos-kernels/pull/812) +- Distance-2 maximal independent set [\#801](https://github.com/kokkos/kokkos-kernels/pull/801) +- Use batched TRTRI & TRMM for Supernode-sptrsv setup [\#797](https://github.com/kokkos/kokkos-kernels/pull/797) +- Initial support for half precision [\#794](https://github.com/kokkos/kokkos-kernels/pull/794) + +**Fixed bugs:** +- Fix issue with HIP and Kokkos\_ArithTraits [\#844](https://github.com/kokkos/kokkos-kernels/pull/844) +- HIP: fixing round of issues on AMD [\#840](https://github.com/kokkos/kokkos-kernels/pull/840) +- Throw an exception if BLAS GESV is not enabled [\#837](https://github.com/kokkos/kokkos-kernels/pull/837) +- Fixes -Werror for gcc with c++20 [\#836](https://github.com/kokkos/kokkos-kernels/pull/836) +- Add fallback condition to use spmv\_native when cuSPARSE does not work [\#834](https://github.com/kokkos/kokkos-kernels/pull/834) +- Fix install testing refactor for inline builds [\#811](https://github.com/kokkos/kokkos-kernels/pull/811) +- HIP: fix ArithTraits to support HIP backend [\#809](https://github.com/kokkos/kokkos-kernels/pull/809) +- cuSPARSE 11: fix spgemm and spmv\_struct\_tunning compilation error [\#804](https://github.com/kokkos/kokkos-kernels/pull/804) + +**Incompatibilities:** +- Remove pre-3.0 deprecated code [\#825](https://github.com/kokkos/kokkos-kernels/pull/825) + ## [3.2.01](https://github.com/kokkos/kokkos-kernels/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.00...3.2.01) @@ -28,8 +57,8 @@ - Nightly test failure: spgemm unit tests failing on White \(Power8\) [\#780](https://github.com/kokkos/kokkos-kernels/issues/780) - supernodal does not build with UVM enabled [\#633](https://github.com/kokkos/kokkos-kernels/issues/633) -## [3.1.1](https://github.com/kokkos/kokkos-kernels/tree/3.1.1) (2020-05-04) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.1) +## [3.1.01](https://github.com/kokkos/kokkos-kernels/tree/3.1.01) (2020-05-04) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.01) ** Fixed bugs:** diff --git a/CMakeLists.txt b/CMakeLists.txt index 44fbf3aba8..209db7ce6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 2) - SET(KokkosKernels_VERSION_PATCH 1) + SET(KokkosKernels_VERSION_MINOR 3) + SET(KokkosKernels_VERSION_PATCH 0) ENDIF() IF(${CMAKE_VERSION} 
VERSION_GREATER_EQUAL "3.12.0") @@ -83,7 +83,8 @@ IF (KokkosKernels_INSTALL_TESTING) ELSE() # Regular build, not install testing # Do all the regular option processing - IF (NOT KOKKOSKERNELS_HAS_TRILINOS) + IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) + # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") KOKKOS_CHECK(OPTIONS CUDA_UVM RETURN_VALUE KOKKOS_ENABLE_CUDA_UVM) @@ -139,6 +140,15 @@ ELSE() BOOL "Whether to restrict testing to ETI types. Default: ON" ) + + KOKKOSKERNELS_ADD_OPTION( + ENABLED_COMPONENTS + "ALL" + STRING + "A list of components to enable in testing and building" + VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + ) + # ================================================================== # Enable Device Types for ETI (exec- + mem-space) # ================================================================== @@ -159,7 +169,6 @@ ELSE() # Enable Layout Types for ETI # ================================================================== INCLUDE(cmake/kokkoskernels_eti_layouts.cmake) - # ================================================================== # Enable Third Party Libraries # ================================================================== diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 2bdb004ec2..bb246df3c6 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -21,8 +21,18 @@ get_kokkos_device_list() { for DEVICE_ in $PARSE_DEVICES_LST do UC_DEVICE=$(echo $DEVICE_ | tr "[:lower:]" "[:upper:]") + if [ "${UC_DEVICE}" == "CUDA" ]; then + WITH_CUDA_BACKEND=ON + fi + if [ "${UC_DEVICE}" == "HIP" ]; then + WITH_HIP_BACKEND=ON + fi KOKKOS_DEVICE_CMD="-DKokkos_ENABLE_${UC_DEVICE}=ON ${KOKKOS_DEVICE_CMD}" done + if [ "${WITH_CUDA_BACKEND}" == "ON" ] && [ "${WITH_HIP_BACKEND}" == "ON" ]; then + echo "Invalid configuration - Cuda and Hip cannot be simultaneously enabled" + exit + fi } get_kokkos_arch_list() { @@ -59,6 +69,24 @@ get_kokkos_cuda_option_list() { done } +get_kokkos_hip_option_list() { + echo parsing KOKKOS_HIP_OPTIONS=$KOKKOS_HIP_OPTIONS + KOKKOS_HIP_OPTION_CMD= + PARSE_HIP_LST=$(echo $KOKKOS_HIP_OPTIONS | tr "," "\n") + for HIP_ in $PARSE_HIP_LST + do + HIP_OPT_NAME= + if [ "${HIP_}" == "rdc" ]; then + HIP_OPT_NAME=HIP_RELOCATABLE_DEVICE_CODE + else + echo "${HIP_} is not a valid hip option..." + fi + if [ "${HIP_OPT_NAME}" != "" ]; then + KOKKOS_HIP_OPTION_CMD="-DKokkos_ENABLE_${HIP_OPT_NAME}=ON ${KOKKOS_HIP_OPTION_CMD}" + fi + done +} + get_kokkos_option_list() { echo parsing KOKKOS_OPTIONS=$KOKKOS_OPTIONS KOKKOS_OPTION_CMD= @@ -196,15 +224,21 @@ display_help_text() { echo "--prefix=/Install/Path: Path to install the KokkosKernels library." echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. 
Options are:" - echo " [AMD]" + echo " [AMD: CPU]" echo " AMDAVX = AMD CPU" - echo " EPYC = AMD EPYC Zen-Core CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [AMD: GPU]" + echo " VEGA900 = AMD GPU MI25 GFX900" + echo " VEGA906 = AMD GPU MI50/MI60 GFX906" + echo " VEGA908 = AMD GPU" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" @@ -264,6 +298,8 @@ display_help_text() { echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" + echo "--with-hip-options=[OPT]: Additional options to HIP:" + echo " rdc" echo "--with-scalars=[SCALARS]: Set scalars to be instantiated." echo " Options: float, double, complex_float, complex_double" echo "--with-ordinals=[ORDINALS]: Set ordinals to be instantiated." @@ -307,6 +343,10 @@ KOKKOS_MAKEINSTALL_J=4 KERNELS_DEFAULT_ETI_OPTION="" +# For tracking if Cuda and Hip devices are enabled simultaneously +WITH_CUDA_BACKEND=OFF +WITH_HIP_BACKEND=OFF + while [[ $# > 0 ]] do key="$1" @@ -340,6 +380,19 @@ do update_kokkos_devices Cuda CUDA_PATH="${key#*=}" ;; + --with-hip) + update_kokkos_devices Hip + HIP_PATH_HIPCC=$(command -v hipcc) + HIP_PATH=${HIP_PATH_HIPCC%/bin/hipcc} + ;; + # Catch this before '--with-hip*' + --with-hip-options*) + KOKKOS_HIP_OPTIONS="${key#*=}" + ;; + --with-hip*) + update_kokkos_devices Hip + HIP_PATH="${key#*=}" + ;; --with-openmp) update_kokkos_devices OpenMP ;; @@ -606,6 +659,7 @@ get_kokkos_device_list get_kokkos_option_list get_kokkos_arch_list get_kokkos_cuda_option_list +get_kokkos_hip_option_list get_kernels_scalar_list get_kernels_ordinals_list @@ -655,9 +709,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" 
-DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 54e0006aa0..31d77bda94 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,6 +11,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) +SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b5611c20ca..c0a1e98ec6 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -34,6 +34,9 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 8d623a67fe..26737b8919 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -24,6 +24,13 @@ ENDFOREACH() ENDFUNCTION() FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) + CMAKE_PARSE_ARGUMENTS(OPT + "" + "" + "VALID_ENTRIES" #if this is a list variable, the valid values in the list + ${ARGN} + ) + SET(CAMEL_NAME KokkosKernels_${SUFFIX}) STRING(TOUPPER ${CAMEL_NAME} UC_NAME) @@ -40,13 +47,28 @@ FUNCTION(kokkoskernels_add_option SUFFIX DEFAULT TYPE DOCSTRING) ENDIF() ENDFOREACH() + #okay, great, we passed the validation test - use the default IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + IF (OPT_VALID_ENTRIES) + STRING(TOUPPER "${OPT_VALID_ENTRIES}" OPT_VALID_ENTRIES_UC) + FOREACH(entry ${${CAMEL_NAME}}) + STRING(TOUPPER ${entry} ENTRY_UC) + IF (NOT ${ENTRY_UC} IN_LIST OPT_VALID_ENTRIES_UC) + MESSAGE(FATAL_ERROR "Given entry ${entry} in list for option ${SUFFIX}. " + "Valid case-insensitive values are any of ${OPT_VALID_ENTRIES}") + ENDIF() + ENDFOREACH() + STRING(TOUPPER "${${CAMEL_NAME}}" GIVEN_ENTRIES_UC) + SET(${UC_NAME} ${GIVEN_ENTRIES_UC} PARENT_SCOPE) + ELSE() + SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + ENDIF() ELSE() SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) ENDIF() + ENDFUNCTION() MACRO(KOKKOSKERNELS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE ) @@ -87,12 +109,6 @@ LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() -FUNCTION(VERIFY_EMPTY CONTEXT) -IF(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") -ENDIF() -ENDFUNCTION() - MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) @@ -161,52 +177,68 @@ IF(NOT TARGET check) ENDIF() FUNCTION(KOKKOSKERNELS_ADD_TEST) -IF (KOKKOSKERNELS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "" - "EXE;NAME" - "" - ${ARGN}) - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - ${ARGN} - COMM serial mpi - NUM_MPI_PROCS 1 - ${TEST_UNPARSED_ARGUMENTS} - ) -ELSE() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" - "CATEGORIES" - ${ARGN}) - IF(TEST_EXE) - SET(EXE ${TEST_EXE}) - ELSE() - SET(EXE ${TEST_NAME}) - ENDIF() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "COMPONENTS" + ${ARGN}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + CMAKE_PARSE_ARGUMENTS(TEST + "" + "EXE;NAME" + "" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE_ROOT ${TEST_EXE}) + ELSE() + SET(EXE_ROOT ${TEST_NAME}) + ENDIF() + + TRIBITS_ADD_TEST( + ${EXE_ROOT} + NAME ${TEST_NAME} + ${ARGN} + COMM serial mpi + NUM_MPI_PROCS 1 + ${TEST_UNPARSED_ARGUMENTS} + ) ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + CMAKE_PARSE_ARGUMENTS(TEST + "WILL_FAIL" + "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME" + "CATEGORIES" + ${PARSE_UNPARSED_ARGUMENTS}) + IF(TEST_EXE) + SET(EXE ${TEST_EXE}) + ELSE() + SET(EXE ${TEST_NAME}) + ENDIF() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping test ${TEST_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index e9dde7bf66..c2f46bb8e3 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -10,6 +10,7 @@ MACRO(CHECK_KOKKOS_BACKEND BE) ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) +CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ffb5715e32..ede934023c 100644 --- 
a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -4,11 +4,13 @@ SET(EXEC_SPACES EXECSPACE_CUDA + EXECSPACE_HIP EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -16,11 +18,13 @@ SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE + MEMSPACE_HIPSPACE MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) @@ -57,6 +61,30 @@ IF(KOKKOS_ENABLE_CUDA) ENDIF() +IF(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_HIP + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_HIPSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with HIP and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -109,6 +137,7 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) +SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index 0bd8c04963..4eebb97c7b 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -5,6 +5,12 @@ IF (KOKKOSKERNELS_HAS_TRILINOS) INCLUDE(TribitsETISupport) ENDIF() +FUNCTION(VERIFY_EMPTY CONTEXT) + IF(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") + ENDIF() +ENDFUNCTION() + #MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") MACRO(KOKKOSKERNELS_PACKAGE_POSTPROCESS) @@ -127,88 +133,121 @@ ENDIF() ENDFUNCTION() FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE EXE_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${EXE_NAME} ${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;COMPONENTS;TESTONLYLIBS" + ${ARGN}) +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} Kokkos::kokkoskernels) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED +) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS ${TESTONLYLIBS}) + ELSE() + ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + ENDIF() ENDIF() - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) +ELSE() + MESSAGE(STATUS "Skipping executable ${EXE_NAME} because not all necessary components enabled") ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( +FUNCTION(KOKKOSKERNELS_ADD_UNIT_TEST ROOT_NAME) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( ${ROOT_NAME} TESTONLYLIBS kokkoskernels_gtest ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi ) -ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES" - ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() ENDFUNCTION() -FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - ${ARGN} - NUM_MPI_PROCS 1 - COMM serial mpi - ) -ELSE() +FUNCTION(KOKKOSKERNELS_IS_ENABLED) CMAKE_PARSE_ARGUMENTS(PARSE "" - "" - "SOURCES;CATEGORIES" + "OUTPUT_VARIABLE" + "COMPONENTS" ${ARGN}) - VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${EXE_NAME} - ) -ENDIF() + + IF (KOKKOSKERNELS_ENABLED_COMPONENTS STREQUAL "ALL") + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ELSEIF(PARSE_COMPONENTS) + SET(ENABLED TRUE) + FOREACH(comp ${PARSE_COMPONENTS}) + STRING(TOUPPER ${comp} COMP_UC) + # make sure this is in the list of enabled components + IF(NOT "${COMP_UC}" IN_LIST KOKKOSKERNELS_ENABLED_COMPONENTS) + # if not in the list, one or more components is missing + SET(ENABLED FALSE) + ENDIF() + ENDFOREACH() + SET(${PARSE_OUTPUT_VARIABLE} ${ENABLED} PARENT_SCOPE) + ELSE() + # we did not enable all components and no components + # were given as part of this - we consider this enabled + SET(${PARSE_OUTPUT_VARIABLE} TRUE PARENT_SCOPE) + ENDIF() ENDFUNCTION() -MACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE EXE_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE 
+FUNCTION(KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + +CMAKE_PARSE_ARGUMENTS(PARSE "" "" - "SOURCES" + "SOURCES;CATEGORIES;COMPONENTS;TESTONLYLIBS" ${ARGN}) -KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkoskernels_gtest - ${PARSE_UNPARSED_ARGUMENTS} +VERIFY_EMPTY(KOKKOSKERNELS_ADD_EXECUTABLE_AND_RUN_VERIFY ${PARSE_UNPARSED_ARGUMENTS}) + +KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${PARSE_COMPONENTS} + OUTPUT_VARIABLE IS_ENABLED ) -IF (NOT KOKKOSKERNELS_HAS_TRILINOS) - TARGET_LINK_LIBRARIES(${EXE_NAME} kokkoskernels_gtest) + +IF (IS_ENABLED) + IF (KOKKOSKERNELS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + CATEGORIES ${PARSE_CATEGORIES} + TESTONLYLIBS ${PARSE_TESTONLYLIBS} + NUM_MPI_PROCS 1 + COMM serial mpi + ) + ELSE() + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + KOKKOSKERNELS_ADD_EXECUTABLE(${EXE_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + ENDIF() + KOKKOSKERNELS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${EXE_NAME} + ) + ENDIF() +ELSE() + MESSAGE(STATUS "Skipping executable/test ${ROOT_NAME} because not all necessary components enabled") ENDIF() -ADD_DEPENDENCIES(check ${EXE_NAME}) -ENDMACRO(KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE) + +ENDFUNCTION() + +MACRO(ADD_COMPONENT_SUBDIRECTORY SUBDIR) + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${SUBDIR} + OUTPUT_VARIABLE COMP_SUBDIR_ENABLED + ) + IF (COMP_SUBDIR_ENABLED) + ADD_SUBDIRECTORY(${SUBDIR}) + ELSE() + MESSAGE(STATUS "Skipping subdirectory ${SUBDIR} because component is not enabled") + ENDIF() + UNSET(COMP_SUBDIR_ENABLED) +ENDMACRO() diff --git a/example/wiki/graph/CMakeLists.txt b/example/wiki/graph/CMakeLists.txt index a8ddec070d..b271038d91 100644 --- a/example/wiki/graph/CMakeLists.txt +++ b/example/wiki/graph/CMakeLists.txt @@ -8,3 +8,18 @@ KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( SOURCES KokkosGraph_wiki_coloring.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_mis2 + SOURCES KokkosGraph_wiki_mis2.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_coarsening + SOURCES KokkosGraph_wiki_coarsening.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_rcm + SOURCES KokkosGraph_wiki_rcm.cpp + ) + diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp new file mode 100644 index 0000000000..93e5660c07 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -0,0 +1,140 @@ +#ifndef WIKI_9PT_STENCIL_H +#define WIKI_9PT_STENCIL_H + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_default_types.hpp" +#include "KokkosKernels_Handle.hpp" +#include +#include +#include +#include +#include + +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using DeviceSpace = typename ExecSpace::memory_space; +using Kokkos::HostSpace; +using RowmapType = Kokkos::View; +using ColindsType = Kokkos::View; +using Handle = KokkosKernels::Experimental:: + KokkosKernelsHandle; + +namespace GraphDemo +{ + Ordinal gridX = 15; + Ordinal gridY = 25; + Ordinal numVertices = gridX * gridY; + + void setGridDimensions(Ordinal newX, Ordinal newY) + { + gridX = newX; + gridY = newY; + numVertices = gridX * gridY; + } + + //Helper to get the vertex ID given grid coordinates + Ordinal getVertexID(Ordinal x, Ordinal y) + { + return y * gridX + x; + } + + //Inverse of getVertexID + void getVertexPos(Ordinal vert, Ordinal& 
x, Ordinal& y) + { + x = vert % gridX; + y = vert / gridX; + } + + //Helper to print out colors in the shape of the grid + template + void printColoring(ColorView colors, Ordinal numColors) + { + //Read colors on host + auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); + int numDigits = ceil(log10(numColors + 1)); + //Print out the grid, with columns aligned and at least one space between numbers + std::ostringstream numFmtStream; + numFmtStream << '%' << numDigits + 1 << 'd'; + std::string numFmt = numFmtStream.str(); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + int color = colorsHost(vertex); + printf(numFmt.c_str(), color); + } + putchar('\n'); + } + } + + template + void printMIS(MISView misList) + { + //Read colors on host + auto misHost = Kokkos::create_mirror_view_and_copy(HostSpace(), misList); + std::set mis; + for(Offset i = 0; i < (Offset) misList.extent(0); i++) + mis.insert(misHost(i)); + for(Ordinal y = 0; y < gridY; y++) + { + for(Ordinal x = 0; x < gridX; x++) + { + Ordinal vertex = getVertexID(x, y); + if(mis.find(vertex) == mis.end()) + printf(". "); + else + printf("# "); + } + putchar('\n'); + } + } + + //Build the graph on host, allocate these views on device and copy the graph to them. + //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. + void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) + { + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector colinds; + rowmap[0] = 0; + for(Ordinal vert = 0; vert < numVertices; vert++) + { + Ordinal x, y; + getVertexPos(vert, x, y); + //Loop over the neighbors in a 3x3 region + for(Ordinal ny = y - 1; ny <= y + 1; ny++) + { + for(Ordinal nx = x - 1; nx <= x + 1; nx++) + { + //exclude the edge to self + if(nx == x && ny == y) + continue; + //exclude vertices that would be outside the grid + if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) + continue; + //add the neighbor to colinds, forming an edge + colinds.push_back(getVertexID(nx, ny)); + } + } + //mark where the current row ends + rowmap[vert + 1] = colinds.size(); + } + Offset numEdges = colinds.size(); + //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> colindsHost(colinds.data(), numEdges); + //Allocate owning views on device with the correct size. + rowmapDevice = RowmapType("Rowmap", numVertices + 1); + colindsDevice = ColindsType("Colinds", numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapDevice, rowmapHost); + Kokkos::deep_copy(colindsDevice, colindsHost); + } +} + +#endif diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp new file mode 100644 index 0000000000..dded3fd258 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -0,0 +1,28 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run MIS-2 based coarsening and print the result + { + std::cout << "Coarsened vertex labels:\n"; + Ordinal numClusters = 0; + auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( + rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_FAST); + //coarsening labels can be printed in the same way as colors + GraphDemo::printColoring(labels, numClusters); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index 7e561f5883..56639dad3a 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -1,10 +1,4 @@ -#include -#include -#include -#include -#include "Kokkos_Core.hpp" -#include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_Handle.hpp" +#include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosGraph_Distance2Color.hpp" @@ -17,114 +11,16 @@ // -Different constraint: two vertices separated by a path of length 1 OR 2 // must have different colors) -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; -using ExecSpace = Kokkos::DefaultExecutionSpace; -using DeviceSpace = typename ExecSpace::memory_space; -using Kokkos::HostSpace; -using RowmapType = Kokkos::View; -using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental:: - KokkosKernelsHandle; - -namespace ColoringDemo -{ - constexpr Ordinal gridX = 15; - constexpr Ordinal gridY = 25; - constexpr Ordinal numVertices = gridX * gridY; - - //Helper to get the vertex ID given grid coordinates - Ordinal getVertexID(Ordinal x, Ordinal y) - { - return y * gridX + x; - } - - //Inverse of getVertexID - void getVertexPos(Ordinal vert, Ordinal& x, Ordinal& y) - { - x = vert % gridX; - y = vert / gridX; - } - - //Helper to print out colors in the shape of the grid - template - void printColoring(ColorView colors, Ordinal numColors) - { - //Read colors on host - auto colorsHost = Kokkos::create_mirror_view_and_copy(HostSpace(), colors); - int numDigits = ceil(log10(numColors + 1)); - //Print out the grid, with columns aligned and at least one space between numbers - std::ostringstream numFmtStream; - numFmtStream << '%' << numDigits + 1 << 'd'; - std::string numFmt = numFmtStream.str(); - for(Ordinal y = 0; y < gridY; y++) - { - for(Ordinal x = 0; x < gridX; x++) - { - Ordinal vertex = getVertexID(x, y); - int color = colorsHost(vertex); - printf(numFmt.c_str(), color); - } - putchar('\n'); - } - } - - //Build the graph on host, allocate these views on device and copy the graph to them. - //Both rowmapDevice and colindsDevice are output parameters and should default-initialized (empty) on input. 
- void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) - { - //Generate the graph on host (use std::vector to not need to know - //how many entries ahead of time) - std::vector rowmap(numVertices + 1); - std::vector colinds; - rowmap[0] = 0; - for(Ordinal vert = 0; vert < numVertices; vert++) - { - Ordinal x, y; - getVertexPos(vert, x, y); - //Loop over the neighbors in a 3x3 region - for(Ordinal ny = y - 1; ny <= y + 1; ny++) - { - for(Ordinal nx = x - 1; nx <= x + 1; nx++) - { - //exclude the edge to self - if(nx == x && ny == y) - continue; - //exclude vertices that would be outside the grid - if(nx < 0 || nx >= gridX || ny < 0 || ny >= gridY) - continue; - //add the neighbor to colinds, forming an edge - colinds.push_back(getVertexID(nx, ny)); - } - } - //mark where the current row ends - rowmap[vert + 1] = colinds.size(); - } - Offset numEdges = colinds.size(); - //Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in device memory - //The nonowning host views just alias the std::vectors. - Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> colindsHost(colinds.data(), numEdges); - //Allocate owning views on device with the correct size. - rowmapDevice = RowmapType("Rowmap", numVertices + 1); - colindsDevice = ColindsType("Colinds", numEdges); - //Copy the graph from host to device - Kokkos::deep_copy(rowmapDevice, rowmapHost); - Kokkos::deep_copy(colindsDevice, colindsHost); - } -} - int main(int argc, char* argv[]) { Kokkos::initialize(); { - using ColoringDemo::numVertices; + using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; //Step 1: Generate the graph on host, allocate space on device, and copy. //See function "generate9pt" below. - ColoringDemo::generate9pt(rowmapDevice, colindsDevice); + GraphDemo::generate9pt(rowmapDevice, colindsDevice); //Step 2: Create handle and run distance-1 coloring. { Handle handle; @@ -136,7 +32,7 @@ int main(int argc, char* argv[]) auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-1 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_graph_coloring_handle(); @@ -152,7 +48,7 @@ int main(int argc, char* argv[]) auto colors = handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_distance2_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-2 Colors (used %d):\n", (int) numColors); - ColoringDemo::printColoring(colors, numColors); + GraphDemo::printColoring(colors, numColors); putchar('\n'); //Clean up handle.destroy_distance2_graph_coloring_handle(); diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp new file mode 100644 index 0000000000..416164981b --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -0,0 +1,34 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_MIS2.hpp" + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run distance-2 MIS and print the results, with three different algorithms + { + //Run coloring + auto misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); + std::cout << "Distance-2 MIS, FAST algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + misDevice = KokkosGraph::Experimental::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); + std::cout << "Distance-2 MIS, QUALITY algorithm: contains " + << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + GraphDemo::printMIS(misDevice); + putchar('\n'); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp new file mode 100644 index 0000000000..31073954a4 --- /dev/null +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ -0,0 +1,68 @@ +#include "KokkosGraph_wiki_9pt_stencil.hpp" +#include "KokkosGraph_RCM.hpp" + +template +void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, const labels_t& invPermIn) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); + auto invPerm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); + lno_t numVerts = rowmap.extent(0) - 1; + decltype(invPerm) perm(Kokkos::ViewAllocateWithoutInitializing("Perm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + perm(invPerm(i)) = i; + std::vector neighbors; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + neighbors.clear(); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + neighbors.push_back(nei); + } + std::sort(neighbors.begin(), neighbors.end()); + size_t it = 0; + for(lno_t j = 0; j < numVerts; j++) + { + if(it < neighbors.size() && j == neighbors[it]) + { + std::cout << '*'; + it++; + } + else + std::cout << ' '; + } + std::cout << '\n'; + } + std::cout << '\n'; +} + + +int main(int argc, char* argv[]) +{ + Kokkos::initialize(); + { + using GraphDemo::numVertices; + GraphDemo::setGridDimensions(6, 6); + RowmapType rowmapDevice; + ColindsType colindsDevice; + //Make the graph smaller so the matrix can be printed easily + //Step 1: Generate the graph on host, allocate space on device, and copy. + //See function "generate9pt" below. 
+ GraphDemo::generate9pt(rowmapDevice, colindsDevice); + //Step 2: Run RCM and print the reordered matrix + { + auto rcmDevice = KokkosGraph::Experimental::graph_rcm( + rowmapDevice, colindsDevice); + std::cout << "Graph reordered by reverse Cuthill-McKee:\n"; + printReorderedMatrix(rowmapDevice, colindsDevice, rcmDevice); + } + } + Kokkos::finalize(); + return 0; +} + diff --git a/master_history.txt b/master_history.txt index 85a5174166..aa7c3dbe54 100644 --- a/master_history.txt +++ b/master_history.txt @@ -10,3 +10,4 @@ tag: 3.0.00 date: 01/31/2020 master: d86db111 release-candidate-3.0: cf2 tag: 3.1.00 date: 04/14/2020 master: f199f45d develop: 8d063eae tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 +tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 2ec1ff57c8..fe3b3c51ba 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -10,11 +10,10 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. -ADD_SUBDIRECTORY(batched) -ADD_SUBDIRECTORY(graph) -ADD_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(batched) +ADD_COMPONENT_SUBDIRECTORY(graph) +ADD_COMPONENT_SUBDIRECTORY(sparse) +ADD_COMPONENT_SUBDIRECTORY(blas) ADD_SUBDIRECTORY(performance) -ADD_SUBDIRECTORY(blas/blas3) -ADD_SUBDIRECTORY(blas/blas) #ADD_SUBDIRECTORY(common) diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt index b9613c7802..36435ecfc1 100644 --- a/perf_test/batched/CMakeLists.txt +++ b/perf_test/batched/CMakeLists.txt @@ -1,5 +1,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag + SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp +) +KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi + SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp +) diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f37c2d1b6f..ac8abb18f7 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -51,7 +51,7 @@ using namespace KokkosBatched; int main (int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; const bool detail = false; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index adff41c48b..2fffa06855 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git 
a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 7bb2a2907c..031909d540 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 8468800ee6..56ade7a446 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; const int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7b39c624f2..7d352283c6 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int N = 128*128; for (int i=1;i struct parallel_blas_trtri { trtri_args_t trtri_args_; @@ -227,11 +227,11 @@ struct parallel_blas_trtri { KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP template void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -254,9 +254,9 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is defined." 
<< std::endl; __trtri_output_csv_row(options, trtri_args, -1); -#endif // !KOKKOS_ENABLE_CUDA +#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP return; } diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index a46d4a7712..c1e3a117fa 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -2,4 +2,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( - KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp) + KokkosBlas3_perf_test + SOURCES KokkosBlas3_perf_test.cpp +) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 8374c4502d..4952a8e606 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -56,8 +56,11 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 1024 #define DEFAULT_OUT &std::cout -#define DEFAULT_BLAS_ROUTINES "trmm," +#define DEFAULT_BLAS_ROUTINES "trmm,gemm," +#define DEFAULT_TEAM_SIZE 1 +#define DEFAULT_VECTOR_LEN 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -66,29 +69,40 @@ struct perf_test_trmm_args { }; typedef struct perf_test_trmm_args pt_trmm_args_t; +struct perf_test_gemm_args { + std::string gemm_args; //[N,T,C][N,T,C] for transA and transB + default_scalar alpha; + default_scalar beta; +}; +typedef struct perf_test_gemm_args pt_gemm_args_t; // ADD MORE BLAS3 ROUTINE ARG STRUCTS HERE. struct blas_args { pt_trmm_args_t trmm; + pt_gemm_args_t gemm; // ADD MORE BLAS3 ROUTINES HERE + int team_size; + int vector_len; + // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; typedef enum BLAS_ROUTINES { TRMM, + GEMM, // ADD MORE BLAS3 ROUTINES HERE BLAS_ROUTINES_N } blas_routines_e; static std::string blas_routines_e_str[BLAS_ROUTINES_N] = { - "trmm" + "trmm", "gemm" // ADD MORE BLAS3 ROUTINES HERE }; /************************ perf test type definitions ************************/ /** - * @var SERIAL: Run the blas routine iterativley, within a for-loop - * @var PARALLEL: Run the blas routine iterativley, within a + * @var SERIAL: Run the blas routine iteratively, within a for-loop + * @var PARALLEL: Run the blas routine iteratively, within a * Kokkos::parallel_for-loop */ typedef enum LOOP { @@ -98,27 +112,47 @@ typedef enum LOOP { LOOP_N } loop_e; -static std::string loop_e_str[LOOP_N] = {"SERIAL", "PARALLEL"}; +static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** - * @var BLAS: Run the blas routine through the KokkosBlas namespace. - * @var BATCHED: Run the blas routine through the KokkosBatched namespace. + * @var BLAS: Run the blas routine through the + * KokkosBlas namespace. + * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the + * KokkosBatched namespace. + * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace. + * @var EXPERIMENT: Run the blas routine as a custom + * experiment. 
*/ typedef enum TEST { BLAS, - BATCHED, + BATCHED_SERIAL, + BATCHED_SERIAL_BLOCKED, + BATCHED_TEAM, + BATCHED_TEAM_BLOCKED, + BATCHED_TEAM_VECTOR, + BATCHED_TEAM_VECTOR_BLOCKED, // ADD MORE TEST TYPES HERE + EXPERIMENT, TEST_N } test_e; -static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; +static std::string test_e_str[TEST_N]{ + "blas", "batched_serial", "batched_serial_blocked", "batched_team", + "batched_team_blocked", "batched_team_vector", + "batched_team_vector_blocked", + // ADD MORE TEST TYPES HERE + "experiment"}; /** + * @var k: Number of 2D matrices. * @var m: Number of rows. * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; @@ -157,4 +191,14 @@ struct perf_test_options { std::string blas_routines; }; typedef struct perf_test_options options_t; + +/*************************** Print macros **************************/ +//#define PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG +#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); +#else +#define STATUS +#endif // PERF_TEST_DEBUG +#define FATAL_ERROR(msg) \ + printf("FATAL_ERROR: %s:%s:%d %s\n", __FILE__, __func__, __LINE__, (msg)); #endif // KOKKOSBLAS3_COMMON_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp new file mode 100644 index 0000000000..f26fbb7287 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -0,0 +1,1015 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSBLAS3_GEMM_PERF_TEST_H_ +#define KOKKOSBLAS3_GEMM_PERF_TEST_H_ + +//#include +#include "KokkosBlas3_common.hpp" + +#include + +#include + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +//#include "KokkosBatched_Gemm_Team_Impl.hpp" +//#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" +#include "KokkosBatched_Util.hpp" + +//#define GEMM_PERF_TEST_DEBUG + +// Forward declarations +void do_gemm_serial_blas(options_t options); +void do_gemm_serial_batched(options_t options); +void do_gemm_serial_batched_blocked(options_t options); +// void do_gemm_experiment(options_t options); + +// void do_gemm_serial_blas_parallel(options_t options); +// Not valid! The KokkosBlas::gemm function may take the entire device per +// invocation! +void do_gemm_serial_batched_parallel(options_t options); +void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_team_batched_parallel(options_t options); +void do_gemm_team_batched_blocked_parallel(options_t options); +void do_gemm_team_vector_batched_parallel(options_t options); +void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_experiment_parallel(options_t options); + +struct SerialTag {}; +struct TeamTag {}; +struct TeamVectorTag {}; +struct LayoutLeftTag {}; +struct LayoutRightTag {}; +struct SimdCpuTag {}; + +// gemm invoke table +void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { + { + do_gemm_serial_blas, // BLAS + do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial + NULL, NULL, // Team + NULL, NULL, // TeamVector + NULL // Serial Experiment + }, + { + NULL, // BLAS + do_gemm_serial_batched_parallel, + do_gemm_serial_batched_blocked_parallel, // Serial + do_gemm_team_batched_parallel, + do_gemm_team_batched_blocked_parallel, // Team + do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_experiment_parallel // Parallel Experiment + }}; + +/*************************** Test types and defaults **************************/ +#define DEFAULT_GEMM_ARGS "NN" +#define DEFAULT_GEMM_ALPHA 1.0 + +using view_type_3d = + Kokkos::View; + +struct batched_params { + int team_size; + int vector_len; +}; +typedef struct batched_params batched_params_t; + +struct gemm_args { + char transA, transB; + default_scalar alpha; + default_scalar beta; + view_type_3d A, B, C; + batched_params_t bp; +}; +typedef struct gemm_args gemm_args_t; + +static std::string gemm_csv_header_str = + "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "dims,C_dims,warm_up_n," + "iter,total_time(s),average_time(s)"; + +/*************************** Internal helper fns **************************/ +static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, + double time_in_seconds, + const char *experiment_name = nullptr) { + std::string algo_name = test_e_str[options.test]; + if (experiment_name) algo_name = std::string(experiment_name); + + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," + << options.blas_args.gemm.alpha << "," + << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size + << "," << gemm_args.bp.vector_len << "," + << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) + << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) + << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) + << "x" << 
gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) + << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) + << "," << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n + << std::endl; +} + +static void __print_gemm_perf_test_options(options_t options) { +#ifdef PERF_TEST_DEBUG + printf("options.test = %s\n", test_e_str[options.test].c_str()); + printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); + printf("options.start = %dx%d,%dx%d\n", options.start.a.m, + options.start.a.n, options.start.b.m, options.start.b.n); + printf("options.stop = %dx%d,%dx%d\n", options.stop.a.m, + options.stop.a.n, options.stop.b.m, options.stop.b.n); + printf("options.step = %d\n", options.step); + printf("options.warm_up_n = %d\n", options.warm_up_n); + printf("options.n = %d\n", options.n); + printf("options.blas_args.gemm.gemm_args = %s\n", + options.blas_args.gemm.gemm_args.c_str()); + printf("options.out_file = %s\n", options.out_file.c_str()); + if (std::is_same::value) + printf("options.alpha = %lf\n", options.blas_args.gemm.alpha); + else if (std::is_same::value) + printf("options.alpha = %f\n", options.blas_args.gemm.alpha); +#endif // PERF_TEST_DEBUG + return; +} + +/*************************** Internal templated fns **************************/ +template +void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + STATUS; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, + A, B, _gemm_args.beta, C); + } + }; + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +void __do_gemm_serial_batched_template(options_t options, + gemm_args_t gemm_args) { +// Need to take subviews on the device +#if !defined(KOKKOS_ENABLE_CUDA) + Kokkos::Timer timer; + + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + for (uint32_t i = 0; i < n; ++i) { + auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } + }; + + __do_loop(options.warm_up_n, gemm_args); + Kokkos::fence(); + + timer.reset(); + __do_loop(options.n, gemm_args); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
<< std::endl; +#endif // !KOKKOS_ENABLE_CUDA +} + +template +void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_serial_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'T' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_serial_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_serial_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } + return; +} + +#if !defined(KOKKOS_ENABLE_CUDA) +template +struct parallel_blas_gemm { + gemm_args_t gemm_args_; + + parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, + svA, svB, gemm_args_.beta, svC); + } +}; +#endif // !KOKKOS_ENABLE_CUDA + +template +void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + Kokkos::Timer timer; + using execution_space = typename device_type::execution_space; + using functor_type = parallel_blas_gemm; + functor_type parallel_blas_gemm_functor(gemm_args); + + STATUS; + + Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", + Kokkos::RangePolicy(0, warm_up_n), + parallel_blas_gemm_functor); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for("parallelBlasTimedLoopGemm", + Kokkos::RangePolicy(0, n), + parallel_blas_gemm_functor); + Kokkos::fence(); + __gemm_output_csv_row(options, gemm_args, timer.seconds()); +#else + std::cerr << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA is defined." 
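The serial-batched driver above turns the runtime `transA`/`transB` characters into compile-time `Trans::*` template arguments through an if/else ladder, so each supported combination instantiates its own templated helper. A minimal sketch of that runtime-to-compile-time dispatch, with hypothetical tag types standing in for the KokkosBatched `Trans` tags:

```c++
#include <cstdio>
#include <stdexcept>
#include <type_traits>

// Hypothetical stand-ins for the KokkosBatched::Trans tag types.
struct NoTranspose {};
struct Transpose {};

template <class TransA, class TransB>
void run_gemm() {
  // In the perf test this is where the templated helper would call
  // SerialGemm<TransA, TransB, ...>::invoke on each batch entry.
  std::printf("instantiated for %c%c\n",
              std::is_same<TransA, NoTranspose>::value ? 'N' : 'T',
              std::is_same<TransB, NoTranspose>::value ? 'N' : 'T');
}

// Runtime characters select a compile-time instantiation.
void dispatch(char a, char b) {
  if (a == 'N' && b == 'N')      run_gemm<NoTranspose, NoTranspose>();
  else if (a == 'N' && b == 'T') run_gemm<NoTranspose, Transpose>();
  else if (a == 'T' && b == 'N') run_gemm<Transpose, NoTranspose>();
  else if (a == 'T' && b == 'T') run_gemm<Transpose, Transpose>();
  else throw std::invalid_argument("Bad TransA or TransB value");
}

int main() {
  dispatch('N', 'T');
  return 0;
}
```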
<< std::endl; + __gemm_output_csv_row(options, gemm_args, -1); +#endif // !KOKKOS_ENABLE_CUDA + return; +} + +template +struct parallel_batched_gemm { + gemm_args_t gemm_args_; + + parallel_batched_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } +}; + +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + +template +void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { + char a = gemm_args.transA; + char b = gemm_args.transB; + using N = Trans::NoTranspose; + using T = Trans::Transpose; + // using C = Trans::ConjTranspose; + + STATUS; + + if (a == 'N' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'N' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'N' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'N') { + __do_gemm_parallel_batched_template(options, gemm_args); + } else if (a == 'T' && b == 'T') { + __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a 
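The `parallel_batched_gemm` functor above relies on Kokkos work-tag dispatch: a single functor provides one `operator()` per tag (SerialTag, TeamTag, TeamVectorTag), the tag baked into the `Kokkos::TeamPolicy` type selects which overload the runtime calls, and `league_rank()` serves as the batch index. A self-contained illustration of the pattern with hypothetical tags and a trivial body that records which overload ran, not the perf-test functor itself:

```c++
#include <Kokkos_Core.hpp>
#include <cstdio>

struct FlavorA {};
struct FlavorB {};

// One functor, one operator() per work tag; the tag in the TeamPolicy type
// decides which overload Kokkos calls. league_rank() is the batch index.
struct TaggedFunctor {
  using member_type = Kokkos::TeamPolicy<>::member_type;
  Kokkos::View<int*> out;

  KOKKOS_INLINE_FUNCTION
  void operator()(const FlavorA&, const member_type& member) const {
    if (member.team_rank() == 0) out(member.league_rank()) = 1;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const FlavorB&, const member_type& member) const {
    if (member.team_rank() == 0) out(member.league_rank()) = 2;
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int league_size = 4, team_size = 1;
    Kokkos::View<int*> out("out", league_size);
    TaggedFunctor f{out};

    Kokkos::parallel_for("flavorA",
        Kokkos::TeamPolicy<FlavorA>(league_size, team_size), f);
    Kokkos::fence();
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), out);
    std::printf("FlavorA wrote %d for batch 0\n", h(0));

    Kokkos::parallel_for("flavorB",
        Kokkos::TeamPolicy<FlavorB>(league_size, team_size), f);
    Kokkos::fence();
    Kokkos::deep_copy(h, out);
    std::printf("FlavorB wrote %d for batch 0\n", h(0));
  }
  Kokkos::finalize();
  return 0;
}
```

Keeping all flavors in one functor is what lets the timed launch loop stay identical across the Serial, Team, and TeamVector variants; only the policy's tag changes.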
== 'T' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'N') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'T') { + // __do_gemm_parallel_batched_template(options, gemm_args); + //} else if (a == 'C' && b == 'C') { + // __do_gemm_parallel_batched_template(options, gemm_args); + } else { + FATAL_ERROR("Bad gemm_args TransA or TransB value"); + } + + return; +} + +template +struct parallel_batched_gemm_experiment1 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment1(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + + void operator()(const SerialTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } +}; + +/** + * 1. parallel_for(rangePolicy(N)): serialGemm + * + */ +template +void __do_gemm_parallel_experiment1(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + using functor_type = + parallel_batched_gemm_experiment1; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment1_functor(gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment1Gemm", + policy_type(0, k), experiment1_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment1"); + return; +} + +template +struct parallel_batched_gemm_experiment2_3_4 { + gemm_args_t gemm_args_; + + parallel_batched_gemm_experiment2_3_4(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} + + // Experiment 2 + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses TeamThreadRange over C-rows + // ThreadVectorRange over C-cols + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } + + // Experiment 3 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutLeftTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_cols = svC.extent(1); + // In a given team, for each vector lane, compute zero or more output + // columns of C depending on the index range 
+ Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_cols), [&](const int &lane_idx) { + auto svB_col = Kokkos::subview(svB, Kokkos::ALL(), lane_idx); + auto svC_col = Kokkos::subview(svC, Kokkos::ALL(), lane_idx); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, svA, + svB_col, + gemm_args_.beta, + svC_col); + }); + } + + // TODO: Why is this faster than the LayoutLeftTag operator above for both + // LayoutLeft and LayoutRight? Experiment 4 + KOKKOS_INLINE_FUNCTION + void operator()(const LayoutRightTag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.B, team_idx, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.C, team_idx, Kokkos::ALL(), Kokkos::ALL()); + + // TeamThreadRange: splits the index range over the threads of the team + // ThreadVectorRange: splits the index range over the vector lanes of the + // calling thread + + auto svC_rows = svC.extent(0); + // In a given team, for each vector lane, compute zero or more output rows + // of C depending on the index range + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, svC_rows), [&](const int &lane_idx) { + auto svA_row = Kokkos::subview(svA, lane_idx, Kokkos::ALL()); + auto svC_row = Kokkos::subview(svC, lane_idx, Kokkos::ALL()); + // TeamGemm Calls TeamThreadRange over M*N meaning the flat M*N array + // is split over all threads of the team + KokkosBatched::TeamGemm::invoke(member, + gemm_args_.alpha, + svA_row, svB, + gemm_args_.beta, + svC_row); + }); + } +}; + +/** + * 2. case a) + * parallel_for(teamPolicy): TeamVectorGemm + * + */ +template +void __do_gemm_parallel_experiment2(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment2_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment2Gemm", + policy_type(league_size, team_size, vector_len), + experiment2_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment2"); + return; +} + +/** + * 3. case b) + * parallel_for(teamPolicy): + * parallel_for(TeamThreadRange): + * VectorGemm + * + * VectorGemm has not been implemented! + * I think this experiment can be removed. TeamGemm calls TeamThreadRange + * internally! TeamVectorGemm calls both TeamThreadRange and ThreadVectorRange + * internally! 
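Experiments 3 and 4 above hand each vector lane one column (or row) of C via `Kokkos::ThreadVectorRange` and then run a team-level kernel on that slice. The nesting machinery itself, a `ThreadVectorRange` loop inside a team functor with `TeamThreadRange` on the outer level, is shown in this minimal sketch with trivial arithmetic in place of the gemm call and assumed sizes:

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int league_size = 8, nrows = 32, ncols = 16;  // assumed sizes
    Kokkos::View<double***> C("C", league_size, nrows, ncols);

    using policy_type = Kokkos::TeamPolicy<>;
    using member_type = policy_type::member_type;

    Kokkos::parallel_for(
        "nestedTeamVector", policy_type(league_size, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& member) {
          const int batch = member.league_rank();  // one team per batch entry
          // TeamThreadRange splits rows over the threads of the team;
          // ThreadVectorRange splits columns over each thread's vector lanes.
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(member, nrows), [&](const int row) {
                Kokkos::parallel_for(
                    Kokkos::ThreadVectorRange(member, ncols),
                    [&](const int col) { C(batch, row, col) = row + col; });
              });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
```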
+ */ +template +void __do_gemm_parallel_experiment3(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment3_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment3_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment3Gemm", + policy_type(league_size, team_size, vector_len), + experiment3_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment3"); + return; +} + +/** + * 4. case c) + * parallel_for(teamPolicy): + * parallel_for(ThreadVectorRange) + * TeamGemm + */ +template +void __do_gemm_parallel_experiment4(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + // using layout_tag = std::conditional::value, LayoutLeftTag, LayoutRightTag>::type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm_experiment2_3_4; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + STATUS; + + functor_type experiment4_functor(gemm_args); + + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment4Gemm", + policy_type(league_size, team_size, vector_len), + experiment4_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment4"); + return; +} + +template +class parallel_batched_gemm_experiment5 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment5(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const SimdCpuTag &, const int &i) const { + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::SerialGemm::invoke( + gemm_args.alpha, svA, svB, gemm_args.beta, svC); + } +}; + +/** + * 5. 
+ * parallel_for(RangePolicy(N/vl+(N%vl>0)>): + * serialGemm + * + * Not portable to GPU + */ +template +void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::RangePolicy; + + // Construct the SimdType + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + using simd_type = KokkosBatched::Vector, vl>; + using simd_view_type = + Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment5; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Increases each array size by sizeof(scalar_type) * (vl-1) bytes! + simd_view_type A("A", simd_batch_size, gemm_args.A.extent(0), + gemm_args.A.extent(1)); + simd_view_type B("B", simd_batch_size, gemm_args.B.extent(0), + gemm_args.B.extent(1)); + simd_view_type C("C", simd_batch_size, gemm_args.C.extent(0), + gemm_args.C.extent(1)); + + // uint64_t seed = Kokkos::Impl::clock_tic(); + // Kokkos::Random_XorShift64_Pool rand_pool(seed); + // Kokkos::fill_random(A, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, simd_type>::max()); + // Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, simd_type>::max()); + // execution_space::fence(); + + functor_type experiment5_functor(A, B, C, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment5Gemm", + policy_type(0, simd_batch_size), experiment5_functor); + } + Kokkos::fence(); + + timer.reset(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment5Gemm", + policy_type(0, simd_batch_size), experiment5_functor); + } + Kokkos::fence(); + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment5"); +#else + std::cerr + << std::string(__func__) + << " disabled since KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_HIP is defined." 
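Experiment 5 above packs `vl` matrices into one SIMD-valued matrix, so the batch dimension shrinks from `k` entries to `simd_batch_size = k / vl + (k % vl > 0)`, a ceiling division; for example `k = 10` with `vl = 4` gives 3 packed entries, the last only partially filled. A tiny sketch of that arithmetic in plain C++, without the SIMD vector types:

```c++
#include <cstdio>
#include <initializer_list>

// Ceiling division, written the same way as in the experiment-5 driver.
constexpr int packed_batch_size(int k, int vl) { return k / vl + (k % vl > 0); }

int main() {
  const int vl = 4;  // assumed vector length
  for (int k : {1, 4, 10, 16, 17}) {
    std::printf("k=%2d vl=%d -> simd_batch_size=%d\n", k, vl,
                packed_batch_size(k, vl));  // prints 1, 1, 3, 4, 5
  }
  return 0;
}
```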
+ << std::endl; +#endif // !KOKKOS_ENABLE_CUDA || !KOKKOS_ENABLE_HIP + return; +} + +/*************************** Internal setup fns **************************/ +template +gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { + using execution_space = typename device_type::execution_space; + + gemm_args_t gemm_args; + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + STATUS; + + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); + gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); + gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + gemm_args.alpha = options.blas_args.gemm.alpha; + gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.bp.team_size = options.blas_args.team_size; + gemm_args.bp.vector_len = options.blas_args.vector_len; + + Kokkos::fill_random(gemm_args.A, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.B, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.C, rand_pool, + Kokkos::rand, + scalar_type>::max()); + + return gemm_args; +} + +/*************************** Interal run helper fns **************************/ +void __do_loop_and_invoke(options_t options, + void (*fn)(options_t, gemm_args_t)) { + matrix_dims_t cur_dims; + gemm_args_t gemm_args; + STATUS; + + __print_gemm_perf_test_options(options); + std::cout << "SCALAR:" << typeid(default_scalar).name() + << ", LAYOUT:" << typeid(default_layout).name() + << ", DEVICE:" << typeid(default_device).name() << std::endl; + + options.out[0] << gemm_csv_header_str << std::endl; + + for (cur_dims = options.start; + cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && + cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && + cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; + cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, + cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, + cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + gemm_args = __do_setup(options, cur_dims); + fn(options, gemm_args); + } + return; +} + +/*************************** External fns **************************/ +void do_gemm_serial_blas(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_blas); + return; +} + +void do_gemm_serial_batched(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_blocked(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_serial_batched); + return; +} + +void do_gemm_serial_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_vector_batched_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +/* void 
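`__do_loop_and_invoke` above sweeps the matrix shapes geometrically: starting from the `--matrix_size_start` dimensions, every dimension is multiplied by `--matrix_size_step` until any dimension passes its stop value, and the selected driver runs once per shape. A small sketch of that sweep under assumed start, stop, and step values:

```c++
#include <cstdio>

struct dims { int m, n; };

int main() {
  // Assumed command-line values: start 16x16, stop 256x256, step 2.
  const dims start{16, 16}, stop{256, 256};
  const int step = 2;

  // Mirrors the loop in __do_loop_and_invoke: the benchmark runs at
  // 16x16, 32x32, 64x64, 128x128, and 256x256.
  for (dims cur = start; cur.m <= stop.m && cur.n <= stop.n;
       cur.m *= step, cur.n *= step) {
    std::printf("benchmark at %dx%d\n", cur.m, cur.n);
  }
  return 0;
}
```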
do_gemm_team_vector_batched_blocked_parallel(options_t options) { + STATUS; + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; +} */ + +void do_gemm_experiment_parallel(options_t options) { + STATUS; + using TransAType = Trans::NoTranspose; + using TransBType = Trans::NoTranspose; + using BlockingType = Algo::Gemm::Unblocked; + + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment1); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment2); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment3); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment4); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment5); +} + +#endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 122f0b3817..b493c244d8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -43,6 +43,7 @@ */ #include "KokkosBlas3_common.hpp" #include "KokkosBlas3_trmm_perf_test.hpp" +#include "KokkosBlas3_gemm_perf_test.hpp" #include #include @@ -61,6 +62,11 @@ static struct option long_options[] = { {"routines", required_argument, 0, 'r'}, {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, + {"gemm_options", required_argument, 0, 'g'}, + {"gemm_alpha", required_argument, 0, 'p'}, + {"team_size", required_argument, 0, 'z'}, + {"vector_len", required_argument, 0, 'n'}, + {"batch_size", required_argument, 0, 'k'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -72,14 +78,12 @@ static void __print_help_blas3_perf_test() { printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); printf("\t\t\tValid values for OPTION:\n"); - printf("%c[1m", 27); - printf("\t\t\t\tblas:"); - printf("%c[0m", 27); - printf(" invoke Kokkos::trmm the loop-body. (default)\n"); - printf("%c[1m", 27); - printf("\t\t\t\tbatched:"); - printf("%c[0m", 27); - printf(" invoke KokkosBatched::SerialTrmm in the loop-body.\n\n"); + for (int i = 0; i < TEST_N; i++) { + printf("%c[1m", 27); + printf("\t\t\t\t%s", test_e_str[i].c_str()); + printf("%c[0m", 27); + printf("\n"); + } printf("\t-o, --trmm_options=OPTION_STRING\n"); printf("\t\tTRMM side, uplo, trans, and diag options.\n"); @@ -93,6 +97,33 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", DEFAULT_TRMM_ALPHA); + printf("\t-g, --gemm_options=OPTION_STRING\n"); + printf("\t\tGEMM transA, and transB options.\n"); + printf( + "\t\t\tValid format for OPTION_STRING is \"%%c%%c\". (default: " + "%s)\n", + DEFAULT_GEMM_ARGS); + + printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); + printf("\t\tGEMM alpha value.\n"); + printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", + DEFAULT_GEMM_ALPHA); + + printf("\t-z, --team_size=SIZE\n"); + printf("\t\tKokkos team size.\n"); + printf("\t\t\tThe value of SIZE as an integer. (default: %d)\n", + DEFAULT_TEAM_SIZE); + + printf("\t-n, --vector_len=LEN\n"); + printf("\t\tKokkos vector length (Heirarchical parallelism).\n"); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", + DEFAULT_VECTOR_LEN); + + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_VECTOR_LEN); + printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); printf("\t\t\tValid values for OPTION:\n"); @@ -105,21 +136,25 @@ static void __print_help_blas3_perf_test() { printf("%c[0m", 27); printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); - printf("\t-b, --matrix_size_start=MxN,IxJ\n"); - printf("\t\tMatrix size selection where A is MxN and B is IxJ (start)\n"); + printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); + printf( + "\t\tMatrix size selection where A is MxN, B is IxJ, and C is PxQ " + "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, - DEFAULT_MATRIX_START); + DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); - printf("\t-e, --matrix_size_stop=PxQ,SxT\n"); - printf("\t\tMatrix size selection where A is PxQ and B is SxT (stop)\n"); + printf("\t-e, --matrix_size_stop=SxT,LxK,OxR\n"); + printf( + "\t\tMatrix size selection where A is SxT, B is LxK, and C is OxR " + "(stop)\n"); printf( - "\t\t\tValid values for P and Q are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d)\n\n", + "\t\t\tValid dimension values are any non-negative 32-bit integers. " + "(default: %dx%d,%dx%d,%dx%d)\n\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, - DEFAULT_MATRIX_STOP); + DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); printf("\t-s, --matrix_size_step=K\n"); printf("\t\tMatrix step selection.\n"); @@ -156,72 +191,106 @@ static void __print_help_blas3_perf_test() { DEFAULT_BLAS_ROUTINES); } -static void __blas3_perf_test_input_error(char **argv, int option_idx) { - fprintf(stderr, "ERROR: invalid option \"%s %s\".\n", argv[option_idx], - argv[option_idx + 1]); - __print_help_blas3_perf_test(); +static void __blas3_perf_test_input_error(char **argv, char short_opt, + char *getopt_optarg) { + fprintf(stderr, "ERROR: invalid option \"-%c %s\". 
Try --help.\n", short_opt, + getopt_optarg); exit(-EINVAL); } int main(int argc, char **argv) { options_t options; - int option_idx = 0, ret; - char *n_str = nullptr, *adim = nullptr, *bdim = nullptr; + int option_idx = 0, ret, i; + char *n_str = nullptr, *adim = nullptr, *bdim = nullptr, *cdim = nullptr; std::filebuf fb; - char *out_file = nullptr; + char *out_file = nullptr; + using rt_type = decltype(do_trmm_invoke); + rt_type *routine_table[BLAS_ROUTINES_N] = { + &do_trmm_invoke, &do_gemm_invoke + // ADD MORE BLAS3 ROUTINES HERE + }; /* set default options */ - options.test = DEFAULT_TEST; - options.loop = DEFAULT_LOOP; - options.start.a.m = DEFAULT_MATRIX_START; - options.start.a.n = DEFAULT_MATRIX_START; - options.stop.a.m = DEFAULT_MATRIX_STOP; - options.stop.a.n = DEFAULT_MATRIX_STOP; - options.start.b.m = DEFAULT_MATRIX_START; - options.start.b.n = DEFAULT_MATRIX_START; - options.stop.b.m = DEFAULT_MATRIX_STOP; - options.stop.b.n = DEFAULT_MATRIX_STOP; - options.step = DEFAULT_STEP; - options.warm_up_n = DEFAULT_WARM_UP_N; - options.n = DEFAULT_N; - options.out = DEFAULT_OUT; - options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.test = DEFAULT_TEST; + options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; + options.start.a.m = DEFAULT_MATRIX_START; + options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; + options.stop.a.m = DEFAULT_MATRIX_STOP; + options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; + options.start.b.m = DEFAULT_MATRIX_START; + options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; + options.stop.b.m = DEFAULT_MATRIX_STOP; + options.stop.b.n = DEFAULT_MATRIX_STOP; + options.start.c.k = DEFAULT_K; + options.start.c.m = DEFAULT_MATRIX_START; + options.start.c.n = DEFAULT_MATRIX_START; + options.stop.c.k = DEFAULT_K; + options.stop.c.m = DEFAULT_MATRIX_STOP; + options.stop.c.n = DEFAULT_MATRIX_STOP; + options.step = DEFAULT_STEP; + options.warm_up_n = DEFAULT_WARM_UP_N; + options.n = DEFAULT_N; + options.out = DEFAULT_OUT; + options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.blas_args.team_size = DEFAULT_TEAM_SIZE; + options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:", long_options, - &option_idx)) != -1) { + options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; + options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': - // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (!strncasecmp(optarg, "blas", 4)) { - options.test = BLAS; - } else if (!strncasecmp(optarg, "batched", 6)) { - options.test = BATCHED; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < TEST_N; i++) { + if (!test_e_str[i].compare(optarg)) { + options.test = (test_e)i; + break; + } + } + if (i == TEST_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'o': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); if (strlen(optarg) != 4) { - __blas3_perf_test_input_error(argv, option_idx); + __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.trmm.trmm_args = optarg; break; + case 'g': + // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); + if (strlen(optarg) != 3) { + __blas3_perf_test_input_error(argv, ret, optarg); + } + options.blas_args.gemm.gemm_args = optarg; + break; + case 'p': + // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); + options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); options.blas_args.trmm.alpha = (default_scalar)atof(optarg); break; case 'l': - if (!strncasecmp(optarg, "serial", 6)) { - options.loop = SERIAL; - } else if (!strncasecmp(optarg, "parallel", 8)) { - options.loop = PARALLEL; - } else { - __blas3_perf_test_input_error(argv, option_idx); + for (i = 0; i < LOOP_N; i++) { + if (!loop_e_str[i].compare(optarg)) { + options.loop = (loop_e)i; + break; + } + } + if (i == LOOP_N) { + __blas3_perf_test_input_error(argv, ret, optarg); } break; case 'b': @@ -229,51 +298,78 @@ int main(int argc, char **argv) { bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.a.m = atoi(adim); options.start.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.start.b.m = atoi(bdim); options.start.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + + n_str[0] = '\0'; + options.start.c.m = atoi(cdim); + options.start.c.n = atoi(&n_str[1]); break; case 'e': adim = optarg; bdim = strcasestr(optarg, ","); bdim[0] = '\0'; bdim = &bdim[1]; + cdim = strcasestr(bdim, ","); + cdim[0] = '\0'; + cdim = &cdim[1]; n_str = strcasestr(adim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.a.m = atoi(adim); options.stop.a.n = atoi(&n_str[1]); n_str = strcasestr(bdim, "x"); - if (n_str == NULL) __blas3_perf_test_input_error(argv, option_idx); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); n_str[0] = '\0'; options.stop.b.m = atoi(bdim); options.stop.b.n = atoi(&n_str[1]); + + n_str = strcasestr(cdim, "x"); + if (n_str == NULL) __blas3_perf_test_input_error(argv, ret, optarg); + + n_str[0] = '\0'; + options.stop.c.m = atoi(cdim); + options.stop.c.n = atoi(&n_str[1]); break; case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.start.b.k = options.start.c.k = + options.stop.a.k = options.stop.b.k = options.stop.c.k = + atoi(optarg); + break; + case 'z': options.blas_args.team_size = atoi(optarg); break; + case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); break; - case 'r': options.blas_routines = std::string(optarg); break; + case 'r': options.blas_routines = optarg; break; case '?': - default: __blas3_perf_test_input_error(argv, option_idx); + default: __blas3_perf_test_input_error(argv, ret, optarg); } } @@ -283,16 +379,35 @@ int main(int argc, char **argv) { options.out = &out; } - if 
(options.warm_up_n > options.n) - __blas3_perf_test_input_error(argv, option_idx); + if (options.warm_up_n > options.n) { + fprintf(stderr, "ERROR: warm_up_n=%d > n=%d. Try --help.\n", + options.warm_up_n, options.n); + exit(-EINVAL); + } Kokkos::initialize(argc, argv); - for (int i = 0; i < BLAS_ROUTINES_N; i++) { - if (options.blas_routines.find(blas_routines_e_str[TRMM]) != - std::string::npos) - do_trmm_invoke[options.loop][options.test](options); - // ADD MORE BLAS3 ROUTINES HERE + int err = 0; + for (i = 0; i < BLAS_ROUTINES_N; i++) { + if (options.blas_routines.find(blas_routines_e_str[i]) != + std::string::npos) { + std::cout << "Testing " << blas_routines_e_str[i] << "..." << std::endl; + + auto routine = routine_table[i]; + + if (!routine || !routine[0][options.loop][options.test]) { + std::cerr << "do_" << blas_routines_e_str[i] << "_invoke["; + err = 1; + break; + } + routine[0][options.loop][options.test](options); + } + } + + if (err) { + std::cerr << loop_e_str[options.loop] << "][" << test_e_str[options.test] + << "] not yet implemented!" << std::endl; + exit(-EINVAL); } if (out_file != nullptr) fb.close(); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index e2b62ef8eb..70f7664679 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -55,7 +55,7 @@ #include "KokkosBatched_Trmm_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" -//#define TRMM_PERF_TEST_DEBUG +//#define PERF_TEST_DEBUG // Forward declarations void do_trmm_serial_blas(options_t options); @@ -68,13 +68,6 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { {do_trmm_serial_blas, do_trmm_serial_batched}, {do_trmm_parallel_blas, do_trmm_parallel_batched}}; -/*************************** Print macros **************************/ -#ifdef TRMM_PERF_TEST_DEBUG -#define STATUS printf("STATUS: %s:%d.\n", __func__, __LINE__); -#else -#define STATUS -#endif // TRMM_PERF_TEST_DEBUG - /*************************** Test types and defaults **************************/ #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 @@ -106,7 +99,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, } static void __print_trmm_perf_test_options(options_t options) { -#ifdef TRMM_PERF_TEST_DEBUG +#ifdef PERF_TEST_DEBUG printf("options.test = %s\n", test_e_str[options.test].c_str()); printf("options.loop = %s\n", loop_e_str[options.loop].c_str()); printf("options.start = %dx%d,%dx%d\n", options.start.a.m, @@ -123,7 +116,7 @@ static void __print_trmm_perf_test_options(options_t options) { printf("options.alpha = %lf\n", options.blas_args.trmm.alpha); else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.trmm.alpha); -#endif // TRMM_PERF_TEST_DEBUG +#endif // PERF_TEST_DEBUG return; } @@ -146,6 +139,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { &trmm_args.diag, trmm_args.alpha, A, B); } + Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; ++i) { auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); @@ -292,7 +286,7 @@ void __do_trmm_serial_batched(options_t options, trmm_args_t trmm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) template struct parallel_blas_trmm { trmm_args_t trmm_args_; @@ -312,7 +306,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) 
{ -#if !defined(KOKKOS_ENABLE_CUDA) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; Kokkos::Timer timer; @@ -335,7 +329,9 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; + << " disabled since KOKKOS_ENABLE_CUDA and/or KOKKOS_ENABLE_HIP is " + "defined." + << std::endl; __trmm_output_csv_row(options, trmm_args, -1); #endif // !KOKKOS_ENABLE_CUDA return; diff --git a/perf_test/blas/blas3/README.md b/perf_test/blas/blas3/README.md index af718ee906..d150d61a32 100644 --- a/perf_test/blas/blas3/README.md +++ b/perf_test/blas/blas3/README.md @@ -19,8 +19,8 @@ void (*do_ROUTINE_invoke[LOOP_N][TEST_N])(options_t) = { }; ``` 3. Update the definitions in `KokkosBlas3_common.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. -4. Add a conditional to invoke the new routine via `do_ROUTINE_invoke` in - `KokkosBlas3_trmm_perf_test.hpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. +4. Add the `do_ROUTINE_invoke` table to the `routine_table` in + `KokkosBlas3_perf_test.cpp`, where the comment `//ADD MORE BLAS3 ROUTINES HERE` is. 5. Update the commandline argument processing in - `KokkosBlas3_trmm_perf_test.hpp` to specify how to run ROUTINE. -6. Append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. + `KokkosBlas3_perf_test.cpp` to specify how to run ROUTINE. +6. To run the new routine by default, append `ROUTINE,` to `#define DEFAULT_BLAS_ROUTINES` in `KokkosBlas3_common.hpp`. diff --git a/perf_test/graph/CMakeLists.txt b/perf_test/graph/CMakeLists.txt index bf7ae17082..134a7acc2e 100644 --- a/perf_test/graph/CMakeLists.txt +++ b/perf_test/graph/CMakeLists.txt @@ -11,6 +11,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosGraph_color_d2.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + graph_mis_d2 + SOURCES KokkosGraph_mis_d2.cpp + ) + #Below will probably fail on GPUs. 
#KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cbc3697517..a3fecb4c99 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -76,6 +76,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #if defined(KOKKOS_ENABLE_CUDA) << spaces << " --cuda Use CUDA (device $id)" << std::endl +#endif +#if defined(KOKKOS_ENABLE_HIP) + << spaces << " --hip Use HIP (device $id)" << std::endl #endif << std::endl << spaces << " Required Parameters:" << std::endl @@ -131,6 +134,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -212,7 +218,7 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -539,7 +545,7 @@ int main (int argc, char ** argv){ std::cout << "Sizeof(idx):" << sizeof(idx) << " sizeof(size_type):" << sizeof(size_type) << std::endl; const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads - const int device_id = 0; + const int device_id = std::max(params.use_cuda, params.use_hip) - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -579,6 +585,15 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_experiment + ( + params + ); + } +#endif + #if defined( KOKKOS_ENABLE_SERIAL ) if (params.use_serial) { #ifdef KOKKOSKERNELS_MULTI_MEM diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 970bafa380..04d977527d 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -81,6 +81,7 @@ struct D2Parameters int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; const char* mtx_file; ColoringMode d2_color_type; @@ -93,6 +94,7 @@ struct D2Parameters use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; mtx_file = NULL; d2_color_type = MODE_D2_SYMMETRIC; @@ -147,6 +149,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use given CUDA device" << std::endl +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use given HIP device" << std::endl #endif << std::endl << spaces << " Coloring modes:" << std::endl @@ -199,6 +204,10 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -273,7 +282,7 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) 
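In the coloring driver above, `--cuda N` and `--hip N` are stored as `1 + N`, so a value of zero still means "backend not requested" and the device id is recovered with `std::max(params.use_cuda, params.use_hip) - 1`. A tiny sketch of that encoding with assumed input:

```c++
#include <algorithm>
#include <cstdio>

int main() {
  // As if the user passed "--hip 2": the stored value is 1 + 2.
  int use_cuda = 0;      // not requested
  int use_hip  = 1 + 2;  // requested, device 2

  if (!use_cuda && !use_hip) {
    std::puts("no backend selected");
    return 1;
  }
  const int device_id = std::max(use_cuda, use_hip) - 1;
  std::printf("initialize on device %d\n", device_id);  // prints 2
  return 0;
}
```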
print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -603,6 +612,8 @@ int main(int argc, char *argv[]) int device_id = 0; if(params.use_cuda) device_id = params.use_cuda - 1; + else if(params.use_hip) + device_id = params.use_hip - 1; Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); // Print out verbose information about the configuration of the run. @@ -645,6 +656,16 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + if(!use_multi_mem) + { + KokkosKernels::Experiment::experiment_driver(params); + } + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp new file mode 100644 index 0000000000..32ff5f5fbd --- /dev/null +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -0,0 +1,397 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spadd.hpp" +#include "KokkosGraph_MIS2.hpp" +#include "KokkosKernels_default_types.hpp" + +using namespace KokkosGraph; + +struct MIS2Parameters +{ + int repeat = 1; + bool verbose = false; + int use_threads = 0; + int use_openmp = 0; + int use_cuda = 0; + int use_hip = 0; + int use_serial = 0; + const char* mtx_file = NULL; + MIS2_Algorithm algo = MIS2_FAST; +}; + +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. + bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} + +void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) +{ + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Required Parameters:" << std::endl + << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl + << std::endl + << spaces << " Device type (the following are enabled in this build):" << std::endl +#ifdef KOKKOS_ENABLE_SERIAL + << spaces << " --serial Execute serially." << std::endl +#endif +#ifdef KOKKOS_ENABLE_THREADS + << spaces << " --threads Use posix threads.\n" +#endif +#ifdef KOKKOS_ENABLE_OPENMP + << spaces << " --openmp Use OpenMP.\n" +#endif +#ifdef KOKKOS_ENABLE_CUDA + << spaces << " --cuda Use CUDA.\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use HIP.\n" +#endif + << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces << " --algo alg alg: fast, quality" << std::endl + << spaces << " --repeat Set number of test repetitions (Default: 1) " << std::endl + << spaces << " --verbose Enable verbose mode (record and print timing + extra information)" << std::endl + << spaces << " --help Print out command line help." 
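`verifyD2MIS` above checks both halves of the distance-2 MIS definition: a vertex outside the set must have some set member within two hops (maximality), and a vertex inside the set must not (independence). On the 5-vertex path graph 0-1-2-3-4, for example, the set {0, 3} passes both checks. A small host-only sketch of the same check on that path graph, using plain arrays in CRS form rather than the Kokkos views of the verifier:

```c++
#include <cstdio>
#include <set>
#include <vector>

int main() {
  // CRS adjacency of the undirected path 0-1-2-3-4.
  std::vector<int> rowmap  = {0, 1, 3, 5, 7, 8};
  std::vector<int> entries = {1, 0, 2, 1, 3, 2, 4, 3};
  const int numVerts = 5;

  std::set<int> mis = {0, 3};  // candidate distance-2 independent set

  bool ok = true;
  for (int i = 0; i < numVerts; i++) {
    // Is some other set member within two hops of i?
    bool misIn2Hops = false;
    for (int j = rowmap[i]; j < rowmap[i + 1] && !misIn2Hops; j++) {
      int nei1 = entries[j];
      if (nei1 != i && mis.count(nei1)) { misIn2Hops = true; break; }
      for (int k = rowmap[nei1]; k < rowmap[nei1 + 1]; k++) {
        int nei2 = entries[k];
        if (nei2 != i && mis.count(nei2)) { misIn2Hops = true; break; }
      }
    }
    // In-set vertex with a set member <= 2 hops away, or out-of-set vertex
    // with none: both violate the distance-2 MIS property.
    if (mis.count(i) ? misIn2Hops : !misIn2Hops) ok = false;
  }
  std::printf("{0,3} is %s distance-2 MIS of the path graph\n",
              ok ? "a valid" : "NOT a valid");
  return 0;
}
```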
<< std::endl + << spaces << " " << std::endl; +} + +static char* getNextArg(int& i, int argc, char** argv) +{ + i++; + if(i >= argc) + { + std::cerr << "Error: expected additional command-line argument!\n"; + exit(1); + } + return argv[i]; +} + +int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) +{ + bool got_required_param_amtx = false; + for(int i = 1; i < argc; ++i) + { + if(0 == strcasecmp(argv[i], "--threads")) + { + params.use_threads = 1; + } + else if(0 == strcasecmp(argv[i], "--serial")) + { + params.use_serial = 1; + } + else if(0 == strcasecmp(argv[i], "--openmp")) + { + params.use_openmp = 1; + } + else if(0 == strcasecmp(argv[i], "--cuda")) + { + params.use_cuda = 1; + } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1; + } + else if(0 == strcasecmp(argv[i], "--repeat")) + { + params.repeat = atoi(getNextArg(i, argc, argv)); + if(params.repeat <= 0) + { + std::cout << "*** Repeat count must be positive, defaulting to 1.\n"; + params.repeat = 1; + } + } + else if(0 == strcasecmp(argv[i], "--amtx")) + { + got_required_param_amtx = true; + params.mtx_file = getNextArg(i, argc, argv); + } + else if(0 == strcasecmp(argv[i], "--algo")) + { + const char* algName = getNextArg(i, argc, argv); + if(!strcasecmp(algName, "fast")) + params.algo = MIS2_FAST; + else if(!strcasecmp(algName, "quality")) + params.algo = MIS2_QUALITY; + else + throw std::invalid_argument("Algorithm not valid: must be 'fast' or 'quality'"); + } + else if(0 == strcasecmp(argv[i], "--verbose")) + { + params.verbose = true; + } + else if(0 == strcasecmp(argv[i], "--help") || 0 == strcasecmp(argv[i], "-h")) + { + print_options(std::cout, argv[0]); + return 1; + } + else + { + std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + } + + if(!got_required_param_amtx) + { + std::cout << "Missing required parameter amtx" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) + { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} + +template +void run_mis2(const MIS2Parameters& params) +{ + using size_type = default_size_type; + using lno_t = default_lno_t; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using lno_view_t = typename crsMat_t::index_type::non_const_type; + using KKH = KokkosKernels::Experimental::KokkosKernelsHandle; + + Kokkos::Timer t; + crsMat_t A_in = KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + std::cout << "I/O time: " << t.seconds() << " s\n"; + t.reset(); + //Symmetrize the matrix just in case + crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in); + crsMat_t A; + KKH kkh; + kkh.create_spadd_handle(false); + KokkosSparse::spadd_symbolic(&kkh, A_in, At_in, A); + KokkosSparse::spadd_numeric(&kkh, 1.0, A_in, 1.0, At_in, A); + kkh.destroy_spadd_handle(); + std::cout << "Time to symmetrize: " << t.seconds() << " s\n"; + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + lno_t numVerts = A.numRows(); + + std::cout << "Num verts: " << numVerts << '\n' + << "Num edges: " << A.nnz() << '\n'; + + lno_view_t mis; + + t.reset(); + for(int rep = 0; rep < params.repeat; rep++) + { + mis = KokkosGraph::Experimental::graph_d2_mis(rowmap, entries, params.algo); + exec_space().fence(); + } + 
double totalTime = t.seconds(); + std::cout << "MIS-2 average time: " << totalTime / params.repeat << '\n'; + std::cout << "MIS size: " << mis.extent(0) << '\n'; + + if(params.verbose) + { + std::cout << "Vertices in independent set:\n"; + KokkosKernels::Impl::print_1Dview(mis); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + if(verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost)) + std::cout << "MIS-2 is correct.\n"; + else + std::cout << "*** MIS-2 not correct! ***\n"; + } +} + +int main(int argc, char *argv[]) +{ + MIS2Parameters params; + + if(parse_inputs(params, argc, argv)) + { + return 1; + } + + if(params.mtx_file == NULL) + { + std::cerr << "Provide a matrix file" << std::endl; + return 0; + } + + Kokkos::initialize(); + + bool run = false; + + #if defined(KOKKOS_ENABLE_OPENMP) + if(params.use_openmp) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_THREADS) + if(params.use_threads) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_CUDA) + if(params.use_cuda) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + run_mis2(params); + run = true; + } + #endif + + #if defined(KOKKOS_ENABLE_SERIAL) + if(params.use_serial) + { + run_mis2(params); + run = true; + } + #endif + + if(!run) + { + std::cerr << "*** ERROR: did not run, none of the supported device types were selected.\n"; + } + + Kokkos::finalize(); + + return 0; +} diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 6f0b6c73df..63a52dbaea 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -54,7 +54,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda" << std::endl; + std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda | --hip" << std::endl; std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; std::cerr << "\tInput Matrix format can be multiple formats. If it ends with:" << std::endl; std::cerr << "\t\t.mtx: it will read matrix market format." 
<< std::endl; @@ -96,6 +96,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi( argv[++i] ); } @@ -292,7 +295,6 @@ int main (int argc, char ** argv){ const int device_id = 0; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); -#if !defined (KOKKOS_ENABLE_CUDA) #if defined( KOKKOS_ENABLE_OPENMP ) if (params.use_openmp) { @@ -311,10 +313,9 @@ int main (int argc, char ** argv){ } #endif -#endif -#if defined( KOKKOS_ENABLE_CUDA1 ) +#if defined( KOKKOS_ENABLE_CUDA ) if (params.use_cuda) { Kokkos::Cuda::print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM @@ -332,6 +333,16 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + Kokkos::Experimental::HIP::print_configuration(std::cout); + KokkosKernels::Experiment::run_multi_mem_triangle + ( + params + ); + } +#endif + Kokkos::finalize(); return 0; diff --git a/perf_test/performance/CMakeLists.txt b/perf_test/performance/CMakeLists.txt index 09593b3128..93d377ba60 100644 --- a/perf_test/performance/CMakeLists.txt +++ b/perf_test/performance/CMakeLists.txt @@ -11,12 +11,12 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) IF(TPL_ENABLE_yaml-cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_validate SOURCES performance_validate.cpp ) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( performance_example SOURCES performance_example.cpp ) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index da22993cda..f0662e4a08 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -43,6 +43,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_spmv.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp + ) + IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spmv_merge diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp new file mode 100644 index 0000000000..aa8f2ddfa3 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -0,0 +1,185 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" + +typedef default_scalar Scalar; +typedef default_lno_t Ordinal; +typedef default_size_type Offset; + +template +void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { + typedef KokkosSparse::CrsMatrix matrix_type; + typedef typename Kokkos::View mv_type; + typedef typename mv_type::HostMirror h_mv_type; + + srand(17312837); + matrix_type A; + if(filename) + A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + else + { + Offset nnz = 10 * numRows; + //note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, 0, 0.01 * numRows); + } + numRows = A.numRows(); + numCols = A.numCols(); + mv_type x("X", numCols, num_vecs); + mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for(int v = 0; v < num_vecs; v++) + { + for(int i=0; i::value) + layout = 'L'; + else + layout = 'R'; + int loop = 100; + int num_vecs = 1; + Scalar beta = 0.0; + + if(argc == 1) { + print_help(); + return 0; + } + + for(int i=0;i(size,size,filename,loop,num_vecs,mode,beta); + else + run_spmv(size,size,filename,loop,num_vecs,mode,beta); + + Kokkos::finalize(); +} + diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 681327dfaf..0f6351189b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -43,32 +43,24 @@ */ #include -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_default_types.hpp" +#include #define MAXVAL 1 -#define SIZE_TYPE size_t -#define INDEX_TYPE int -#define SCALAR_TYPE double - - - template -scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ +scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0){ scalar_view_t kok_x ("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view (kok_x); - for (INDEX_TYPE i = 0; i < nv; ++i){ - SCALAR_TYPE r = static_cast (rand()) / static_cast (RAND_MAX / max_value); + for (default_lno_t i = 0; i < nv; ++i){ + default_scalar r = static_cast 
(rand()) / static_cast (RAND_MAX / max_value); h_x(i) = r; } Kokkos::deep_copy (kok_x, h_x); @@ -98,7 +90,7 @@ void run_experiment( typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - INDEX_TYPE nv = crsmat.numRows(); + default_lno_t nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); @@ -255,25 +247,70 @@ void run_experiment( */ } - - - enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_HIP , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV + , CMD_DEVICE , CMD_BIN_MTX , CMD_CLUSTER_SIZE , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; +template +void run_pcg(int* cmdline, const char* mtx_file) +{ + default_lno_t nv = 0, ne = 0; + default_lno_t *xadj, *adj; + default_scalar *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_file); + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); + + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (default_lno_t i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + + for (default_lno_t i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + + delete [] xadj; + delete [] adj; + delete [] ew; + + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); +} + int main (int argc, char ** argv){ int cmdline[ CMD_COUNT ] ; - char *mtx_bin_file = NULL; + char *mtx_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; for ( int i = 1 ; i < argc ; ++i ) { @@ -283,17 +320,22 @@ int main (int argc, char ** argv){ else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); } + /* else if ( 0 == strcasecmp( argv[i] , "--cores" ) ) { + //Note BMK: specifying #NUMA regions isn't supported by initialize sscanf( argv[++i] , "%dx%d" , cmdline + CMD_USE_NUMA , cmdline + CMD_USE_CORE_PER_NUMA ); } + */ else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { cmdline[ CMD_USE_CUDA ] = 1 ; } - else if ( 0 == strcasecmp( argv[i] , "--cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + cmdline[ CMD_USE_HIP ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--device-id" ) ) { + cmdline[ CMD_DEVICE ] = atoi( argv[++i] ) ; } else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); @@ -303,12 +345,12 @@ int main (int argc, char ** argv){ } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { - mtx_bin_file = argv[++i]; + mtx_file = argv[++i]; } else { cmdline[ CMD_ERROR ] = 1 ; std::cerr 
<< "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; return 0; } @@ -317,190 +359,43 @@ int main (int argc, char ** argv){ if(cmdline[CMD_CLUSTER_SIZE] == 0) cmdline[CMD_CLUSTER_SIZE] = 1; - if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + if (mtx_file == NULL){ + std::cerr << "Provide a matrix file" << std::endl ; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[matrix]" << std::endl; return 0; } + Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space -#if defined( KOKKOS_ENABLE_THREADS ) - - if ( cmdline[ CMD_USE_THREADS ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; // How to get this to initialize() without using impl_initialize()? - } - else { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } + init_args.device_id = cmdline[ CMD_DEVICE ]; + if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { + init_args.num_threads = std::max(cmdline[ CMD_USE_THREADS ], cmdline [ CMD_USE_OPENMP ]); + init_args.num_numa = cmdline[ CMD_USE_NUMA ]; + } + else { + init_args.num_threads = cmdline[ CMD_USE_THREADS ]; + } - Kokkos::finalize(); - } + Kokkos::initialize( init_args ); + { +#if defined( KOKKOS_ENABLE_THREADS ) + if(cmdline[CMD_USE_THREADS]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space 
- - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; - } - else { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_OPENMP]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - // Use the last device: - init_args.device_id = cmdline[ CMD_USE_CUDA_DEV ]; - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - // typedef typename KokkosSparse::CrsMatrix crsMat_t; - // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], 
cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_CUDA]) + run_pcg(cmdline, mtx_file); #endif - +#if defined( KOKKOS_ENABLE_HIP ) + if(cmdline[CMD_USE_HIP]) + run_pcg(cmdline, mtx_file); +#endif + } + Kokkos::finalize(); return 0; } -#else -int main() { -} -#endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f90c6179f7..959e9d973c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -60,7 +60,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 80e4ab7c34..0f1c9f6210 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -52,7 +52,7 @@ void print_options(){ std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; + std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; std::cerr << "\t[Optional] '--algorithm [DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use KKSPGEMM instead." << std::endl; std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB" << std::endl; std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> to write output C=AxB" << std::endl; @@ -84,6 +84,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -297,7 +300,7 @@ int main (int argc, char ** argv){ } const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = params.use_cuda - 1; + const int device_id = params.use_cuda ? 
params.use_cuda - 1 : params.use_hip - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -336,6 +339,16 @@ int main (int argc, char ** argv){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_spgemm + ( + params + ); + + } +#endif + #if defined( KOKKOS_ENABLE_THREADS ) //If only serial is enabled (or no other device was specified), run with serial if (params.use_threads) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 60779f7fe5..afef5968f0 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -66,12 +66,6 @@ enum {STRUCT, UNSTR}; -#ifdef INT64 -typedef long long int LocalOrdinalType; -#else -typedef int LocalOrdinalType; -#endif - void print_help() { printf("SPMV_struct benchmark code written by Luc Berger-Vergiat.\n"); printf("Options:\n"); @@ -482,6 +476,73 @@ int main(int argc, char **argv) if(compare_cusparse) { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#ifdef CUSPARSE_VERSION + KokkosKernels::Experimental::Controls controls; + + cusparseIndexType_t myCusparseOffsetType = CUSPARSE_INDEX_32I; + cusparseIndexType_t myCusparseEntryType = CUSPARSE_INDEX_32I; + cudaDataType myCudaDataType = CUDA_R_64F; + + /* create matrix */ + cusparseSpMatDescr_t A_cusparse; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(&A_cusparse, A.numRows(), A.numCols(), A.nnz(), + (void*) A.graph.row_map.data(), + (void*) A.graph.entries.data(), + (void*) A.values.data(), + myCusparseOffsetType, + myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, + myCudaDataType)); + + /* create lhs and rhs */ + cusparseDnVecDescr_t vecX, vecY; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecX, x1.extent_int(0), (void*) x1.data(), myCudaDataType)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&vecY, y1.extent_int(0), (void*) y1.data(), myCudaDataType)); + + const double alpha = 1.0, beta = 1.0; + size_t bufferSize = 0; + void* dBuffer = NULL; + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize(controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, + alg, &bufferSize)); + CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); + + /* perform SpMV */ + Kokkos::Profiling::pushRegion("cuSparse spmv test"); + double min_time = 1.0e32; + double max_time = 0.0; + double ave_time = 0.0; + for(int i=0;imax_time) max_time = time; + if(time call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" 
--cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} 
${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 - if [[ "$MACHINE" = bowman* ]] || [[ "$MACHINE" = white* ]]; then + if [[ "$MACHINE" = white* ]]; then make_par_lvl=48 fi local -i build_start_time=$(date +%s) @@ -1261,7 +1253,7 @@ single_build_and_test() { comment="build_time=$(($build_end_time-$build_start_time))" if [[ "$BUILD_ONLY" == False ]]; then - run_cmd ctest --timeout 2500 -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } + run_cmd ctest --timeout ${CTESTTIMEOUT} -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } local -i run_end_time=$(date +%s) comment="$comment run_time=$(($run_end_time-$build_end_time))" fi @@ -1296,6 +1288,9 @@ run_in_background() { if [[ "$compiler" == cuda* ]]; then num_jobs=1 fi + if [[ "$compiler" == rocm* ]]; then + num_jobs=1 + fi if [[ "$compiler" == clang ]]; then num_jobs=1 fi @@ -1405,8 +1400,7 @@ wait_summarize_and_exit() { # CM_ALL_SCRIPT=$0 -CM_ALL_SCRIPT_PATH=`pwd` -CM_ALL_SCRIPT_PATH=${CM_ALL_SCRIPT_PATH}/`dirname $CM_ALL_SCRIPT` +CM_ALL_SCRIPT_PATH=$(cd `dirname $CM_ALL_SCRIPT` && pwd) ROOT_DIR=$(get_test_root_dir) mkdir -p $ROOT_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh deleted file mode 100755 index 375b7f8712..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -export TRILINOS_DIR=${PWD}/../.. - -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-dbg - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ - -DTpetra_ENABLE_DEBUG=ON \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepOffdbg -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh deleted file mode 100755 index 9f35eeed3f..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-opt - -# Packages -PACKAGE1=Tpetra -PACKAGE2=Sacado -PACKAGE3=Stokhos -PACKAGE4=MueLu -PACKAGE5=Intrepid2 -PACKAGE6=Ifpack2 -PACKAGE7=Panzer -PACKAGE8=Phalanx -PACKAGE9=Stratimikos -PACKAGE10=Belos - - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_${PACKAGE1}=ON \ - -DTrilinos_ENABLE_${PACKAGE2}=ON \ - -DTrilinos_ENABLE_${PACKAGE3}=ON \ - -DTrilinos_ENABLE_${PACKAGE4}=ON \ - -DTrilinos_ENABLE_${PACKAGE5}=ON \ - -DTrilinos_ENABLE_${PACKAGE6}=ON \ - -DTrilinos_ENABLE_${PACKAGE7}=ON \ - -DTrilinos_ENABLE_${PACKAGE8}=ON \ - -DTrilinos_ENABLE_${PACKAGE9}=ON \ - -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestCompare-DepCodeOFF -W 06:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Or submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh index 41160c938c..c6af962034 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh @@ -33,7 +33,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -DTpetra_ENABLE_DEBUG=ON \ diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh index 955821005f..9403741586 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh deleted file mode 100755 index da9017e388..0000000000 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -echo "SOURCE this script!!" - -export TRILINOS_DIR=${PWD}/../.. 
- -# Load modules -module purge -source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-rdc-release-debug-pt - -rm -rf CMake* - -# Configure -cmake \ - -GNinja \ - -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ - -DTrilinos_ENABLE_TESTS=ON \ - -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=OFF \ - -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ - -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ -$TRILINOS_DIR - -# Notes: -# Compile using ninja -# make NP=32 - -# Allocate node: -# bsub -J TestKokkos-DepCodeOn-rdcpt -W 07:00 -Is -n 16 -q rhel7W bash - -# Run tests -# ctest -j8 - -# Submit tests as job -# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh index 01e2def015..d508d4c77a 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh @@ -16,7 +16,6 @@ cmake \ -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ -DTrilinos_ENABLE_TESTS=ON \ -DTrilinos_ENABLE_ALL_PACKAGES=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh index 76e0391912..7be71edc1c 100755 --- a/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh +++ b/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh @@ -38,7 +38,6 @@ cmake \ -DTrilinos_ENABLE_${PACKAGE8}=ON \ -DTrilinos_ENABLE_${PACKAGE9}=ON \ -DTrilinos_ENABLE_${PACKAGE10}=ON \ - -DKOKKOS_ENABLE_DEPRECATED_CODE=ON \ -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ $TRILINOS_DIR diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index ce3693409c..822efa28b8 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,6 +1,30 @@ #!/bin/bash -if [ "$1" = bowman ]; then - export LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib/gcc/x86_64-pc-linux-gnu/6.2.0:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LIBRARY_PATH - export LD_LIBRARY_PATH=/home/projects/x86-64-knl/gcc/6.2.0/lib64:/home/projects/x86-64-knl/gcc/6.2.0/lib:/home/projects/x86-64-knl/cloog/0.18.4/lib:/home/projects/x86-64-knl/isl/0.16.1/lib:/home/projects/x86-64-knl/gmp/6.1.0/lib:/home/projects/x86-64-knl/mpfr/3.1.3/lib:/home/projects/x86-64-knl/mpc/1.0.3/lib:/home/projects/x86-64-knl/binutils/2.26.0/lib:/usr/lib/gcc/x86_64-redhat-linux/4.8.3:$LD_LIBRARY_PATH +if [ "$1" = blake ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + module list + fi +fi +if [ "$1" = kokkos-dev-2 ]; then + 
ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + module list + fi +fi +if [ "$1" = sems ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + module list + fi fi diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df360c69de..22c17b5247 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,285 +56,332 @@ SET(ETI_HEADERS) #Generate @X@ variables in the template X.hpp.in and X.cpp.in #files containing the list of all needed macros KOKKOSKERNELS_GENERATE_ETI(Blas1_abs abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_abs_mv abs + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_scal_mv scal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby_mv axpby + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_update update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_update_mv update + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_sum_mv sum + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1 nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm1_mv nrm1 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) 
KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2w_mv nrm2w + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrminf_mv nrminf + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_iamax_mv iamax + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2 nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_nrm2_mv nrm2 + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_mult_mv mult + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas1_reciprocal_mv reciprocal + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas2_gemv gemv + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trsm trsm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri + COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_struct spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + 
TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv spmv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_symbolic spgemm_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_numeric spiluk_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_symbolic sptrsv_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_trsv trsv + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_symbolic gauss_seidel_symbolic + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_numeric gauss_seidel_numeric + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply + COMPONENTS sparse HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) diff --git a/src/KokkosKernels_Half.hpp b/src/KokkosKernels_Half.hpp new file mode 100644 index 0000000000..5ecb959f7e --- /dev/null +++ b/src/KokkosKernels_Half.hpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSKERNELS_HALF_HPP
+#define KOKKOSKERNELS_HALF_HPP
+
+#include "Kokkos_Core.hpp"
+
+namespace KokkosKernels {
+  namespace Experimental {
+    ////////////// BEGIN FP16/binary16 limits //////////////
+    #define KOKKOSKERNELS_IMPL_FP16_MAX 65504.0F // Maximum normalized number
+    #define KOKKOSKERNELS_IMPL_FP16_MIN 0.000000059604645F // Minimum normalized positive half precision number
+    #define KOKKOSKERNELS_IMPL_FP16_RADIX 2 // Value of the base of the exponent representation. TODO: Confirm this
+    #define KOKKOSKERNELS_IMPL_FP16_MANT_DIG 15 // Number of digits in the mantissa that can be represented without losing precision. TODO: Confirm this
+    #define KOKKOSKERNELS_IMPL_FP16_MIN_EXP -14 // This is the smallest possible exponent value
+    #define KOKKOSKERNELS_IMPL_FP16_MAX_EXP 15 // This is the largest possible exponent value
+    #define KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS 10
+    #define KOKKOSKERNELS_IMPL_FP16_EPSILON 0.0009765625F
+    #define KOKKOSKERNELS_IMPL_HUGE_VALH 0x7c00 // bits [10,14] set.
+    ////////////// END FP16/binary16 limits //////////////
+  } // Experimental
+} // KokkosKernels
+#endif // KOKKOSKERNELS_HALF_HPP
diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp
index 23e6f5e125..83c483a3d6 100644
--- a/src/Kokkos_ArithTraits.hpp
+++ b/src/Kokkos_ArithTraits.hpp
@@ -50,6 +50,8 @@
 #include
 #include
+#include
+#include
 #ifdef HAVE_KOKKOSKERNELS_QUADMATH
 # include
@@ -63,16 +65,6 @@
 #ifdef __CUDACC__
 # include
 #endif
-//
-// mfh 24 Dec 2013: Temporary measure for testing; will go away.
-// -#ifndef KOKKOS_FORCEINLINE_FUNCTION -# ifdef __CUDA_ARCH__ -# define KOKKOS_FORCEINLINE_FUNCTION inline __host__ __device__ -# else -# define KOKKOS_FORCEINLINE_FUNCTION -# endif // __CUDA_ARCH__ -#endif // KOKKOS_FORCEINLINE_FUNCTION namespace { // anonymous @@ -674,6 +666,179 @@ class ArithTraits { //@} }; +// Since Kokkos::Experimental::half_t falls back to float, only define +// ArithTraits if half_t is a backend specialization +#if defined(KOKKOS_HALF_T_IS_FLOAT) &&\ + !KOKKOS_HALF_T_IS_FLOAT +template <> +class ArithTraits { +public: + typedef Kokkos::Experimental::half_t val_type; + typedef val_type mag_type; + + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool is_complex = false; + + static constexpr bool has_infinity = true; + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half(HUGE_VALF); } + + static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isinf; + #endif + return isinf (Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { + #ifndef __CUDA_ARCH__ + using std::isnan; + #endif + return isnan(Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { + return Kokkos::Experimental::cast_to_half(fabs(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero () { + return Kokkos::Experimental::cast_to_half(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one () { + return Kokkos::Experimental::cast_to_half(1.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min () { + return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type real (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag (const val_type) { + return Kokkos::Experimental::cast_to_half(0.0F); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conj (const val_type x) { + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const val_type y) { + return Kokkos::Experimental::cast_to_half(::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sqrt (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cbrt (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { + return Kokkos::Experimental::cast_to_half(::exp (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { + return Kokkos::Experimental::cast_to_half(::log (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { + return Kokkos::Experimental::cast_to_half(::log10 (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sin 
(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cos (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { + return Kokkos::Experimental::cast_to_half(::tan (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::sinh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::cosh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { + return Kokkos::Experimental::cast_to_half(::tanh (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { + return Kokkos::Experimental::cast_to_half(::asin (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { + return Kokkos::Experimental::cast_to_half(::acos (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { + return Kokkos::Experimental::cast_to_half(::atan (Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { + //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); + } + // Backwards compatibility with Teuchos::ScalarTraits. + typedef mag_type magnitudeType; + // C++ doesn't have a standard "half-float" type. + typedef val_type halfPrecision; + typedef double doublePrecision; + + static const bool isComplex = false; + static const bool isOrdinal = false; + static const bool isComparable = true; + static const bool hasMachineParameters = true; + static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf (const val_type x) { + return isNan (x) || isInf (x); + } + static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude (const val_type x) { + return abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate (const val_type x) { + return conj(x); + } + static std::string name () { + return "half"; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { + return sqrt(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { +#ifdef __CUDA_ARCH__ + return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); +#else + return Kokkos::Experimental::cast_to_half(std::numeric_limits::quiet_NaN()); +#endif // __CUDA_ARCH__ + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type eps () { + return epsilon (); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FORCEINLINE_FUNCTION int base () { + return KOKKOSKERNELS_IMPL_FP16_RADIX; + } + // Use float to allow running on both host and device + static KOKKOS_FORCEINLINE_FUNCTION float prec () { + float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; + float b = (float) base(); + float r = e * b; + return r; + } + static KOKKOS_FORCEINLINE_FUNCTION int t () { + return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd () { + return Kokkos::Experimental::cast_to_half(1.0); + } + static KOKKOS_FORCEINLINE_FUNCTION int emin () { + return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; + } + static 
KOKKOS_FORCEINLINE_FUNCTION mag_type rmin () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FORCEINLINE_FUNCTION int emax () { + return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax () { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } +}; +#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF template<> class ArithTraits { @@ -691,13 +856,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -801,9 +966,11 @@ class ArithTraits { return sqrt (x); } static KOKKOS_FORCEINLINE_FUNCTION float nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN_F; //return nan (); //this returns 0??? +#elif defined(__HIP_DEVICE_COMPILE__) + return ::nanf(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -840,7 +1007,6 @@ class ArithTraits { } }; - /// \brief Partial specialization for std::complex. /// /// The C++ Standard Library (with C++03 at least) only allows @@ -865,13 +1031,13 @@ class ArithTraits > { } static bool isInf (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (real (x)) || isinf (imag (x)); } static bool isNan (const std::complex& x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (real (x)) || isnan (imag (x)); @@ -1045,13 +1211,13 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { - #ifndef __CUDA_ARCH__ + #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif return isnan (x); @@ -1126,9 +1292,11 @@ class ArithTraits { return ::atan (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan () { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) return CUDART_NAN; //return nan (); // this returns 0 ??? +#elif defined(__HIP_DEVICE_COMPILE__) + return ::nan(""); #else return std::numeric_limits::quiet_NaN(); #endif // __CUDA_ARCH__ @@ -1140,8 +1308,10 @@ class ArithTraits { // Backwards compatibility with Teuchos::ScalarTraits. typedef mag_type magnitudeType; typedef float halfPrecision; -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) typedef double doublePrecision; // CUDA doesn't support long double, unfortunately +#elif defined(__HIP_DEVICE_COMPILE__) + typedef double doublePrecision; // HIP does not support long double unfortunately #else typedef long double doublePrecision; #endif // __CUDA_ARCH__ @@ -1197,9 +1367,10 @@ class ArithTraits { }; -// CUDA does not support long double in device functions, so none of -// the class methods in this specialization are marked as device -// functions. 
+// CUDA and HIP do not support long double in device functions, +// so none of the class methods in this specialization are marked +// as device functions. +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> class ArithTraits { public: @@ -1213,18 +1384,14 @@ class ArithTraits { static const bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION long double infinity() { return HUGE_VALL; } + static long double infinity() { return HUGE_VALL; } static bool isInf (const val_type& x) { - #ifndef __CUDA_ARCH__ using std::isinf; - #endif return isinf (x); } static bool isNan (const val_type& x) { - #ifndef __CUDA_ARCH__ using std::isnan; - #endif return isnan (x); } static mag_type abs (const val_type& x) { @@ -1359,7 +1526,8 @@ class ArithTraits { static mag_type rmax () { return LDBL_MAX; } -}; +}; // long double specialization +#endif // KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST #ifdef HAVE_KOKKOSKERNELS_QUADMATH @@ -2923,11 +3091,13 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (abs (x)))); + using std::sqrt; + using std::abs; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (abs (x)))); #else - return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { return static_cast ( ::log (static_cast (abs (x)))); @@ -3048,18 +3218,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); + using std::sqrt; +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + return static_cast ( sqrt (static_cast (x))); #else - return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); -#else +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ +#else + return static_cast ( ::cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -3184,7 +3356,15 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::sqrt; + using std::abs; + // IEEE 754 promises that long double has at least 64 significand + // bits, so we can use it to represent any signed or unsigned + // 64-bit integer type exactly. However, CUDA does not implement + // long double for device functions. + return static_cast ( sqrt (static_cast (abs (x)))); +#else // Casting from a 64-bit integer type to double does result in a // loss of accuracy. However, it gives us a good first // approximation. For very large numbers, we may lose some @@ -3196,20 +3376,16 @@ class ArithTraits { // correctness. It actually should suffice to check numbers // within 1 of the result. 
return static_cast ( ::sqrt (static_cast (abs (x)))); -#else - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. - return static_cast ( ::sqrt (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (abs (x)))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; + using std::abs; + return static_cast ( cbrtl (static_cast (abs (x)))); #else - return static_cast ( ::cbrtl (static_cast (abs (x)))); -#endif // __CUDA_ARCH__ + return static_cast ( ::cbrt (static_cast (abs (x)))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -3334,18 +3510,20 @@ class ArithTraits { return intPowUnsigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::sqrt (static_cast (x))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::sqrt; + return static_cast ( sqrt (static_cast (x))); #else - return static_cast ( ::sqrt (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( ::sqrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { -#ifdef __CUDA_ARCH__ - return static_cast ( ::cbrt (static_cast (x))); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::cbrtl; + return static_cast ( cbrtl (static_cast (x))); #else - return static_cast ( ::cbrtl (static_cast (x))); -#endif // __CUDA_ARCH__ + return static_cast ( ::cbrt (static_cast (x))); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/Kokkos_InnerProductSpaceTraits.hpp index 65f3feaf8e..82cab6cc3b 100644 --- a/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/src/Kokkos_InnerProductSpaceTraits.hpp @@ -170,6 +170,7 @@ class InnerProductSpaceTraits { /// \brief Partial specialization for long double. /// /// \warning CUDA does not support long double in device functions. +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST template<> struct InnerProductSpaceTraits { @@ -184,6 +185,7 @@ struct InnerProductSpaceTraits return x * y; } }; +#endif //! Partial specialization for Kokkos::complex. 
template diff --git a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp index afad371334..5875029dd1 100644 --- a/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Serial_Internal.hpp @@ -45,7 +45,7 @@ namespace KokkosBatched { /**/ ValueType *__restrict__ C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - + const ScalarType one(1.0), zero(0.0); if (beta == zero) SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp index 4e1c4d9579..0b68727f0e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -138,7 +138,7 @@ namespace KokkosBatched { const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal:: + return TeamVectorGemmInternal:: invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, diff --git a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp index 3b53e9a577..971389902e 100644 --- a/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -70,7 +70,7 @@ namespace KokkosBatched { const ValueType *__restrict__ pB = B+j*bs1; - ValueType c = 0; + ValueType c = ValueType(0); for (int p=0;p inner(as0, as1, bs0, bs1, cs0, cs1); auto gemm = [&](const int ib, const int jb, @@ -128,13 +129,16 @@ namespace KokkosBatched { Kokkos::parallel_for (Kokkos::TeamThreadRange(member, mq*nq ), [&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%mq*mb, j = ij/mq*nb; -#else - const int i = ij/nq*mb, j = ij%nq*nb; -#endif + int i, j; + //note: the condition is constexpr + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%mq*mb; + j = ij/mq*nb; + } + else { + i = ij/nq*mb; + j = ij%nq*nb; + } inner.serial_invoke(alpha, AA+i*as0, BB+j*bs1, (i+mb) > ib ? 
mp : mb, diff --git a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp index 7b3d8b293e..b63ca28fcf 100644 --- a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -68,6 +68,7 @@ namespace KokkosBatched { [&](const int &i) { x2[i*x2s] *= inv_chi1_minus_alpha; }); + member.team_barrier(); // later consider to use the following // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); diff --git a/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 11174eafb6..2e62c20f32 100644 --- a/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -469,10 +469,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), + a_3p, b_p3, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), c_33 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -516,10 +516,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, - a_3p, c_30 = 0, c_31 = 0, c_32 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), + a_3p, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -563,10 +563,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0, - a_3p, c_30 = 0, c_31 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0), + a_3p, c_30 = ValueType(0), c_31 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -610,10 +610,10 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0, - a_2p, c_20 = 0, - a_3p, c_30 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0), + a_2p, c_20 = ValueType(0), + a_3p, c_30 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, i3 = 3*_as0, @@ -657,9 +657,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = 
ValueType(0), /**/ b_p3; const int @@ -702,8 +702,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), /**/ b_p2, /**/ b_p3; @@ -745,7 +745,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), /**/ b_p1, /**/ b_p2, /**/ b_p3; @@ -790,9 +790,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), + a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -833,9 +833,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -876,9 +876,9 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0, - a_2p, c_20 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0), + a_2p, c_20 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, i2 = 2*_as0, @@ -919,8 +919,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), /**/ b_p2; const int @@ -959,7 +959,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), /**/ b_p1, /**/ b_p2; @@ -1002,8 +1002,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, - a_1p, b_p1, c_10 = 0, c_11 = 0; + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), + a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, @@ -1041,8 +1041,8 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, - a_1p, c_10 = 0; + a_0p, b_p0, c_00 = ValueType(0), + a_1p, c_10 = ValueType(0); const int i0 = 0*_as0, i1 = 1*_as0, @@ -1080,7 +1080,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0, c_01 = 0, + a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), /**/ b_p1; const int i0 = 0*_as0, @@ -1120,7 +1120,7 @@ namespace KokkosBatched { if (k <= 0) return 0; ValueType - a_0p, b_p0, c_00 = 0; + a_0p, b_p0, c_00 = ValueType(0); const int i0 = 0*_as0, diff --git a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index d1b59d652f..d443bad513 100644 --- a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ 
b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, + const int m, const int n, const int k, /* */ ValueType * A, const int as0, const int as1, /* */ ValueType * t, const int ts, @@ -44,12 +44,12 @@ namespace KokkosBatched { if (is_Q_zero) TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0+qs1); else - TeamVectorSetIdentityInternal::invoke(member, m, Q, qs0, qs1); + TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); return TeamVectorApplyQ_LeftForwardInternal ::invoke(member, - m, m, k, + m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, diff --git a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 2b0c1e4569..08439b0b28 100644 --- a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -96,6 +96,7 @@ namespace KokkosBatched { A, as0, as1, A, as0, as1, norm, 1); + member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -158,7 +159,7 @@ namespace KokkosBatched { if (m_atl == 0) max_diag = ats::abs(A[0]); const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(max_diag*ats::epsilon()); + threshold(10*max_diag*ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) @@ -171,6 +172,7 @@ namespace KokkosBatched { n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); + member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL (A_part3x3); t_part2x1.mergeToAT (t_part3x1); diff --git a/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/src/batched/KokkosBatched_SetIdentity_Impl.hpp index 4c0ea12348..0bf12243ee 100644 --- a/src/batched/KokkosBatched_SetIdentity_Impl.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Impl.hpp @@ -19,7 +19,7 @@ namespace KokkosBatched { SerialSetIdentity:: invoke(const AViewType &A) { return SerialSetIdentityInternal:: - invoke(A.extent(0), + invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } @@ -36,7 +36,7 @@ namespace KokkosBatched { const AViewType &A) { return TeamSetIdentityInternal:: invoke(member, - A.extent(0), + A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } diff --git a/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/src/batched/KokkosBatched_SetIdentity_Internal.hpp index 40d8bbbaaf..8f7f6cf3f9 100644 --- a/src/batched/KokkosBatched_SetIdentity_Internal.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Internal.hpp @@ -15,10 +15,10 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static int - invoke(const int m, + invoke(const int m, const int n, /* */ ValueType *__restrict__ A, const int as0, const int as1) { const ValueType one(1), zero(0); - for (int j=0;j ::invoke(member, @@ -133,6 +133,7 @@ namespace KokkosBatched { B, bs0, bs1, zero, W, ws0, ws1); + member.team_barrier(); /// W = T^{-1} W TeamVectorTrsmInternalLeftLower @@ -142,26 +143,31 @@ namespace KokkosBatched { one, T, ts0, ts1, W, ws0, ws1); + member.team_barrier(); /// X = V^T W TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, xs0, xs1); + member.team_barrier(); } else { + /// W = U^T B TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + matrix_rank, nrhs, m, one, 
U, us1, us0, B, bs0, bs1, zero, X, xs0, xs1); + member.team_barrier(); + /// X = T^{-1} X TeamVectorTrsmInternalLeftUpper ::invoke(member, false, @@ -169,12 +175,13 @@ namespace KokkosBatched { one, T, ts0, ts1, X, xs0, xs1); + member.team_barrier(); } /// X = P^T X TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, - nrhs, m, + nrhs, n, p, ps0, X, xs0, xs1); diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index 085bd9e293..64d8368f16 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -114,7 +115,7 @@ namespace KokkosBatched { /// case host: team size is small and blocksize (mb,nb) is large /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); @@ -195,7 +196,6 @@ namespace KokkosBatched { const ScalarType alpha, const ValueType *__restrict__ A, const int as0, const int as1, /**/ ValueType *__restrict__ B, const int bs0, const int bs1) { - const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); @@ -223,13 +223,15 @@ namespace KokkosBatched { } Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,iend*jend),[&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%iend, j = ij/iend; -#else - const int i = ij/jend, j = ij%jend; -#endif + int i, j; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%iend; + j = ij/iend; + } + else { + i = ij/jend; + j = ij%jend; + } B0[i*bs0+j*bs1] -= a01[i*as0] * b1t[j*bs1]; }); } diff --git a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp index 618f8dc614..5bf26f0865 100644 --- a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp @@ -99,7 +99,7 @@ namespace KokkosBatched { if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp index 20ee624006..7d72f01e15 100644 --- a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp @@ -115,7 +115,7 @@ namespace KokkosBatched { if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp index 69b958d22d..b06c76b02a 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp +++ 
b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -31,7 +31,7 @@ namespace KokkosBatched { int &matrix_rank) { return TeamVectorUTV_Internal:: invoke(member, - A.extent(0), //A.extent(1), + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp index 6f9a86e115..354dfa7c44 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, // m = NumRows(A) + const int m, const int n, // m = NumRows(A), n = NumCols(A) /* */ ValueType * A, const int as0, const int as1, /* */ IntType * p, const int ps0, /* */ ValueType * U, const int us0, const int us1, @@ -41,23 +41,24 @@ namespace KokkosBatched { matrix_rank = -1; TeamVectorQR_WithColumnPivotingInternal ::invoke(member, - m, m, + m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - + TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); + member.team_barrier(); /// for rank deficient matrix - if (matrix_rank < m) { + if (matrix_rank < n) { const value_type zero(0); TeamVectorSetLowerTriangularInternal ::invoke(member, @@ -67,14 +68,14 @@ namespace KokkosBatched { TeamVectorQR_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, A, as1, as0, t, ts0, work); TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 372b5e1753..3253b6ce12 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -270,12 +270,17 @@ namespace KokkosBatched { // regieter blocking (not about team parallelism). // this mb should vary according to // - team policy (smaller) or range policy (bigger) - // - space (cuda vs host) + // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. #if defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 2; } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -300,7 +305,7 @@ namespace KokkosBatched { using Gemm = Level3; using Trsm = Level3; using Trmm = Level3; - using Trtri = Level3; // TODO: Need new level for Trtri? 
+ using Trtri = Level3; using LU = Level3; using InverseLU = Level3; using SolveLU = Level3; @@ -320,6 +325,11 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_HIP) + template KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if::value,int> + ::type mb() { return 1; } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> diff --git a/src/batched/KokkosBatched_Vector.hpp b/src/batched/KokkosBatched_Vector.hpp index 8737d72850..28a537f885 100644 --- a/src/batched/KokkosBatched_Vector.hpp +++ b/src/batched/KokkosBatched_Vector.hpp @@ -104,6 +104,25 @@ namespace KokkosBatched { }; #endif +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; +#endif + template struct DefaultInternalVectorLength { enum : int { value = 1 }; @@ -147,6 +166,25 @@ namespace KokkosBatched { enum : int { value = 1 }; }; #endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultInternalVectorLength { + enum : int { value = 8 }; + }; + template<> + struct DefaultInternalVectorLength { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 2 }; + }; +#endif template struct MagnitudeScalarType; diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index d59f0f9be4..a950e5e41f 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -6,7 +6,7 @@ #include #include -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #undef __KOKKOSBATCHED_ENABLE_AVX__ #else // compiler bug with AVX in some architectures @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) namespace KokkosBatched { template<> @@ -143,7 +143,7 @@ namespace KokkosBatched { typedef float2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat2"; } + static const char* label() { return "GpuFloat2"; } template friend class Vector; @@ -224,7 +224,7 @@ namespace KokkosBatched { typedef double2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble2"; } + static const char* label() { return "GpuDouble2"; } template friend class Vector; @@ -305,7 +305,7 @@ namespace KokkosBatched { typedef float4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat4"; } + static const char* label() { return "GpuFloat4"; } template friend class Vector; @@ -400,7 +400,7 @@ namespace KokkosBatched { typedef double4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble4"; } + static const char* label() { return "GpuDouble4"; } template friend class Vector; diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 95ab97d882..49317ca9d4 
100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 74d15af1c3..db5bc9fbca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -139,8 +139,8 @@ struct SingleLevelNontransposeGEMV { // matrix A and the input vector x. The output vector y is the // reduction result. // -// WARNING: NOT RECOMMENDED FOR CUDA. Reduction result may have -// arbitrary length. This is bad on CUDA because the CUDA +// WARNING: NOT RECOMMENDED FOR GPU. Reduction result may have +// arbitrary length. This is bad on GPU because the GPU // implementation of Kokkos::parallel_reduce may use shared memory for // intermediate results. template struct impl_gemm_choose_copy_layout { - typedef LayoutAScratch type; + using type = LayoutAScratch; }; #ifdef KOKKOS_ENABLE_CUDA template struct impl_gemm_choose_copy_layout { - typedef LayoutA type; + using type = LayoutA; +}; +#endif + +#ifdef KOKKOS_ENABLE_HIP +template +struct impl_gemm_choose_copy_layout { + using type = LayoutA; }; #endif @@ -392,7 +399,7 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) { typedef typename ViewTypeC::non_const_value_type ScalarC; // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) || !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) int blockA0 = A.extent_int(0); int blockA1 = A.extent_int(1); int blockB1 = B.extent_int(1); @@ -510,7 +517,17 @@ struct GEMMImpl { ViewTypeBScratch::shmem_size() + ViewTypeCScratch::shmem_size(); +#if defined(KOKKOS_ENABLE_HIP) + // Note lbv, 10/29/20: The LaunchBounds<384,2> leads + // to an error with HIP as the heuristics on that platform + // yield an optimal_num_blocks=0 which means no resources + // are allocated... Switching to LaunchBounds<384,0> fixes + // that problem but I'm not sure if that is a good perf + // parameter or why it is set to 2 for Cuda?
+ Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#else Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); +#endif Kokkos::parallel_for(impl_gemm_label::label,policy.set_scratch_size(scratch_level,Kokkos::PerTeam(scratch_memory_size)),*this); } diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index 877d73c5fa..2a63c3736f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -157,6 +157,10 @@ struct GEMM { if(std::is_same::value) team_size = blockA0; #endif + #if defined(KOKKOS_ENABLE_HIP) + if(std::is_same::value) + team_size = blockA0; + #endif #if defined(KOKKOS_ENABLE_ROCM) if(std::is_same::value) team_size = blockA0; diff --git a/src/blas/impl/KokkosBlas3_trmm_spec.hpp b/src/blas/impl/KokkosBlas3_trmm_spec.hpp index 13c87a299e..3c0bd9df6f 100644 --- a/src/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -74,7 +74,7 @@ struct trmm_eti_spec_avail { > { enum : bool { value = true }; }; // -// This Macros provides the ETI specialization of trmm, currently not available. +// This Macros provides the ETI specialization of trmm // #define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL( SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE ) \ KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT( SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/src/blas/impl/KokkosBlas_gesv_spec.hpp b/src/blas/impl/KokkosBlas_gesv_spec.hpp index e1e0b77f67..8f2d171436 100644 --- a/src/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/src/blas/impl/KokkosBlas_gesv_spec.hpp @@ -118,6 +118,7 @@ struct GESV{ const IPIVV& IPIV) { //NOTE: Might add the implementation of KokkosBlas::gesv later + throw std::runtime_error("No fallback implementation of GESV (general LU factorization & solve) exists. 
Enable BLAS and/or MAGMA TPL."); } }; diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index b22d86a8bb..c845e37c53 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,7 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ return __popc(i); @@ -112,66 +112,95 @@ int pop_count( long long i ){ return _popcnt64(i); } -#elif defined( KOKKOS_COMPILER_IBM ) +#elif defined( __GNUC__ ) || defined( __GNUG__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __popcnt4(i); + return __builtin_popcount(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __popcnt8(i); + return __builtin_popcountl(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __popcnt8(i); + return __builtin_popcountll(i); } +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( int i ){ + return __builtin_popcount(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long i ){ + return __builtin_popcountl(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long long i ){ + return __builtin_popcountll(i); +} +#elif defined(__ibmxl_vrm__) +// See https://www.ibm.com/support/knowledgecenter/SSGH3R_16.1.0/com.ibm.xlcpp161.aix.doc/compiler_ref/compiler_builtins.html +// link gives info about builtin names for xlclang++ +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned i ){ + return __builtin_popcnt4(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long i ){ + return __builtin_popcnt8(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long long i ){ + return __builtin_popcnt8(i); +} KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __popcnt4(i); + return __builtin_popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } -#elif defined( __GNUC__ ) || defined( __GNUG__ ) +#elif defined(__IBMCPP__) || defined(__IBMC__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __builtin_popcount(i); + return __popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __builtin_popcountl(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __builtin_popcountll(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __builtin_popcount(i); + return __popcnt4(i); } + KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long i ){ - return __builtin_popcountl(i); +int pop_count( long i ){ + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long long i ){ - return __builtin_popcountll(i); +int pop_count( long long i ){ + return __popcnt8(i); } #else @@ -181,7 +210,7 @@ int pop_count( long long i ){ // least_set_bit function returns the position of right most set bit -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned i ){ return __ffs(i); @@ -189,7 +218,11 @@ int least_set_bit( unsigned i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( unsigned long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } 
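For reference, the contract the pop_count / least_set_bit wrappers above keep uniform across compilers and backends: pop_count returns the number of set bits, and least_set_bit returns the 1-based position of the lowest set bit (0 when no bit is set), matching both the GCC builtins used on the host path and the __popc / __ffs / __ffsll intrinsics used in CUDA and HIP device code. A minimal host-only illustration using the raw GCC builtins (the values are examples, not taken from the patch):

#include <cassert>

int main() {
  unsigned x = 0x58u;                               // bits 3, 4 and 6 set
  assert(__builtin_popcount(x) == 3);               // pop_count(x) -> 3
  assert(__builtin_ffs(static_cast<int>(x)) == 4);  // least_set_bit(x): lowest set bit is bit 3, 1-based -> 4
  assert(__builtin_ffs(0) == 0);                    // no bit set -> 0
  // 64-bit variant; the cast mirrors the __ffsll(static_cast<long long>(i))
  // workaround added above for the HIP device path.
  unsigned long long y = 1ull << 40;
  assert(__builtin_ffsll(static_cast<long long>(y)) == 41);
  return 0;
}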
@@ -207,7 +240,11 @@ int least_set_bit( int i ){ KOKKOS_FORCEINLINE_FUNCTION int least_set_bit( long i ){ +#if defined(__HIP_DEVICE_COMPILE__) + return __ffsll(static_cast(i)); +#else return __ffsll(i); +#endif } KOKKOS_FORCEINLINE_FUNCTION diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index c0ae6ce5eb..59bcf487fb 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,9 +53,9 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA}; +enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -inline ExecSpaceType kk_get_exec_space_type(){ +KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -81,6 +81,12 @@ inline ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + exec_space = Exec_HIP; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -90,6 +96,60 @@ inline ExecSpaceType kk_get_exec_space_type(){ } +template +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return false; +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +//Host function to determine free and total device memory. +//Will throw if execution space doesn't support this. 
+template +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + hipMemGetInfo(&free_mem, &total_mem); +} +#endif inline int kk_get_suggested_vector_size( const size_t nr, const size_t nnz, const ExecSpaceType exec_space){ @@ -103,7 +163,7 @@ inline int kk_get_suggested_vector_size( case Exec_QTHREADS: break; case Exec_CUDA: - + case Exec_HIP: if (nr > 0) suggested_vector_size_ = nnz / double (nr) + 0.5; if (suggested_vector_size_ < 3){ @@ -119,7 +179,14 @@ inline int kk_get_suggested_vector_size( suggested_vector_size_ = 16; } else { - suggested_vector_size_ = 32; + if(exec_space == Exec_CUDA || suggested_vector_size_ <= 48) { + //use full CUDA warp, or half a HIP wavefront + suggested_vector_size_ = 32; + } + else { + //use full HIP wavefront + suggested_vector_size_ = 64; + } } break; } @@ -129,7 +196,9 @@ inline int kk_get_suggested_vector_size( inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space){ - if (exec_space == Exec_CUDA){ + if (exec_space == Exec_CUDA || exec_space == Exec_HIP) { + //TODO: where this is used, tune the target value for + //threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; } else { @@ -171,6 +240,25 @@ struct SpaceInstance { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct SpaceInstance { + static Kokkos::Experimental::HIP create() { + hipStream_t stream; + hipStreamCreate(&stream); + return Kokkos::Experimental::HIP(stream); + } + static void destroy(Kokkos::Experimental::HIP& space) { + hipStream_t stream = space.hip_stream(); + hipStreamDestroy(stream); + } + static bool overlap() { + //TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? 
+ return true; + } +}; +#endif + } } diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 1713e7c460..2e335d4f04 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -371,7 +371,7 @@ class KokkosKernelsHandle return this->team_work_size; } else { - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space == KokkosKernels::Impl::Exec_CUDA || my_exec_space == KokkosKernels::Impl::Exec_HIP) { return team_size; } else { @@ -609,10 +609,10 @@ class KokkosKernelsHandle } } - void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t verts_per_cluster) { + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; - this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, verts_per_cluster); + this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, hint_verts_per_cluster); } void destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ diff --git a/src/common/KokkosKernels_Macros.hpp b/src/common/KokkosKernels_Macros.hpp index 84de9048c9..ced946fe4f 100644 --- a/src/common/KokkosKernels_Macros.hpp +++ b/src/common/KokkosKernels_Macros.hpp @@ -46,10 +46,10 @@ #define _KOKKOSKERNELS_MACROUTILS_HPP_ // If KOKKOSKERNELS_ENABLE_OMP_SIMD is defined, it's legal to place -// "#pragma omp simd" before a for loop. It's never defined if CUDA is enabled, +// "#pragma omp simd" before a for loop. It's never defined if a GPU-type device is enabled, // since in that case, Kokkos::ThreadVectorRange should be used instead for SIMD parallel loops. -#if !defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) #if defined(KOKKOS_COMPILER_GNU) // GCC 4.8.5 and older do not support #pragma omp simd #if (KOKKOS_COMPILER_GNU > 485 ) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index bdb93c71b1..be37765594 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -250,7 +250,7 @@ struct DefaultComparator //Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. //If n is not a power-of-two, will implicitly pretend @@ -277,7 +277,6 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -316,7 +315,7 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) //Sort "values", while applying the same swaps to "perm" template> KOKKOS_INLINE_FUNCTION void -TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem) +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { //Algorithm only works on power-of-two input size only. 
//If n is not a power-of-two, will implicitly pretend @@ -343,7 +342,6 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; Ordinal elem1 = boxStart + boxOffset; - Comparator comp; if(j == 0) { //first phase (brown box): within a block, compare with the opposite value in the box @@ -389,19 +387,20 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember template struct BitonicSingleTeamFunctor { - BitonicSingleTeamFunctor(View& v_) : v(v_) {} + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - TeamBitonicSort(v.data(), v.extent(0), t); + TeamBitonicSort(v.data(), v.extent(0), t, comp); }; View v; + Comparator comp; }; //Functor that sorts equally sized chunks on each team template struct BitonicChunkFunctor { - BitonicChunkFunctor(View& v_, Ordinal chunkSize_) : v(v_), chunkSize(chunkSize_) {} + BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal chunk = t.league_rank(); @@ -409,9 +408,10 @@ struct BitonicChunkFunctor Ordinal n = chunkSize; if(chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - TeamBitonicSort(v.data() + chunkStart, n, t); + TeamBitonicSort(v.data() + chunkStart, n, t, comp); }; View v; + Comparator comp; Ordinal chunkSize; }; @@ -420,8 +420,8 @@ template> (logSubBoxSize - 1); @@ -519,6 +518,7 @@ struct BitonicPhase2Functor } }; View v; + Comparator comp; Ordinal boxSize; Ordinal teamsPerBox; }; @@ -531,16 +531,16 @@ struct BitonicPhase2Functor //and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) //If comparator is void, use operator< (which should only be used for primitives) template> -void bitonicSort(View v) +void bitonicSort(View v, const Comparator& comp = Comparator()) { typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; Ordinal n = v.extent(0); //If n is small, just sort on a single team - if(n <= Ordinal(1) << 16) + if(n <= Ordinal(1) << 12) { Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), - BitonicSingleTeamFunctor(v)); + BitonicSingleTeamFunctor(v, comp)); } else { @@ -552,16 +552,16 @@ void bitonicSort(View v) Ordinal numTeams = npot / chunkSize; //First, sort within teams Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicChunkFunctor(v, chunkSize)); + BitonicChunkFunctor(v, comp, chunkSize)); for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase1Functor(v, boxSize, teamsPerBox)); + BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) { Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase2Functor(v, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + BitonicPhase2Functor(v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); } } } diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 16a336f200..6979f15847 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -858,13 +858,84 @@ inline size_t 
kk_is_d1_coloring_valid( struct ColorChecker cc(num_rows, xadj, adj, v_colors, team_work_chunk_size); size_t num_conf = 0; - Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValie", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , + Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValid", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), cc, num_conf); MyExecSpace().fence(); return num_conf; } +template +struct MinMaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MinMaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lminmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg < lminmax.min_val) + lminmax.min_val = deg; + if(deg > lminmax.max_val) + lminmax.max_val = deg; + } + rowmap_t rowmap; +}; + +template +struct MaxDegreeFunctor +{ + using ReducerVal = typename Reducer::value_type; + MaxDegreeFunctor(const rowmap_t& rowmap_) + : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(ordinal_t i, ReducerVal& lmax) const + { + ordinal_t deg = rowmap(i + 1) - rowmap(i); + if(deg > lmax) + lmax = deg; + } + rowmap_t rowmap; +}; + +template +ordinal_t graph_max_degree(const rowmap_t& rowmap) +{ + using Reducer = Kokkos::Max; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + return 0; + ordinal_t val; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MaxDegreeFunctor(rowmap), + Reducer(val)); + return val; +} + +template +void graph_min_max_degree(const rowmap_t& rowmap, ordinal_t& min_degree, ordinal_t& max_degree) +{ + using Reducer = Kokkos::MinMax; + ordinal_t nrows = rowmap.extent(0); + if(nrows) + nrows--; + if(nrows == 0) + { + min_degree = 0; + max_degree = 0; + return; + } + typename Reducer::value_type result; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + MinMaxDegreeFunctor(rowmap), + Reducer(result)); + min_degree = result.min_val; + max_degree = result.max_val; +} + template struct SortCrsMatrixFunctor { @@ -970,12 +1041,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1023,12 +1089,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsGraphFunctor funct(useRadix, rowmap, entries); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; @@ -1097,14 +1158,14 @@ struct MergedRowmapFunctor }; template -struct MergedEntriesFunctor +struct MatrixMergedEntriesFunctor { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; using scalar_t = typename values_t::non_const_value_type; //Precondition: entries are sorted within each row - MergedEntriesFunctor( + MatrixMergedEntriesFunctor( const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) : rowmap(rowmap_), entries(entries_), values(values_), @@ -1154,6 +1215,52 @@ struct MergedEntriesFunctor values_t mergedValues; }; +template +struct GraphMergedEntriesFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + //Precondition: entries are sorted within each row + GraphMergedEntriesFunctor( + const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) + : rowmap(rowmap_), entries(entries_), + mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with, nothing to do + return; + } + //Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(accumCol != entries(j)) + { + //write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + //always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + //Sort the rows of matrix, and merge duplicate entries. 
template crsMat_t sort_and_merge_matrix(const crsMat_t& A) @@ -1177,7 +1284,7 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) values_t mergedValues("SortedMerged values", numCompressedEntries); //Compute merged entries and values Kokkos::parallel_for(range_t(0, A.numRows()), - MergedEntriesFunctor + MatrixMergedEntriesFunctor (A.graph.row_map, A.graph.entries, A.values, mergedRowmap, mergedEntries, mergedValues)); //Finally, construct the new compressed matrix @@ -1185,6 +1292,41 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) mergedValues, mergedRowmap, mergedEntries); } +template +void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if(numRows <= 1) + { + //Matrix has zero rows + rowmap_out = rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + //Sort in place + sort_crs_graph(rowmap_in, entries_in); + //Count entries per row into a new rowmap, in terms of merges that can be done + rowmap_out = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + MergedRowmapFunctor(rowmap_out, rowmap_in, entries_in), numCompressedEntries); + //Prefix sum to get rowmap + kk_exclusive_parallel_prefix_sum(numRows + 1, rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + //Compute merged entries and values + Kokkos::parallel_for(range_t(0, numRows), + GraphMergedEntriesFunctor + (rowmap_in, entries_in, + rowmap_out, entries_out)); +} + template (); - - if (exec == Exec_CUDA){ - typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); + // TODO BMK: can this function be deprecated? 
+ typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); + Kokkos::deep_copy (hr, in_xadj); + typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); + Kokkos::deep_copy (he, in_adj); + typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); + Kokkos::deep_copy (hv, in_vals); + MyExecSpace().fence(); + typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); + typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; + typedef typename lno_view_t::non_const_value_type size_type; + typedef typename lno_nnz_view_t::non_const_value_type lno_t; + typedef typename scalar_view_t::non_const_value_type scalar_t; - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); + lno_t nrows = in_xadj.extent(0) - 1; + std::vector > edges(in_adj.extent(0)); - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } + size_type row_size = 0; + for (lno_t i = 0; i < nrows; ++i){ + for (size_type j = hr(i); j < hr(i + 1); ++j){ + edges[row_size].src = i; + edges[row_size].dst = he(j); + edges[row_size++].ew = hv(j); } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; - } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); } - else { - - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = in_xadj(i); j < in_xadj(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = in_adj(j); - edges[row_size++].ew = in_vals(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - out_adj(i) = edges[i].dst; - out_vals(i) = edges[i].ew; - } - + std::sort (edges.begin(), edges.begin() + row_size); + size_type ne = in_adj.extent(0); + for(size_type i = 0; i < ne; ++i){ + heo(i) = edges[i].dst; + hvo(i) = edges[i].ew; + } - } + Kokkos::deep_copy (out_adj, heo); + Kokkos::deep_copy (out_vals, hvo); + MyExecSpace().fence(); } /* @@ -1562,47 +1672,46 @@ struct LowerTriangularMatrix{ const size_type write_end = t_xadj[row_index + 1]; const lno_t write_left_work = write_end - write_begin; - switch (exec_space){ - case Exec_CUDA: - //TODO: Write cuda version here. - /* + //TODO: Write GPU (vector-level) version here: + /* + if(kk_is_gpu_exec_space()) + { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), [&] (lno_t i) { const size_type adjind = i + col_begin; const lno_t colIndex = adj[adjind]; - }); - */ + } + else + ... 
+ */ - default: - for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ - const size_type adjind = r + col_begin; - const lno_t colIndex = adj[adjind]; - lno_t colperm = colIndex; - if (permutation != NULL){ - colperm = permutation[colIndex]; - } - if (is_lower){ - if (row_perm > colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ + const size_type adjind = r + col_begin; + const lno_t colIndex = adj[adjind]; + lno_t colperm = colIndex; + if (permutation != NULL){ + colperm = permutation[colIndex]; + } + if (is_lower){ + if (row_perm > colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } - else { - if (row_perm < colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + } + else { + if (row_perm < colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } + } - } - break; } }); } @@ -2188,7 +2297,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( bool use_dynamic_scheduling = false, bool chunksize = 4){ -#ifndef KOKKOS_ENABLE_CUDA //typedef typename row_map_view_t::const_type const_row_map_view_t; //typedef typename cols_view_t::const_type const_cols_view_t; @@ -2229,7 +2337,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( } }); -#endif } template (in_elements, in_view, histogram); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void get_suggested_vector_team_size( - int max_allowed_team_size, - int &suggested_vector_size_, - int &suggested_team_size_, - idx nr, idx nnz){ - - - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - - suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; - } -#else - (void)max_allowed_team_size; - (void)nr; - (void)nnz; -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - suggested_team_size_ = 1; - } -#endif - -} - -#else template void get_suggested_vector_size( int &suggested_vector_size_, - idx nr, idx nnz){ - - suggested_vector_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - 
suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - + idx nr, idx nnz) { + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } //Get the best team size for the given functor. @@ -224,36 +103,28 @@ void get_suggested_vector_size( template int get_suggested_team_size(Functor& f, int vector_size) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } -#endif //ifdef KOKKOS_ENABLE_DEPRECATED_CODE ... else - template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp = team_policy_t(1, 1, vector_size). set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template ( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif //std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" << vector_size << " tsm:" << teamSizeMax<< std::endl; team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); @@ -1186,21 +1048,12 @@ void symmetrize_and_get_lower_diagonal_edge_list( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, FSCH); @@ -1261,21 +1114,12 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(fse); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); -#endif team_policy 
pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", @@ -1311,22 +1155,13 @@ void symmetrize_graph_symbolic_hashmap( int teamSizeMax = 0; int vector_size = 0; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy::team_size_max(FSCH); - get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - xadj.extent(0) - 1, nnz); -#else get_suggested_vector_size( vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); -#endif team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp index a83752b282..0b8724b794 100644 --- a/src/common/KokkosKernels_default_types.hpp +++ b/src/common/KokkosKernels_default_types.hpp @@ -82,6 +82,8 @@ #if defined(KOKKOS_ENABLE_CUDA) typedef Kokkos::Cuda default_device; +#elif defined(KOKKOS_ENABLE_HIP) + typedef Kokkos::Experimental::HIP default_device; #elif defined(KOKKOS_ENABLE_OPENMP) typedef Kokkos::OpenMP default_device; #elif defined(KOKKOS_ENABLE_PTHREAD) || defined(KOKKOS_ENABLE_THREADS) diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp index 83070c6e66..2e9a4bc03d 100644 --- a/src/graph/KokkosGraph_Distance1Color.hpp +++ b/src/graph/KokkosGraph_Distance1Color.hpp @@ -44,8 +44,6 @@ #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP -#include - #include "KokkosGraph_Distance1ColorHandle.hpp" #include "KokkosGraph_Distance1Color_impl.hpp" #include "KokkosKernels_Utils.hpp" diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 49e20d5395..077104ef9f 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -109,7 +109,7 @@ class GraphColoringHandle typedef typename Kokkos::View nnz_lno_persistent_work_view_t; typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - typedef Kokkos::TeamPolicy team_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; typedef typename Kokkos::View non_const_1d_size_type_view_t; @@ -229,54 +229,34 @@ class GraphColoringHandle } - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. + /** \brief Chooses best algorithm based on the execution space. COLORING_SERIAL if serial, otherwise COLORING_VBBIT. 
+ * VBBIT is the fastest parallel algorithm (unless on GPU and the graph's maximum degree is very large, but + * we don't have information about the graph here) */ void choose_default_algorithm() { -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ + auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); + if(exec == KokkosKernels::Impl::Exec_SERIAL) + { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; #endif } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else + { this->coloring_algorithm_type = COLORING_VB; #ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_VB" << std::endl; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VB\n"; #endif } -#endif } template @@ -357,7 +337,7 @@ class GraphColoringHandle } }, new_edge_count); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { lower_xadj_counts(ii + 1) = new_edge_count; }); } @@ -463,7 +443,7 @@ class GraphColoringHandle row_index_view_type xadj, nonzero_view_type adj){ KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list - + ( nv, xadj, @@ -496,13 +476,8 @@ class GraphColoringHandle size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type new_num_edge = 0; - typedef Kokkos::RangePolicy my_exec_space; - - if ( false -#if defined( KOKKOS_ENABLE_CUDA ) - || std::is_same::value -#endif - ) + typedef Kokkos::RangePolicy my_exec_space; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -510,30 +485,22 @@ class GraphColoringHandle int vector_size = 0; CountLowerTriangleTeam clt (nv, xadj, adj, lower_count); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - int max_allowed_team_size = team_policy_t::team_size_max(clt); - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, - vector_size, - teamSizeMax, - nv, ne); -#else + KokkosKernels::Impl::get_suggested_vector_size( vector_size, nv, ne); teamSizeMax = KokkosKernels::Impl::get_suggested_team_size(clt, vector_size); -#endif Kokkos::parallel_for("KokkosGraph::CountLowerTriangleTeam", team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), clt//, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); 
//Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - HandleExecSpace().fence(); + ExecutionSpace().fence(); auto lower_total_count = Kokkos::subview(lower_count, nv); auto hlower = Kokkos::create_mirror_view (lower_total_count); Kokkos::deep_copy (hlower, lower_total_count); @@ -559,7 +526,7 @@ class GraphColoringHandle //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); nnz_lno_persistent_work_view_t half_src (Kokkos::ViewAllocateWithoutInitializing("HALF SRC"),new_num_edge); nnz_lno_persistent_work_view_t half_dst (Kokkos::ViewAllocateWithoutInitializing("HALF DST"),new_num_edge); diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp index dacf9c99db..53f2b4a26b 100644 --- a/src/graph/KokkosGraph_Distance2Color.hpp +++ b/src/graph/KokkosGraph_Distance2Color.hpp @@ -245,80 +245,6 @@ void bipartite_color_columns( gch_d2->set_coloring_time(timer.seconds()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -/** - * (DEPRECATED) Compute the left-side coloring of a bipartite matrix/graph. - * Equivalent to bipartite_color_rows(), except this interface requires the user - * to compute (col_map, col_entries) as the transpose of the graph (if nonsymmetric). - * - * This function is deprecated because it's not possible to support both undirected - * distance-2 coloring and bipartite one-sided coloring - * in a single interface. However, if the input graph has all diagonal entries present and - * is symmetric (which is generally the case for discretized PDE matrices), then this - * function is also equivalent to graph_color_distance2(). - * - * In any case, the graphs (row_map, row_entries) and (col_map, col_entries) must be transposes - * of each other. - * - * @param[in] handle The Kernel Handle - * @param[in] num_rows Number of rows in the matrix (number of vertices) - * @param[in] num_cols Number of columns in the matrix - * @param[in] row_map Row map - * @param[in] row_entries Row entries - * @param[in] col_map Column map - * @param[in] col_entries Column entries - */ -template -void graph_compute_distance2_color(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_cols, - lno_row_view_t_ row_map, - lno_nnz_view_t_ row_entries, - // If graph is symmetric, simply pass the same graph twice: - // row_map == col_map, row_entries == col_entries - lno_col_view_t_ col_map, - lno_colnnz_view_t_ col_entries) -{ - using lno_t = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using memory_space = typename KernelHandle::HandleTempMemorySpace; - static_assert(std::is_same::value, - "Row and col maps must have the same value type (size_type)."); - static_assert(std::is_same::value, - "Row and col entries must have the same value type (nnz_lno_t)."); - //Internally, coloring accesses the graph through unmanaged views - //These are explicitly nonconst so that copies of adj for edge-filtering - //(which must be mutable) can use the same type. - // - //The original input graphs will never be modified. 
- using InternalRowmap = Kokkos::View >; - using InternalColinds = Kokkos::View >; - if(row_entries.extent(0) != col_entries.extent(0)) - { - throw std::runtime_error("row_entries and col_entries must represent transposes of each other, but they have different lengths"); - } - Kokkos::Impl::Timer timer; - // Set our handle pointer to a GraphColoringHandleType. - auto *gch_d2 = handle->get_distance2_graph_coloring_handle(); - // Create a view to save the colors to. - using color_view_type = typename KernelHandle::GraphColorDistance2HandleType::color_view_type; - color_view_type colors_out("Graph Colors", num_rows); - InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); - InternalColinds rowentries_internal(row_entries.data(), row_entries.extent(0)); - InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); - InternalColinds colentries_internal(col_entries.data(), col_entries.extent(0)); - Impl::GraphColorDistance2 - gc(num_rows, num_cols, row_map, row_entries, col_map, col_entries, gch_d2); - gc.compute_distance2_color(); - gch_d2->add_to_overall_coloring_time(timer.seconds()); - gch_d2->set_coloring_time(timer.seconds()); -} -#endif - } // end namespace Experimental } // end namespace KokkosGraph diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index f4624f545b..39d66b744f 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -198,71 +198,27 @@ class GraphColorDistance2Handle * Chooses best algorithm based on the execution space. * * This chooses the best algorithm based on the execution space: - * - COLORING_D2_SERIAL if the execution space is SERIAL - * - COLORING_D2_NB_BIT otherwise + * - COLORING_D2_SERIAL if the execution space is SERIAL (more work efficient than NB_BIT) + * - COLORING_D2_NB_BIT otherwise (fastest parallel algorithm) * */ void choose_default_algorithm() { - bool found = false; -#if defined(KOKKOS_ENABLE_SERIAL) - if(std::is_same::value) + if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_D2_SERIAL; - found = true; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL" << std::endl; +#ifdef VERBOSE + std:cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_QTHREAD) - if(std::is_same::value) + else { this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; +#ifdef VERBOSE + std:cout << ExecutionSpace::name() << " Execution Space, Default 
Algorithm: COLORING_D2_NB_BIT\n"; #endif } -#endif - //Since this logic is based on checking every exec space, detect when a new one needs to be supported - if(!found) - throw std::logic_error("D2 coloring: default algorithm hasn't been chosen for the current execution space"); } diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp new file mode 100644 index 0000000000..212cb7c383 --- /dev/null +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -0,0 +1,117 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSGRAPH_EXPLICIT_COARSEN_HPP +#define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP + +#include "KokkosGraph_ExplicitCoarsening_impl.hpp" +#include "KokkosKernels_SparseUtils.hpp" + +namespace KokkosGraph { +namespace Experimental { + +//Given a CRS graph and coarse labels, produce a new CRS graph representing the coarsened graph. +//If A is nonsquare, entries in columns >= numVerts are discarded. +//The labels should be in the range [0, numCoarseVerts), and the output graph wil have numCoarseVerts. +// +//If compress, sort and merge entries in each row. +//An uncompressed graph will still work as input to some things like D1 graph coloring. 
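As a concrete illustration of the coarsening entry point declared just below, here is a minimal, hypothetical usage sketch. It is not part of the patch: the 4-vertex path graph, the label assignment, and the assumption that the device type is the leading template parameter (the view types then deduce from the arguments) are all illustrative.

```cpp
// Illustrative sketch, not part of the patch: coarsen a 4-vertex path graph
// 0-1-2-3 into 2 coarse vertices using an explicit label array.
#include <Kokkos_Core.hpp>
#include "KokkosGraph_ExplicitCoarsening.hpp"

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using device_t  = Kokkos::DefaultExecutionSpace::device_type;
    using rowmap_t  = Kokkos::View<size_t*, device_t>;
    using entries_t = Kokkos::View<int*, device_t>;

    // Symmetric path graph: row 0 = {1}, row 1 = {0,2}, row 2 = {1,3}, row 3 = {2}.
    rowmap_t fineRowmap("fine rowmap", 5);
    entries_t fineEntries("fine entries", 6);
    entries_t labels("labels", 4);
    auto hRowmap  = Kokkos::create_mirror_view(fineRowmap);
    auto hEntries = Kokkos::create_mirror_view(fineEntries);
    auto hLabels  = Kokkos::create_mirror_view(labels);
    const size_t rm[5] = {0, 1, 3, 5, 6};
    const int    en[6] = {1, 0, 2, 1, 3, 2};
    const int    lb[4] = {0, 0, 1, 1};  // vertices {0,1} -> coarse 0, {2,3} -> coarse 1
    for(int i = 0; i < 5; i++) hRowmap(i)  = rm[i];
    for(int i = 0; i < 6; i++) hEntries(i) = en[i];
    for(int i = 0; i < 4; i++) hLabels(i)  = lb[i];
    Kokkos::deep_copy(fineRowmap, hRowmap);
    Kokkos::deep_copy(fineEntries, hEntries);
    Kokkos::deep_copy(labels, hLabels);

    rowmap_t coarseRowmap;
    entries_t coarseEntries;
    // With compress left at its default (true), the outputs describe the
    // 2-vertex coarse graph with rows sorted and duplicate entries merged.
    KokkosGraph::Experimental::graph_explicit_coarsen<device_t>(
        fineRowmap, fineEntries, labels, 2, coarseRowmap, coarseEntries);
  }
  Kokkos::finalize();
  return 0;
}
```

In practice the labels would typically come from a clustering or aggregation step rather than being hand-written as above; the example only shows the shape of the call.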
+ +template +void graph_explicit_coarsen( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +//Same as above, but also produce the map from coarse vertices to fine vertices (inverse map of labels) +template +void graph_explicit_coarsen_with_inverse_map( + const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, + bool compress = true) +{ + using size_type = typename fine_rowmap_t::non_const_value_type; + using lno_t = typename fine_entries_t::non_const_value_type; + using exec_space = typename device_t::execution_space; + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening + egc(fineRowmap, fineEntries, labels, numCoarseVerts); + coarseRowmap = egc.coarseRowmap; + coarseEntries = egc.coarseEntries; + inverseOffsets = egc.clusterOffsets; + inverseLabels = egc.clusterVerts; + if(compress) + { + coarse_rowmap_t mergedRowmap; + coarse_entries_t mergedEntries; + KokkosKernels::Impl::sort_and_merge_graph + (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + } +} + +}} + +#endif diff --git a/src/graph/KokkosGraph_GraphColorHandle.hpp b/src/graph/KokkosGraph_GraphColorHandle.hpp index de9fd6d8f4..9526c34b0e 100644 --- a/src/graph/KokkosGraph_GraphColorHandle.hpp +++ b/src/graph/KokkosGraph_GraphColorHandle.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1ColorHandle.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_GraphColorHandle.hpp header is replaced by KokkosGraph_Distance1ColorHandle.hpp") - -#endif - - diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp new file mode 100644 index 0000000000..c578a97271 --- /dev/null +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -0,0 +1,108 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_HPP + +#include "KokkosGraph_Distance2MIS_impl.hpp" + +namespace KokkosGraph{ + +enum MIS2_Algorithm +{ + MIS2_QUALITY, + MIS2_FAST +}; + +namespace Experimental{ + +// Compute a distance-2 maximal independent set, given a symmetric CRS graph. +// Returns a list of the vertices in the set. +// +// Column indices >= num_verts are ignored. + +template +lno_view_t +graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //zero vertices means the MIS is empty. + return lno_view_t(); + } + switch(algo) + { + case MIS2_QUALITY: + { + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); + return mis.compute(); + } + case MIS2_FAST: + { + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); + return mis.compute(); + } + } + throw std::invalid_argument("graph_d2_mis: invalid algorithm"); +} + +template +labels_t +graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename colinds_t::non_const_value_type& numClusters, MIS2_Algorithm algo = MIS2_FAST) +{ + if(rowmap.extent(0) <= 1) + { + //there are no vertices to label + return labels_t(); + } + labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); + numClusters = mis2.extent(0); + Impl::D2_MIS_Coarsening coarsening(rowmap, colinds, mis2); + return coarsening.compute(); +} + +} // end namespace Experimental +} // end namespace KokkosGraph + +#endif diff --git a/src/graph/KokkosGraph_RCM.hpp b/src/graph/KokkosGraph_RCM.hpp new file mode 100644 index 0000000000..8f1109aa63 --- /dev/null +++ b/src/graph/KokkosGraph_RCM.hpp @@ -0,0 +1,78 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_RCM_HPP +#define _KOKKOSGRAPH_RCM_HPP + +#include "KokkosGraph_BFS_impl.hpp" + +namespace KokkosGraph +{ +namespace Experimental +{ + +//Compute the reverse Cuthill-McKee ordering of a graph. +//The graph must be symmetric, but it may have any number of connected components. +//This function returns a list of vertices in RCM order. + +template +labels_t +graph_rcm(const rowmap_t& rowmap, const colinds_t& colinds) +{ + using lno_t = typename colinds_t::non_const_value_type; + if(rowmap.extent(0) <= 2) + { + //there are 0 or 1 vertices - return trivial ordering + lno_t numVerts = rowmap.extent(0); + if(numVerts) + numVerts--; + return labels_t("RCM Labels", numVerts); + } + Impl::SerialRCM algo(rowmap, colinds); + return algo.rcm(); +} + +}} //namespace KokkosGraph::Experimental + +#endif diff --git a/src/graph/KokkosGraph_graph_color.hpp b/src/graph/KokkosGraph_graph_color.hpp index 4494ecc509..9526c34b0e 100644 --- a/src/graph/KokkosGraph_graph_color.hpp +++ b/src/graph/KokkosGraph_graph_color.hpp @@ -49,12 +49,3 @@ * KokkosGraph_Distance1Color.hpp to be more consistent with file naming * used in other places within Kokkos-Kernels. 
*/ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -#include "KokkosGraph_Distance1Color.hpp" - -// This interface should be deprecated in version 3.0 -#pragma message("DEPRECATION WARNING: The KokkosGraph_graph_color.hpp header will be replaced by KokkosGraph_Distance1Color.hpp") - -#endif - - diff --git a/src/graph/impl/KokkosGraph_BFS_impl.hpp b/src/graph/impl/KokkosGraph_BFS_impl.hpp new file mode 100644 index 0000000000..df652902c0 --- /dev/null +++ b/src/graph/impl/KokkosGraph_BFS_impl.hpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_BFS_IMPL_HPP +#define _KOKKOSGRAPH_BFS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Utils.hpp" +#include +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +template +struct SerialRCM +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using host_rowmap_t = Kokkos::View; + using host_lno_view_t = Kokkos::View; + + lno_t numVerts; + host_rowmap_t rowmap; + host_lno_view_t entries; + + SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) : + numVerts(rowmap_.extent(0) - 1), + rowmap(Kokkos::ViewAllocateWithoutInitializing("HostRowmap"), rowmap_.extent(0)), + entries(Kokkos::ViewAllocateWithoutInitializing("HostEntries"), entries_.extent(0)) + { + Kokkos::deep_copy(rowmap, rowmap_); + Kokkos::deep_copy(entries, entries_); + } + + lno_t findPseudoPeripheral() + { + //Choose vertex with smallest degree + lno_t periph = -1; + lno_t periphDeg = numVerts; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t deg = rowmap(i + 1) - rowmap(i); + if(deg < periphDeg) + { + periph = i; + periphDeg = deg; + if(deg == 0) + break; + } + } + return periph; + } + + lno_view_t rcm() + { + lno_t start = findPseudoPeripheral(); + host_lno_view_t q(Kokkos::ViewAllocateWithoutInitializing("Queue"), numVerts); + host_lno_view_t label(Kokkos::ViewAllocateWithoutInitializing("Permutation"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + label(i) = -1; + lno_t qhead = 0; + lno_t qtail = 0; + label(start) = qtail; + q(qtail++) = start; + std::vector neighbors; + lno_t outerQueue = 0; + while(true) + { + lno_t v = q(qhead++); + neighbors.clear(); + for(size_type j = rowmap(v); j < rowmap(v + 1); j++) + { + lno_t nei = entries(j); + if(nei == v || nei >= numVerts) + continue; + if(label(nei) == -1) + { + neighbors.push_back(nei); + } + } + std::sort(neighbors.begin(), neighbors.end(), + [&](lno_t n1, lno_t n2) -> bool + { + //return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); + //label and enqueue all unlabeled neighbors + for(lno_t nei : neighbors) + { + label(nei) = qtail; + q(qtail++) = nei; + } + if(qtail == numVerts) + { + //have labeled all vertices + break; + } + else if(qhead == qtail) + { + //have exhausted this connected component, but others remain unlabeled + while(label(outerQueue) != -1) + outerQueue++; + label(outerQueue) = qtail; + q(qtail++) = outerQueue; + } + } + lno_view_t labelOut(Kokkos::ViewAllocateWithoutInitializing("RCM Permutation"), numVerts); + //reverse the labels + for(lno_t i = 0; i < numVerts; i++) + label(i) = numVerts - label(i) - 1; + Kokkos::deep_copy(labelOut, label); + return labelOut; + } +}; + +}}} //namespace KokkosGraph::Experimental::Impl +#endif diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 4e6f322bce..110756a364 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -124,7 +124,13 @@ class GraphColor const_lno_nnz_view_t entries, HandleType *coloring_handle): nv (nv_), ne(ne_),xadj(row_map), adj (entries), - kok_src(), kok_dst(), cp(coloring_handle){} + kok_src(), kok_dst(), cp(coloring_handle) + { + static_assert(std::is_same::value, + "Row map element type does not match 
handle's size_type."); + static_assert(std::is_same::value, + "Entries element type does not match handle's nnz_lno_t."); + } /** \brief GraphColor destructor. */ diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp new file mode 100644 index 0000000000..866ad54daf --- /dev/null +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -0,0 +1,975 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP +#define _KOKKOSGRAPH_DISTANCE2_MIS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "Kokkos_Bitset.hpp" +#include "KokkosKernels_Utils.hpp" +#include + +namespace KokkosGraph { +namespace Experimental { +namespace Impl { + +template +struct D2_MIS_RandomPriority +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + //The type of status/priority values. 
+ using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_mem = typename team_pol::member_type; + using all_worklists_t = Kokkos::View; + using worklist_t = Kokkos::View; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. + // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_RandomPriority(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + allWorklists = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); + } + + struct RefreshRowStatus + { + RefreshRowStatus(const status_view_t& rowStatus_, const worklist_t& worklist_, lno_t nvBits_, int round) + : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) + { + hashedRound = xorshiftHash(round); + } + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Combine vertex and round to get some pseudorandom priority bits that change each round + status_t priority = xorshiftHash(i + hashedRound); + //Generate unique status per row, with IN_SET < status < OUT_SET, + int priorityBits = sizeof(status_t) * 8 - nvBits; + status_t priorityMask = 1; + priorityMask <<= priorityBits; + priorityMask--; + status_t newStatus = (status_t) (i + 1) + ((priority & priorityMask) << nvBits); + if(newStatus == OUT_SET) + newStatus--; + rowStatus(i) = newStatus; + } + + status_view_t rowStatus; + worklist_t worklist; + int nvBits; + uint32_t hashedRound; + }; + + struct RefreshColStatus + { + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, lno_t worklistLen_) + : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklistLen(worklistLen_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = OUT_SET; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? 
i : entries(j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using MinReducer = Kokkos::Min; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, status_t& ls) + { + lno_t nei = (j == rowLen) ? i : entries(rowBegin + j); + if(nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < ls) + ls = neiStat; + } + }, MinReducer(s)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(s == IN_SET) + s = OUT_SET; + colStatus(i) = s; + }); + } + + status_view_t colStatus; + worklist_t worklist; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_t worklistLen; + }; + + struct DecideSetFunctor + { + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const worklist_t& worklist_, lno_t worklistLen_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), worklistLen(worklistLen_) + {} + + //Enum values to be used as flags, so that the team policy version can + //express the neighbor checking as an OR-reduction + enum + { + NEI_OUT_SET = 1, + NEI_DIFFERENT_STATUS = 2 + }; + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem& t) const + { + using OrReducer = Kokkos::BOr; + lno_t w = t.league_rank() * t.team_size() + t.team_rank(); + if(w >= worklistLen) + return; + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + if(s == IN_SET || s == OUT_SET) + return; + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowLen = rowEnd - rowBegin; + int flags = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, rowLen + 1), + [&](lno_t j, int& lflags) + { + lno_t nei = (j == rowLen) ? 
i : entries(rowBegin + j); + if(nei >= nv) + return; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + lflags |= NEI_OUT_SET; + else if(neiStat != s) + lflags |= NEI_DIFFERENT_STATUS; + }, OrReducer(flags)); + Kokkos::single(Kokkos::PerThread(t), + [&]() + { + if(flags & NEI_OUT_SET) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i. + rowStatus(i) = OUT_SET; + } + else if(!(flags & NEI_DIFFERENT_STATUS)) + { + //all neighboring col statuses match s, therefore s is the minimum status among all d2 neighbors + rowStatus(i) = IN_SET; + } + }); + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + worklist_t worklist; + lno_t worklistLen; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const worklist_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + worklist_t worklist; + }; + + struct CompactWorklistFunctor + { + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) + : src(src_), dst(dst_), status(status_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const + { + lno_t i = src(w); + status_t s = status(i); + if(s != IN_SET && s != OUT_SET) + { + //next worklist needs to contain i + if(finalPass) + dst(lNumInSet) = i; + lNumInSet++; + } + } + + worklist_t src; + worklist_t dst; + status_view_t status; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + worklist_t rowWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 0); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(rowWorklist)); + worklist_t colWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 1); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); + worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; + int refreshColTeamSize = 0; + int decideSetTeamSize = 0; + if(useTeams) + { + team_pol dummyPolicy(1, 1, vectorLength); + //Compute the recommended team size for RefreshColStatus and DecideSetFunctor (will be constant) + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + } + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, 
rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); + } + } + while(true) + { + //Compute new row statuses + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + //Compute new col statuses + { + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), refreshCol); + else + Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); + } + //Decide row statuses where enough information is available + { + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + if(useTeams) + Kokkos::parallel_for(team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); + else + Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); + } + //Compact row worklist + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), rowWorkLen); + if(rowWorkLen == 0) + break; + std::swap(rowWorklist, thirdWorklist); + //Compact col worklist + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), colWorkLen); + std::swap(colWorklist, thirdWorklist); + round++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. + lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + all_worklists_t allWorklists; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; +}; + +// UNUSED CODE +// Version of RefreshRowStatus, which does linear interpolation between a degree-based score and a random score. +// By gradually increasing the interpolation coefficient in favor of random, the MIS can converge much faster than +// constant priorities. +// +// KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const +// { +// lno_t i = worklist(w); +// int degBits = sizeof(status_t) * 8 - nvBits; +// if(degBits == 0) +// { +// //no space to store degree information. Algorithm will still work but will +// //probably produce a lower quality MIS. 
+// rowStatus(i) = i + 1; +// return; +// } +// //Combine vertex and round to get some pseudorandom priority bits that change each round +// status_t maxDegRange = (((status_t) 1) << degBits) - 2; +// lno_t deg = rowmap(i + 1) - rowmap(i); +// //Compute degree-based score and random score +// float degScore = (float) (deg - minDeg) * invDegRange; +// float randScore = (xorshiftHash(i + hashedRound) & 0xFFFF) / 65536.f; +// //Then linearly interpolate using k +// float finalScore = k * randScore + (1.f - k) * degScore; +// rowStatus(i) = (status_t) (i + 1) + (((status_t) (finalScore * maxDegRange)) << nvBits); +// } +// */ + +template +struct D2_MIS_FixedPriority +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + // Priority values 0 and max are special, they mean the vertex is + // in the independent set or eliminated from consideration, respectively. + // Values in between represent a priority for being added to the set, + // based on degree and vertex ID as a tiebreak + // (higher priority = less preferred to being in the independent set) + + static constexpr status_t IN_SET = 0; + static constexpr status_t OUT_SET = ~IN_SET; + + D2_MIS_FixedPriority(const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), + worklist1(Kokkos::ViewAllocateWithoutInitializing("WL1"), numVerts), + worklist2(Kokkos::ViewAllocateWithoutInitializing("WL2"), numVerts) + { + status_t i = numVerts + 1; + nvBits = 0; + while(i) + { + i >>= 1; + nvBits++; + } + //Each value in rowStatus represents the status and priority of each row. + //Each value in colStatus represents the lowest nonzero priority of any row adjacent to the column. + // This counts up monotonically as vertices are eliminated (given status OUT_SET) + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + KokkosKernels::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); + //Compute row statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitRowStatus(rowStatus, rowmap, numVerts, nvBits, minDegree, maxDegree)); + //Compute col statuses + Kokkos::parallel_for(range_pol(0, numVerts), InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); + } + + struct InitRowStatus + { + InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, lno_t nv_, lno_t nvBits_, lno_t minDeg_, lno_t maxDeg_) + : rowStatus(rowStatus_), rowmap(rowmap_), nv(nv_), nvBits(nvBits_), minDeg(minDeg_), maxDeg(maxDeg_), invDegRange(1.f / (maxDeg - minDeg)) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //Generate unique status per row, with IN_SET < status < OUT_SET, + int degBits = sizeof(status_t) * 8 - nvBits; + if(degBits == 0) + { + //no space to store degree information. Algorithm will still work but will + //probably produce a lower quality MIS. 
+ rowStatus(i) = i + 1; + return; + } + status_t maxDegRange = (((status_t) 1) << degBits) - 2; + lno_t deg = rowmap(i + 1) - rowmap(i); + float degScore = (float) (deg - minDeg) * invDegRange; + rowStatus(i) = (status_t) (i + 1) + (((status_t) (degScore * maxDegRange)) << nvBits); + } + + status_view_t rowStatus; + rowmap_t rowmap; + lno_t nv; + int nvBits; + lno_t minDeg; + lno_t maxDeg; + float invDegRange; + }; + + struct InitColStatus + { + InitColStatus(const status_view_t& colStatus_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : colStatus(colStatus_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + //iterate over {i} union the neighbors of i, to find + //minimum status. + status_t s = rowStatus(i); + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != i && nei < nv) + { + status_t neiStat = rowStatus(nei); + if(neiStat < s) + s = neiStat; + } + } + colStatus(i) = s; + } + + status_view_t colStatus; + status_view_t rowStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct IterateStatusFunctor + { + IterateStatusFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_, const bitset_t& colUpdateBitset_) + : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_), worklist(worklist_), colUpdateBitset(colUpdateBitset_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t i = worklist(w); + //Processing row i. + status_t s = rowStatus(i); + //s is the status which must be the minimum among all neighbors + //to decide that i is IN_SET. + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + bool neiOut = false; + bool neiMismatchS = false; + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei >= nv) + continue; + status_t neiStat = colStatus(nei); + if(neiStat == OUT_SET) + { + neiOut = true; + break; + } + else if(neiStat != s) + { + neiMismatchS = true; + } + } + bool statusChanged = neiOut || !neiMismatchS; + if(neiOut) + { + //In order to make future progress, need to update the + //col statuses for all neighbors of i which have status s. + //This will increase the minimum to the next smallest row, + //so that another nearby vertex can be added to the set. + rowStatus(i) = OUT_SET; + } + else if(!neiMismatchS) + { + rowStatus(i) = IN_SET; + } + if(statusChanged) + { + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? i : entries(j); + if(nei < nv && colStatus(nei) == s) + colUpdateBitset.set(nei); + } + } + //else: still undecided + } + + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + lno_view_t worklist; + bitset_t colUpdateBitset; + }; + + struct UpdateWorklistFunctor + { + UpdateWorklistFunctor(const status_view_t& rowStatus_, const lno_view_t& oldWorklist_, const lno_view_t& newWorklist_) + : rowStatus(rowStatus_), oldWorklist(oldWorklist_), newWorklist(newWorklist_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, bool finalPass) const + { + //processing row i + lno_t i = oldWorklist(w); + //Bit i will be set when it's decided IN_SET/OUT_SET. 
+ //If clear, vertex i needs to be processed still. + status_t s = rowStatus(i); + if(s != IN_SET && s != OUT_SET) + { + if(finalPass) + newWorklist(lcount) = i; + lcount++; + } + } + + status_view_t rowStatus; + lno_view_t oldWorklist; + lno_view_t newWorklist; + }; + + struct ColRefreshWorklist + { + ColRefreshWorklist(const bitset_t& colUpdateBitset_, const lno_view_t& refreshList_) + : colUpdateBitset(colUpdateBitset_), refreshList(refreshList_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, bool finalPass) const + { + if(colUpdateBitset.test(i)) + { + if(finalPass) + { + refreshList(lindex) = i; + colUpdateBitset.reset(i); + } + lindex++; + } + } + + bitset_t colUpdateBitset; + lno_view_t refreshList; + }; + + struct RefreshColStatus + { + RefreshColStatus(const lno_view_t& worklist_, const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) + : worklist(worklist_), rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t w) const + { + lno_t col = worklist(w); + status_t minNeiStat = OUT_SET; + size_type rowBegin = rowmap(col); + size_type rowEnd = rowmap(col + 1); + for(size_type j = rowBegin; j <= rowEnd; j++) + { + lno_t nei = (j == rowEnd) ? col : entries(j); + if(nei >= nv) + continue; + status_t neiStat = rowStatus(nei); + if(neiStat < minNeiStat) + minNeiStat = neiStat; + } + if(minNeiStat == IN_SET) + minNeiStat = OUT_SET; + colStatus(col) = minNeiStat; + } + + lno_view_t worklist; + status_view_t rowStatus; + status_view_t colStatus; + rowmap_t rowmap; + entries_t entries; + lno_t nv; + }; + + struct InitWorklistFunctor + { + InitWorklistFunctor(const lno_view_t& worklist_) + : worklist(worklist_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + worklist(i) = i; + } + lno_view_t worklist; + }; + + struct CountInSet + { + CountInSet(const status_view_t& rowStatus_) + : rowStatus(rowStatus_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet) const + { + if(rowStatus(i) == IN_SET) + lNumInSet++; + } + status_view_t rowStatus; + }; + + struct CompactInSet + { + CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) + : rowStatus(rowStatus_), setList(setList_) + {} + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const + { + if(rowStatus(i) == IN_SET) + { + if(finalPass) + setList(lNumInSet) = i; + lNumInSet++; + } + } + status_view_t rowStatus; + lno_view_t setList; + }; + + lno_view_t compute() + { + //Initialize first worklist to 0...numVerts + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); + lno_t workRemain = numVerts; + int numIter = 0; + while(workRemain) + { + //do another iteration + Kokkos::parallel_for(range_pol(0, workRemain), + IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, numVerts, worklist1, colUpdateBitset)); + //And refresh the column statuses using the other worklist. + lno_t colsToRefresh; + Kokkos::parallel_scan(range_pol(0, numVerts), + ColRefreshWorklist(colUpdateBitset, worklist2), colsToRefresh); + Kokkos::parallel_for(range_pol(0, colsToRefresh), + RefreshColStatus(worklist2, rowStatus, colStatus, rowmap, entries, numVerts)); + //then build the next worklist with a scan. Also get the length of the next worklist. 
+ lno_t newWorkRemain = 0; + Kokkos::parallel_scan(range_pol(0, workRemain), + UpdateWorklistFunctor(rowStatus, worklist1, worklist2), + newWorkRemain); + //Finally, flip the worklists + std::swap(worklist1, worklist2); + workRemain = newWorkRemain; + numIter++; + } + //now that every vertex has been decided IN_SET/OUT_SET, + //build a compact list of the vertices which are IN_SET. + lno_t numInSet = 0; + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); + return setList; + } + + rowmap_t rowmap; + entries_t entries; + lno_t numVerts; + status_view_t rowStatus; + status_view_t colStatus; + //The number of bits required to represent vertex IDs, in the ECL-MIS tiebreak scheme: + // ceil(log_2(numVerts + 1)) + int nvBits; + lno_t minDegree; + lno_t maxDegree; + //Bitset representing columns whose status needs to be recomputed + //These bits are cleared after each refresh. + bitset_t colUpdateBitset; + lno_view_t worklist1; + lno_view_t worklist2; +}; + +template +struct D2_MIS_Coarsening +{ + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using lno_view_t = typename entries_t::non_const_type; + //The type of status/priority values. + using status_t = typename std::make_unsigned::type; + using status_view_t = Kokkos::View; + using range_pol = Kokkos::RangePolicy; + + D2_MIS_Coarsening(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), + numVerts(rowmap.extent(0) - 1), + labels(Kokkos::ViewAllocateWithoutInitializing("Cluster Labels"), numVerts) + { + Kokkos::deep_copy(labels, (lno_t) -1); + } + + //Phase 1 (over 0...numClusters) labels roots and immediate neighbors of roots. 
+ struct Phase1Functor + { + Phase1Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + lno_t root = mis2(i); + size_type rowBegin = rowmap(root); + size_type rowEnd = rowmap(root + 1); + labels(root) = i; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei != root && nei < numVerts) + { + labels(nei) = i; + } + } + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; + }; + + KOKKOS_INLINE_FUNCTION static uint32_t xorshiftHash(uint32_t in) + { + uint32_t x = in; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + //Phase 2 (over 0...numVerts) joins unlabeled vertices to the smallest adjacent cluster + struct Phase2Functor + { + Phase2Functor(const rowmap_t& rowmap_, const entries_t& entries_, const labels_t& mis2_, lno_t numVerts_, const labels_t& labels_) + : rowmap(rowmap_), entries(entries_), mis2(mis2_), numVerts(numVerts_), labels(labels_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const + { + if(labels(i) != (lno_t) -1) + return; + size_type rowBegin = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t cluster = -1; + uint32_t minScore = ~(uint32_t) 0; + for(size_type j = rowBegin; j < rowEnd; j++) + { + lno_t nei = entries(j); + if(nei == i || nei >= numVerts) + continue; + lno_t neiCluster = labels(nei); + if(neiCluster != -1 && neiCluster != cluster) + { + //check if this cluster is smaller + uint32_t score = xorshiftHash(i + xorshiftHash(neiCluster)); + if(score < minScore) + { + cluster = neiCluster; + minScore = score; + } + } + } + labels(i) = cluster; + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; + }; + + labels_t compute() + { + lno_t numClusters = mis2.extent(0); + Kokkos::parallel_for(range_pol(0, numClusters), Phase1Functor(rowmap, entries, mis2, numVerts, labels)); + Kokkos::parallel_for(range_pol(0, numVerts), Phase2Functor(rowmap, entries, mis2, numVerts, labels)); + return labels; + } + + rowmap_t rowmap; + entries_t entries; + labels_t mis2; + lno_t numVerts; + labels_t labels; +}; + +}}} + +#endif diff --git a/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp new file mode 100644 index 0000000000..51fa777c79 --- /dev/null +++ b/src/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -0,0 +1,303 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP +#define KOKKOSGRAPH_EXPLICIT_COARSEN_IMPL_HPP + +namespace KokkosGraph { +namespace Impl { + +template +struct ExplicitGraphCoarsening +{ + using exec_space = typename device_t::execution_space; + using range_pol = Kokkos::RangePolicy; + using team_pol = Kokkos::TeamPolicy; + using team_member_t = typename team_pol::member_type; + using bitset_t = Kokkos::Bitset; + using const_bitset_t = Kokkos::ConstBitset; + + struct ClusterSizeFunctor + { + ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) + : counts(counts_), vertClusters(vertClusters_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + Kokkos::atomic_increment(&counts(vertClusters(i))); + } + ordinal_view_t counts; + labels_t vertClusters; + }; + + struct FillClusterVertsFunctor + { + FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const ordinal_view_t& insertCounts_) + : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + lno_t cluster = vertClusters(i); + lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + clusterVerts(offset) = i; + } + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + ordinal_view_t insertCounts; + }; + + struct BuildCrossClusterMaskFunctor + { + BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& mask_) + : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) + {} + + //Used a fixed-size hash set in shared memory + KOKKOS_INLINE_FUNCTION constexpr int tableSize() const + { + //Should always be a power-of-two, so that X % tableSize() reduces to a bitwise and. + return 512; + } + + //Given a cluster index, get the hash table index. + //This is the 32-bit xorshift RNG, but it works as a hash function. 
+ KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(lno_t cluster) const + { + unsigned x = cluster; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + KOKKOS_INLINE_FUNCTION bool lookup(lno_t cluster, int* table) const + { + unsigned h = xorshiftHash(cluster); + for(unsigned i = h; i < h + 2; i++) + { + if(table[i % tableSize()] == cluster) + return true; + } + return false; + } + + //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) + //by inserting nei into the table. + KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const + { + unsigned h = xorshiftHash(nei); + for(unsigned i = h; i < h + 2; i++) + { + if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) + return true; + } + return false; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const + { + lno_t cluster = t.league_rank(); + lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); + //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. + //If it fills up (very unlikely) then just count every remaining edge going to another cluster + //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. + //each thread handles a cluster + int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); + //mark every entry as cluster (self-loop) to represent free/empty + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), + [&](const lno_t i) + { + table[i] = cluster; + }); + t.team_barrier(); + //now, for each row belonging to the cluster, iterate through the neighbors + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), + [&] (const lno_t i) + { + lno_t row = clusterVerts(clusterOffsets(cluster) + i); + lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), + [&] (const lno_t j) + { + lno_t nei = colinds(rowmap(row) + j); + //Remote neighbors are not included + if(nei >= numRows) + return; + lno_t neiCluster = vertClusters(nei); + if(neiCluster != cluster) + { + //Have a neighbor. Try to find it in the table. + if(!lookup(neiCluster, table)) + { + //Not in the table. Try to insert it. 
+ insert(cluster, neiCluster, table); + //Whether or not insertion succeeded, + //this is a cross-cluster edge possibly not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); + } + + size_t team_shmem_size(int teamSize) const + { + return tableSize() * sizeof(int); + } + + lno_t numRows; + fine_rowmap_t rowmap; + fine_entries_t colinds; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + bitset_t mask; + }; + + struct FillClusterEntriesFunctor + { + FillClusterEntriesFunctor( + const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, const coarse_rowmap_t& clusterRowmap_, const coarse_entries_t& clusterEntries_, const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, const labels_t& vertClusters_, const bitset_t& edgeMask_) + : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) + {} + //Run this scan over entries in clusterVerts (reordered point rows) + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, const bool& finalPass) const + { + lno_t numRows = rowmap.extent(0) - 1; + lno_t row = clusterVerts(i); + size_type rowStart = rowmap(row); + size_type rowEnd = rowmap(row + 1); + lno_t cluster = vertClusters(row); + lno_t clusterStart = clusterOffsets(cluster); + //Count the number of entries in this row. + //This is how much lcount will be increased by, + //yielding the offset corresponding to + //these point entries in the cluster entries. + lno_t rowEntries = 0; + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + rowEntries++; + } + if(finalPass) + { + //if this is the last row in the cluster, update the upper bound in clusterRowmap + if(i == clusterStart) + { + clusterRowmap(cluster) = lcount; + } + lno_t clusterEdge = lcount; + //populate clusterEntries for these edges + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + { + clusterEntries(clusterEdge++) = vertClusters(colinds(j)); + } + } + } + //update the scan result at the end (exclusive) + lcount += rowEntries; + if(i == numRows - 1 && finalPass) + { + //on the very last row, set the last entry of the cluster rowmap + clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; + } + } + fine_rowmap_t rowmap; + fine_entries_t colinds; + coarse_rowmap_t clusterRowmap; + coarse_entries_t clusterEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; + labels_t vertClusters; + const_bitset_t edgeMask; + }; + + //Constructor just does the computation and outputs to coarseRowmap, coarseEntries. 
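// Illustrative usage sketch (editorial, not part of this patch): the intended call pattern,
// assuming the labels view comes from a clustering such as D2_MIS_Coarsening::compute() and
// that numCoarseVerts equals the number of distinct cluster labels. Template arguments are
// elided here as "<...>".
//
//   ExplicitGraphCoarsening<...> egc(fineRowmap, fineEntries, labels, numCoarseVerts);
//   auto coarseRowmap  = egc.coarseRowmap;   // CRS row offsets of the cluster graph
//   auto coarseEntries = egc.coarseEntries;  // cluster-to-cluster adjacency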
+ ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, lno_t numCoarseVerts) + { + lno_t numFineVerts = fineRowmap.extent(0); + if(numFineVerts <= 1) + { + coarseRowmap = coarse_rowmap_t(); + coarseEntries = coarse_entries_t(); + return; + } + numFineVerts--; + clusterOffsets = ordinal_view_t("Cluster offsets", numCoarseVerts + 1); + clusterVerts = ordinal_view_t(Kokkos::ViewAllocateWithoutInitializing("Cluster verts"), numFineVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); + { + ordinal_view_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); + } + //Determine the set of edges (in the point graph) that cross between two distinct clusters + int vectorSize = KokkosKernels::Impl::kk_get_suggested_vector_size(numFineVerts, fineEntries.extent(0), KokkosKernels::Impl::kk_get_exec_space_type()); + bitset_t crossClusterEdgeMask(fineEntries.extent(0)); + size_type numClusterEdges; + { + BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask); + int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. + int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); + Kokkos::parallel_for(team_pol(numCoarseVerts, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); + numClusterEdges = crossClusterEdgeMask.count(); + } + coarseRowmap = coarse_rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph rowmap"), numCoarseVerts + 1); + coarseEntries = coarse_entries_t(Kokkos::ViewAllocateWithoutInitializing("Cluster graph colinds"), numClusterEdges); + Kokkos::parallel_scan(range_pol(0, numFineVerts), FillClusterEntriesFunctor + (fineRowmap, fineEntries, coarseRowmap, coarseEntries, clusterOffsets, clusterVerts, labels, crossClusterEdgeMask)); + } + + coarse_rowmap_t coarseRowmap; + coarse_entries_t coarseEntries; + ordinal_view_t clusterOffsets; + ordinal_view_t clusterVerts; +}; + +}} + +#endif diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e9596fb772..ced3476539 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,6 +50,7 @@ #include "cusparse.h" #include "KokkosKernels_SparseUtils_cusparse.hpp" #include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_impl.hpp" namespace KokkosSparse { namespace Impl { @@ -64,8 +65,18 @@ namespace Impl { const YVector& y) { using KAT = Kokkos::Details::ArithTraits; - std::cout << "It is currently not possible to use the native SpMV implementation" - " when cuSPARSE is enabled" << std::endl; + if (beta == KAT::zero ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } + else { + KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); + } } template @@ -84,9 +95,24 @@ namespace Impl { 
cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); /* Set the operation mode */ - cusparseOperation_t myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; - if(mode[0] == Transpose[0]) {myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE;} - else if(mode[0] == ConjugateTranspose[0]) {myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;} + cusparseOperation_t myCusparseOperation; + switch(toupper(mode[0])) + { + case 'N': + myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; + break; + case 'T': + myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; + break; + case 'H': + myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + break; + default: + { + std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; + throw std::invalid_argument("Invalid mode"); + } + } #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) @@ -239,8 +265,9 @@ namespace Impl { const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - if(controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + bool fallback = *mode == 'C' || ((*mode == 'T' || *mode == 'H') && 9000 <= CUDA_VERSION && CUDA_VERSION < 10000); \ + if((controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") || fallback) { \ + std::string label = "KokkosSparse::spmv[NATIVE," + Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_native(controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ @@ -252,6 +279,11 @@ namespace Impl { } \ } \ }; + +//BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate transpose modes. +//No version of cuSPARSE supports mode C (conjugate, non transpose). +//In those cases, fall back to KokkosKernels native spmv. 
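// Illustrative sketch (editorial, not part of this patch): how a caller would opt into the
// native SpMV path that the fallback above also selects automatically. This assumes the
// Controls object exposes setParameter(), mirroring the isParameter()/getParameter() calls
// used in the macro above.
//
//   KokkosKernels::Experimental::Controls controls;
//   controls.setParameter("algorithm", "native");   // skip cuSPARSE, use spmv_native
//   KokkosSparse::spmv(controls, "N", alpha, A, x, beta, y);
//
// Without the parameter, cuSPARSE handles mode 'N' (and 'T'/'H' on CUDA 10 or newer), while
// mode 'C', or 'T'/'H' under CUDA 9.x, is routed to the native implementation as described above.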
+ #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, Kokkos::CudaSpace, true) diff --git a/src/kokkoskernels_eti.cmake b/src/kokkoskernels_eti.cmake index 1179ec9c41..04a6f412c9 100644 --- a/src/kokkoskernels_eti.cmake +++ b/src/kokkoskernels_eti.cmake @@ -126,7 +126,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) CMAKE_PARSE_ARGUMENTS(ETI "" "HEADER_LIST;SOURCE_LIST" - "TYPE_LISTS" + "TYPE_LISTS;COMPONENTS" ${ARGN}) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) @@ -134,26 +134,38 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") - KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) - FOREACH(ETI ${${FUNCTION_NAME}_eti}) - SET(MACRO_STRING "(") - FOREACH(TYPE_NAME ${${ETI}}) - STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + # if this is tied to particular components + # see whether those components are enabled + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${ETI_COMPONENTS} + OUTPUT_VARIABLE ETI_COMP_IS_ENABLED + ) + + IF (ETI_COMP_IS_ENABLED) + MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") + KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) + FOREACH(ETI ${${FUNCTION_NAME}_eti}) + SET(MACRO_STRING "(") + FOREACH(TYPE_NAME ${${ETI}}) + STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + ENDFOREACH() + STRING(APPEND MACRO_STRING ")") + STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) + #Make a single header file for all instances + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + #Make a different source file for each instance + SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") + SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") + SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) + LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) ENDFOREACH() - STRING(APPEND MACRO_STRING ")") - STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) - #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - #Make a different source file for each instance - SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") - SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") - SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - ENDFOREACH() + ELSE() + MESSAGE(STATUS "Skipping ETI files for ${FUNCTION_NAME} because not all components are enabled") + ENDIF() SET(AVAIL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") 
SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") @@ -163,7 +175,6 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} diff --git a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp index 9cfd97afbb..61f3550275 100644 --- a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp +++ b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp @@ -415,17 +415,10 @@ class BlockCrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef BlockCrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index bba54c613c..d866a63601 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -104,6 +104,12 @@ inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif +#ifdef KOKKOS_ENABLE_HIP +template<> +inline int RowsPerThread(const int NNZPerRow) { + return 1; +} +#endif // A simple struct for storing a kernel launch configuration. // This is currently used by CrsMatrix to allow the user to have some control @@ -406,17 +412,10 @@ class CrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef CrsMatrix HostMirror; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#else //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. typedef Kokkos::StaticCrsGraph staticcrsgraph_type; -#endif //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; //! Const version of the type of column indices in the sparse matrix. 
diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index e4ded70d54..fd4a9b58d9 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -57,20 +57,16 @@ namespace KokkosSparse{ enum GSAlgorithm{GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER, GS_TWOSTAGE}; enum GSDirection{GS_FORWARD, GS_BACKWARD, GS_SYMMETRIC}; - enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_BALLOON, CLUSTER_CUTHILL_MCKEE, CLUSTER_DO_NOTHING, NUM_CLUSTERING_ALGORITHMS}; + enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_MIS2, CLUSTER_BALLOON, NUM_CLUSTERING_ALGORITHMS}; inline const char* getClusterAlgoName(ClusteringAlgorithm ca) { switch(ca) { - case CLUSTER_DEFAULT: - return "Default"; case CLUSTER_BALLOON: return "Balloon"; - case CLUSTER_CUTHILL_MCKEE: - return "Cuthill-McKee"; - case CLUSTER_DO_NOTHING: - return "No-op"; + case CLUSTER_MIS2: + return "MIS(2)"; default:; } return "INVALID CLUSTERING ALGORITHM"; @@ -192,12 +188,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; @@ -282,53 +274,11 @@ namespace KokkosSparse{ void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) this->algorithm_type = GS_TEAM; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: GS_TEAM" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif } ~PointGaussSeidelHandle() = default; @@ -449,13 +399,8 @@ namespace KokkosSparse{ return; } else { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - KokkosKernels::Impl::get_suggested_vector_team_size( - max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); -#else KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); -#endif this->suggested_team_size = suggested_vector_size_; this->suggested_vector_size = suggested_vector_size_; @@ -572,33 +517,7 @@ namespace KokkosSparse{ bool use_teams() const { - bool return_value = false; -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value) { - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - return_value = true; - } -#endif -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - return_value = false; - } -#endif - return return_value; + return KokkosKernels::Impl::kk_is_gpu_exec_space(); } ~ClusterGaussSeidelHandle() = default; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 820afbbaa3..1efae2c1a7 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -47,7 +47,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_Sorting.hpp" -#include +#include "Kokkos_ArithTraits.hpp" namespace KokkosSparse { namespace Experimental { @@ -86,10 +86,10 @@ struct SortedCountEntries { Bcolinds(Bcolinds_), Crowcounts(Crowcounts_) {} - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits::max(); - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow 
size_type numEntries = 0; size_type ai = 0; @@ -202,67 +202,6 @@ struct UnmergedSumFunctor { CcolindsT ABperm; }; -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - CcolindsAux("C colind aux", Ccolinds_.extent(0)), - ABperm(ABperm_), - ABpermAux("AB perm aux", ABperm_.extent(0)) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - ordinal_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - using lno_t = typename CcolindsT::non_const_value_type; - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*)Ccolinds.data() + rowStart, - (unsigned_lno_t*)CcolindsAux.data() + rowStart, - ABperm.data() + rowStart, ABpermAux.data() + rowStart, rowNum); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT CcolindsAux; - CcolindsT ABperm; - CcolindsT ABpermAux; -}; - -#ifdef KOKKOS_ENABLE_CUDA -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, CcolindsT& Ccolinds_, - CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), Ccolinds(Ccolinds_), ABperm(ABperm_) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - size_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2< - size_type, typename CcolindsT::non_const_value_type, - typename CcolindsT::non_const_value_type, TeamMember>( - Ccolinds.data() + rowStart, ABperm.data() + rowStart, rowNum, t); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; -#endif - template struct MergeEntriesFunctor { @@ -478,7 +417,6 @@ template struct SortedNumericSumFunctor { using CscalarT = typename CvaluesT::non_const_value_type; - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits::max(); SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, const BrowptrsT& Browptrs_, @@ -502,6 +440,8 @@ struct SortedNumericSumFunctor { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + // count the union of nonzeros in Arow and Brow size_type ai = 0; size_type bi = 0; diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index b34d349457..f517682d5e 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -504,8 +504,6 @@ class SPGEMMHandle{ return this->cuSPARSEHandle; } #endif - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -543,6 +541,15 @@ class SPGEMMHandle{ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + this->algorithm_type = SPGEMM_KK; +#ifdef VERBOSE + std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" << std::endl; +#endif + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ this->algorithm_type = SPGEMM_SERIAL; @@ -604,67 +611,20 @@ class SPGEMMHandle{ //suggested_vector_size_=this->suggested_vector_size = 1; //return; if (this->suggested_team_size && this->suggested_vector_size) { + //already set in the handle suggested_vector_size_ = this->suggested_vector_size; suggested_team_size_ = this->suggested_team_size; return; } -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - this->suggested_vector_size = nnz / double (nr) + 0.5; - - if (this->suggested_vector_size <= 3){ - this->suggested_vector_size = 2; - } - else if (this->suggested_vector_size <= 6){ - this->suggested_vector_size = 4; - } - else if (this->suggested_vector_size <= 12){ - this->suggested_vector_size = 8; - } - else if (this->suggested_vector_size <= 24){ - this->suggested_vector_size = 16; - } - else { - this->suggested_vector_size = 32; - } - - suggested_vector_size_ = this->suggested_vector_size; - this->suggested_team_size= suggested_team_size_ = max_allowed_team_size / this->suggested_vector_size; - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - + //otherwise, recompute team_size/vector_size based on heuristic and save them in the handle + suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size(nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; + else + suggested_team_size = max_allowed_team_size; + this->suggested_vector_size = suggested_vector_size_; + this->suggested_team_size = suggested_vector_size_; } void set_compression_steps(bool isCompressionSingleStep){ diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index e18bc4690f..4c26f5cd6e 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -151,7 +151,7 @@ spmv (KokkosKernels::Experimental::Controls controls, KokkosBlas::scal(y_i, beta, y_i); return; } - return Impl::SPMV< + Impl::SPMV< typename AMatrix_Internal::value_type, typename 
AMatrix_Internal::ordinal_type, typename AMatrix_Internal::device_type, diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 6f230780cc..e73837e3a4 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -57,8 +57,14 @@ #include "KokkosBlas3_trmm.hpp" #include "KokkosBlas_trtri.hpp" -#include "KokkosSparse_sptrsv.hpp" +#include "KokkosBatched_Trtri_Decl.hpp" +#include "KokkosBatched_Trtri_Serial_Impl.hpp" + +#include "KokkosBatched_Trmm_Decl.hpp" +#include "KokkosBatched_Trmm_Serial_Impl.hpp" + +#include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { namespace Experimental { @@ -900,6 +906,7 @@ void sptrsv_supernodal_symbolic( host_graph_t graphU_host, KernelHandle *kernelHandleU) { #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + int nrows = graphL_host.numRows (); double time_seconds = 0.0; Kokkos::Timer timer; Kokkos::Timer tic; @@ -946,7 +953,6 @@ void sptrsv_supernodal_symbolic( int nsuper_merged = nsuper; #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE tic.reset (); - int nrows = graphL_host.numRows (); check_supernode_sizes("Original L-structure", nrows, nsuper, supercols_merged, graphL_host); check_supernode_sizes("Original U-structure", nrows, nsuper, supercols_merged, graphU_host); #endif @@ -1082,6 +1088,89 @@ void sptrsv_supernodal_symbolic( /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ /* Auxiliary functions for numeric computation */ +/* ========================================================================================= */ + struct Tag_SupTrtriFunctor{}; + struct Tag_SupTrtriTrmmFunctor{}; + + template + struct TriSupernodalTrtriFunctor { + + integer_view_host_t supernode_ids; + const input_size_type *nb; + row_map_type hr; + index_type hc; + values_type hv; + + KOKKOS_INLINE_FUNCTION + TriSupernodalTrtriFunctor(integer_view_host_t supernode_ids_, const input_size_type *nb_, + row_map_type& hr_, index_type& hc_, values_type& hv_) : + supernode_ids(supernode_ids_), + nb(nb_), + hr(hr_), + hc(hc_), + hv(hv_) + {} + + // functor: just invert diagonal + KOKKOS_INLINE_FUNCTION + void operator() (const Tag_SupTrtriFunctor&, const int i) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + + int s = supernode_ids(i); + int j1 = nb[s]; + int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + // invert diagonal + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); + } + + // functor: invert diagonal + apply inverse to off-diagonal + KOKKOS_INLINE_FUNCTION + void operator() (const Tag_SupTrtriTrmmFunctor&, const int i) const { + using execution_space = typename values_type::execution_space; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; + + using range_type = Kokkos::pair; + using TrtriAlgoType = KokkosBatched::Algo::Trtri::Unblocked; + using Side = KokkosBatched::Side; + using Trans = KokkosBatched::Trans; + + int s = supernode_ids(i); + int j1 = nb[s]; + 
int nsrow = hr(j1+1) - hr(j1); + int nscol = nb[s +1] - nb[s]; + + // invert diagonal + auto nnzD = hr (j1); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + KokkosBatched::SerialTrtri::invoke(Ljj); + + // apply invse to off-diagonal + //if (nsrow > nscol && invert_offdiag) + { + const scalar_t one (1.0); + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + KokkosBatched::SerialTrmm:: + invoke(one, Ljj, Lij); + } + } + }; /* ========================================================================================= */ template @@ -1090,10 +1179,11 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, row_map_type& hr, index_type& hc, values_type& hv) { using execution_space = typename values_type::execution_space; - using memory_space = typename execution_space::memory_space; - using values_view_t = typename values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; + using memory_space = typename execution_space::memory_space; + using values_view_t = typename values_type::non_const_type; + using scalar_t = typename values_view_t::value_type; using range_type = Kokkos::pair; + using integer_view_host_t = Kokkos::View; const scalar_t one (1.0); @@ -1109,46 +1199,136 @@ invert_supernodal_columns(KernelHandle kernelHandle, bool unit_diag, int nsuper, // quick return if (!invert_diag) return; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE Kokkos::Timer timer; double time1 = 0.0; double time2 = 0.0; + double time3 = 0.0; + #endif // ---------------------------------------------------------- // now let's invert some blocks + // > first go through all the supernode columns + // > use KokkosBlas on large blocks, and keep track of small blocks + // > to call batchedBlas on them + int num_batchs = 0; + int size_unblocked = handle->get_supernode_size_unblocked(); + integer_view_host_t supernode_ids ("supernode_batch", nsuper); for (int s2 = 0; s2 < nsuper; s2++) { - int j1 = nb[s2]; - int nsrow = hr(j1+1) - hr(j1); int nscol = nb[s2+1] - nb[s2]; - auto nnzD = hr (j1); - char uplo_char = (lower ? 'L' : 'U'); - char diag_char = (unit_diag ? 'U' : 'N'); + if (nscol >= size_unblocked) { + int j1 = nb[s2]; + int nsrow = hr(j1+1) - hr(j1); - Kokkos::View - viewL (&hv(nnzD), nsrow, nscol); - auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + auto nnzD = hr (j1); + char uplo_char = (lower ? 'L' : 'U'); + char diag_char = (unit_diag ? 
'U' : 'N'); - timer.reset (); - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); - time1 += timer.seconds (); - - if (nsrow > nscol && invert_offdiag) { - char side_char = 'R'; - char tran_char = 'N'; - auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + Kokkos::View + viewL (&hv(nnzD), nsrow, nscol); + auto Ljj = Kokkos::subview (viewL, range_type (0, nscol), Kokkos::ALL ()); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE timer.reset (); - KokkosBlas::trmm (&side_char, &uplo_char, - &tran_char, &diag_char, - one, Ljj, Lij); - time2 += timer.seconds (); + #endif + KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time1 += timer.seconds (); + #endif + + if (nsrow > nscol && invert_offdiag) { + char side_char = 'R'; + char tran_char = 'N'; + auto Lij = Kokkos::subview (viewL, range_type (nscol, nsrow), Kokkos::ALL ()); + + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + KokkosBlas::trmm (&side_char, &uplo_char, + &tran_char, &diag_char, + one, Ljj, Lij); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time2 += timer.seconds (); + #endif + } + } + else { + supernode_ids (num_batchs) = s2; + num_batchs ++; } } + // now call batchedBLAS + if (num_batchs > 0) { + using Uplo = KokkosBatched::Uplo; + using Diag = KokkosBatched::Diag; + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif + if (lower) { + if (unit_diag) { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } else { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } + } else { + if (unit_diag) { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } else { + if (invert_offdiag) { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } else { + using range_policy = Kokkos::RangePolicy; + TriSupernodalTrtriFunctor + sptrsv_tritri_functor (supernode_ids, nb, hr, hc, hv); + Kokkos::parallel_for("TriSupernodalTrtriFunctor", range_policy(0, num_batchs), sptrsv_tritri_functor); + } + } + } + #ifdef 
KOKKOS_SPTRSV_SUPERNODE_PROFILE + time3 = timer.seconds (); + #endif + } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE std::cout << " invert_supernodes" << std::endl; + std::cout << " + num supernodes = " << nsuper << " num batchs = " << num_batchs << std::endl; std::cout << " > Time for inversion::trtri : " << time1 << std::endl; std::cout << " > Time for inversion::trmm : " << time2 << std::endl; + std::cout << " > Time for batchs : " << time3 << std::endl; #endif } diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index f2cdee87bb..bb1f96c4e3 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -56,6 +56,8 @@ #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosSparse_partitioning_impl.hpp" +#include "KokkosGraph_MIS2.hpp" +#include "KokkosGraph_ExplicitCoarsening.hpp" namespace KokkosSparse{ namespace Impl{ @@ -80,6 +82,10 @@ namespace KokkosSparse{ typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::nnz_scalar_t nnz_scalar_t; + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's size_type does not match input rowmap's element type."); + static_assert(std::is_same::value, + "ClusterGaussSeidel: Handle's nnz_lno_t does not match input entries's element type."); typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; typedef typename in_lno_row_view_t::non_const_type non_const_lno_row_view_t; @@ -306,7 +312,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _inverse_diagonal(row); for(int i = 0; i < N; i++) @@ -494,208 +500,6 @@ namespace KokkosSparse{ nnz_lno_t clusterSize; }; - template - struct ClusterSizeFunctor - { - ClusterSizeFunctor(nnz_view_t& counts_, nnz_view_t& vertClusters_) - : counts(counts_), vertClusters(vertClusters_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - Kokkos::atomic_increment(&counts(vertClusters(i))); - } - nnz_view_t counts; - nnz_view_t vertClusters; - }; - - template - struct FillClusterVertsFunctor - { - FillClusterVertsFunctor(nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, nnz_view_t& insertCounts_) - : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const - { - nnz_lno_t cluster = vertClusters(i); - nnz_lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); - clusterVerts(offset) = i; - } - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - nnz_view_t insertCounts; - }; - - template - struct BuildCrossClusterMaskFunctor - { - BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_) - : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) - {} - - //Used a fixed-size hash set in shared memory - KOKKOS_INLINE_FUNCTION constexpr int tableSize() const - { - //Should always be a power-of-two, so that X % 
tableSize() reduces to a bitwise and. - return 512; - } - - //Given a cluster index, get the hash table index. - //This is the 32-bit xorshift RNG, but it works as a hash function. - KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(nnz_lno_t cluster) const - { - unsigned x = cluster; - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - KOKKOS_INLINE_FUNCTION bool lookup(nnz_lno_t cluster, int* table) const - { - unsigned h = xorshiftHash(cluster); - for(unsigned i = h; i < h + 2; i++) - { - if(table[i % tableSize()] == cluster) - return true; - } - return false; - } - - //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) - //by inserting nei into the table. - KOKKOS_INLINE_FUNCTION bool insert(nnz_lno_t cluster, nnz_lno_t nei, int* table) const - { - unsigned h = xorshiftHash(nei); - for(unsigned i = h; i < h + 2; i++) - { - if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) - return true; - } - return false; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const - { - nnz_lno_t cluster = t.league_rank(); - nnz_lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); - //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. - //If it fills up (very unlikely) then just count every remaining edge going to another cluster - //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. - //each thread handles a cluster - int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); - //mark every entry as cluster (self-loop) to represent free/empty - Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), - [&](const nnz_lno_t i) - { - table[i] = cluster; - }); - t.team_barrier(); - //now, for each row belonging to the cluster, iterate through the neighbors - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), - [&] (const nnz_lno_t i) - { - nnz_lno_t row = clusterVerts(clusterOffsets(cluster) + i); - nnz_lno_t rowDeg = rowmap(row + 1) - rowmap(row); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), - [&] (const nnz_lno_t j) - { - nnz_lno_t nei = colinds(rowmap(row) + j); - //Remote neighbors are not included - if(nei >= numRows) - return; - nnz_lno_t neiCluster = vertClusters(nei); - if(neiCluster != cluster) - { - //Have a neighbor. Try to find it in the table. - if(!lookup(neiCluster, table)) - { - //Not in the table. Try to insert it. 
- insert(cluster, neiCluster, table); - //Whether or not insertion succeeded, - //this is a cross-cluster edge possibly not seen before - mask.set(rowmap(row) + j); - } - } - }); - }); - } - - size_t team_shmem_size(int teamSize) const - { - return tableSize() * sizeof(int); - } - - nnz_lno_t numRows; - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - bitset_t mask; - }; - - template - struct FillClusterEntriesFunctor - { - FillClusterEntriesFunctor( - Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterRowmap_, nnz_view_t& clusterEntries_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& edgeMask_) - : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) - {} - //Run this scan over entries in clusterVerts (reordered point rows) - KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i, nnz_lno_t& lcount, const bool& finalPass) const - { - nnz_lno_t numRows = rowmap.extent(0) - 1; - nnz_lno_t row = clusterVerts(i); - size_type rowStart = rowmap(row); - size_type rowEnd = rowmap(row + 1); - nnz_lno_t cluster = vertClusters(row); - nnz_lno_t clusterStart = clusterOffsets(cluster); - //Count the number of entries in this row. - //This is how much lcount will be increased by, - //yielding the offset corresponding to - //these point entries in the cluster entries. - nnz_lno_t rowEntries = 0; - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - rowEntries++; - } - if(finalPass) - { - //if this is the last row in the cluster, update the upper bound in clusterRowmap - if(i == clusterStart) - { - clusterRowmap(cluster) = lcount; - } - nnz_lno_t clusterEdge = lcount; - //populate clusterEntries for these edges - for(size_type j = rowStart; j < rowEnd; j++) - { - if(edgeMask.test(j)) - { - clusterEntries(clusterEdge++) = vertClusters(colinds(j)); - } - } - } - //update the scan result at the end (exclusive) - lcount += rowEntries; - if(i == numRows - 1 && finalPass) - { - //on the very last row, set the last entry of the cluster rowmap - clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; - } - } - Rowmap rowmap; - Colinds colinds; - nnz_view_t clusterRowmap; - nnz_view_t clusterEntries; - nnz_view_t clusterOffsets; - nnz_view_t clusterVerts; - nnz_view_t vertClusters; - const_bitset_t edgeMask; - }; - //Assign cluster labels to vertices, given that the vertices are naturally //ordered so that contiguous groups of vertices form decent clusters. 
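For reference, a minimal standalone sketch of the hashing scheme used by the BuildCrossClusterMaskFunctor removed above: cluster ids are hashed with a 32-bit xorshift and probed in a small power-of-two table, so the modulo reduces to a bitwise AND. The function names below are illustrative only, not part of the library.

inline unsigned xorshift_hash(unsigned x) {
  // 32-bit xorshift, used here purely as a cheap hash function
  x ^= x << 13;
  x ^= x >> 17;
  x ^= x << 5;
  return x;
}

// Probe two consecutive slots of a power-of-two sized table.
// Because tableSize is a power of two, i % tableSize == (i & (tableSize - 1)).
inline bool table_contains(const int* table, unsigned tableSize, int key) {
  unsigned h = xorshift_hash(static_cast<unsigned>(key));
  for (unsigned i = h; i < h + 2; i++) {
    if (table[i & (tableSize - 1)] == key)
      return true;
  }
  return false;
}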
template @@ -740,9 +544,9 @@ namespace KokkosSparse{ using nnz_view_t = nnz_lno_persistent_work_view_t; using in_rowmap_t = const_lno_row_view_t; using in_colinds_t = const_lno_nnz_view_t; - using rowmap_t = Kokkos::View; + using rowmap_t = Kokkos::View; using colinds_t = Kokkos::View; - using raw_rowmap_t = Kokkos::View>; + using raw_rowmap_t = Kokkos::View>; using raw_colinds_t = Kokkos::View>; auto gsHandle = get_gs_handle(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE @@ -767,8 +571,6 @@ namespace KokkosSparse{ //Now that a symmetric graph is available, build the cluster graph (also symmetric) nnz_lno_t clusterSize = gsHandle->get_cluster_size(); nnz_lno_t numClusters = (num_rows + clusterSize - 1) / clusterSize; - nnz_view_t clusterOffsets("Cluster offsets", numClusters + 1); - nnz_view_t clusterVerts("Cluster -> vertices", num_rows); raw_rowmap_t raw_sym_xadj; raw_colinds_t raw_sym_adj; if(this->is_symmetric) @@ -784,15 +586,13 @@ namespace KokkosSparse{ nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); if(clusterAlgo == CLUSTER_DEFAULT) - clusterAlgo = CLUSTER_BALLOON; + clusterAlgo = CLUSTER_MIS2; switch(clusterAlgo) { - case CLUSTER_CUTHILL_MCKEE: + case CLUSTER_MIS2: { - RCM rcm(num_rows, raw_sym_xadj, raw_sym_adj); - nnz_view_t cmOrder = rcm.cuthill_mckee(); - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), ReorderedClusteringFunctor(vertClusters, cmOrder, clusterSize)); + vertClusters = KokkosGraph::Experimental::graph_mis2_coarsen + (raw_sym_xadj, raw_sym_adj, numClusters, KokkosGraph::MIS2_FAST); break; } case CLUSTER_BALLOON: @@ -801,12 +601,6 @@ namespace KokkosSparse{ vertClusters = balloon.run(clusterSize); break; } - case CLUSTER_DO_NOTHING: - { - vertClusters = nnz_view_t("Cluster labels", num_rows); - Kokkos::parallel_for(my_exec_space(0, num_rows), NopVertClusteringFunctor(vertClusters, clusterSize)); - break; - } case CLUSTER_DEFAULT: { throw std::logic_error("Logic to choose default clustering algorithm is incorrect"); @@ -818,46 +612,12 @@ namespace KokkosSparse{ std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); #endif - //Construct the cluster offset and vertex array. These allow fast iteration over all vertices in a given cluster. 
- Kokkos::parallel_for(my_exec_space(0, num_rows), ClusterSizeFunctor(clusterOffsets, vertClusters)); - KokkosKernels::Impl::exclusive_parallel_prefix_sum(numClusters + 1, clusterOffsets); - { - nnz_view_t tempInsertCounts("Temporary cluster insert counts", numClusters); - Kokkos::parallel_for(my_exec_space(0, num_rows), FillClusterVertsFunctor(clusterOffsets, clusterVerts, vertClusters, tempInsertCounts)); - } -#if KOKKOSSPARSE_IMPL_PRINTDEBUG - { - auto clusterOffsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterOffsets); - auto clusterVertsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterVerts); - puts("Clusters (cluster #, and vertex #s):"); - for(nnz_lno_t i = 0; i < numClusters; i++) - { - printf("%d: ", (int) i); - for(nnz_lno_t j = clusterOffsetsHost(i); j < clusterOffsetsHost(i + 1); j++) - { - printf("%d ", (int) clusterVerts(j)); - } - putchar('\n'); - } - printf("\n\n\n"); - } -#endif - //Determine the set of edges (in the point graph) that cross between two distinct clusters - int vectorSize = this->handle->get_suggested_vector_size(num_rows, raw_sym_adj.extent(0)); - bitset_t crossClusterEdgeMask(raw_sym_adj.extent(0)); - size_type numClusterEdges; - { - BuildCrossClusterMaskFunctor - buildEdgeMask(raw_sym_xadj, raw_sym_adj, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask); - int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. - int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); - Kokkos::parallel_for(team_policy_t(numClusters, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); - numClusterEdges = crossClusterEdgeMask.count(); - } - nnz_view_t clusterRowmap = nnz_view_t("Cluster graph rowmap", numClusters + 1); - nnz_view_t clusterEntries = nnz_view_t("Cluster graph colinds", numClusterEdges); - Kokkos::parallel_scan(my_exec_space(0, num_rows), FillClusterEntriesFunctor - (raw_sym_xadj, raw_sym_adj, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask)); + rowmap_t clusterRowmap; + colinds_t clusterEntries; + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + KokkosGraph::Experimental::graph_explicit_coarsen_with_inverse_map, raw_rowmap_t, raw_colinds_t, nnz_view_t, rowmap_t, colinds_t, nnz_view_t> + (raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, false); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; timer.reset(); @@ -892,7 +652,7 @@ namespace KokkosSparse{ Kokkos::deep_copy(colors, h_colors); #else //Create a handle that uses nnz_lno_t as the size_type, since the cluster graph should never be larger than 2^31 entries. 
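For reference, a serial host-side sketch of what the explicit coarsening call above computes: given a vertex-to-cluster labeling, build the graph whose vertices are clusters and whose edges connect clusters joined by at least one cross-cluster fine edge. This is illustrative only and not the KokkosGraph implementation; whether self-loops are kept is controlled by the final boolean argument of the real routine.

#include <set>
#include <vector>

void coarsen(int numRows, int numClusters,
             const std::vector<int>& rowmap, const std::vector<int>& colinds,
             const std::vector<int>& vertClusters,
             std::vector<int>& clusterRowmap, std::vector<int>& clusterEntries) {
  // Collect, for each cluster, the set of distinct neighboring clusters.
  std::vector<std::set<int>> neigh(numClusters);
  for (int v = 0; v < numRows; v++) {
    for (int j = rowmap[v]; j < rowmap[v + 1]; j++) {
      int c = vertClusters[v];
      int nc = vertClusters[colinds[j]];
      if (c != nc)
        neigh[c].insert(nc);  // keep only cross-cluster edges
    }
  }
  // Flatten the neighbor sets into CRS form (clusterRowmap, clusterEntries).
  clusterRowmap.assign(numClusters + 1, 0);
  clusterEntries.clear();
  for (int c = 0; c < numClusters; c++) {
    for (int nc : neigh[c])
      clusterEntries.push_back(nc);
    clusterRowmap[c + 1] = (int)clusterEntries.size();
  }
}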
- KokkosKernels::Experimental::KokkosKernelsHandle kh; + HandleType kh; kh.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); KokkosGraph::Experimental::graph_color_symbolic(&kh, numClusters, numClusters, clusterRowmap, clusterEntries); //retrieve colors diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 5c50815f34..d5c111862f 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -276,7 +276,7 @@ namespace KokkosSparse{ for(int j = 0; j < N; j++) lsum.data[j] += val * _Xvector(colIndex, colStart + j); }, sum); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(row); for(int i = 0; i < N; i++) @@ -420,7 +420,7 @@ namespace KokkosSparse{ product += product2; //update the new vector entries. - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); _Xvector(block_row_index, vec) += omega * (_Yvector(block_row_index, vec) - product) * invDiagonalVal; @@ -484,7 +484,7 @@ namespace KokkosSparse{ Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&] (const nnz_lno_t& ii) { #if KOKKOSSPARSE_IMPL_PRINTDEBUG - Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { for(nnz_lno_t i = 0; i < block_size; diagonal_positions[i++] = -1); }); #endif @@ -542,7 +542,7 @@ namespace KokkosSparse{ valueToUpdate += all_shared_memory[colind] * _adj_vals(current_row_begin + colind); }, product); - Kokkos::single(Kokkos::PerThread(teamMember),[=] () + Kokkos::single(Kokkos::PerThread(teamMember),[&] () { nnz_lno_t block_row_index = ii * block_size + i; nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); @@ -554,9 +554,8 @@ namespace KokkosSparse{ } }); -#if !defined(__CUDA_ARCH__) #if KOKKOSSPARSE_IMPL_PRINTDEBUG - if (/*i == 0 && ii == 1*/ ii == 0 || (block_size == 1 && ii < 2) ){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space() && (ii == 0 || (block_size == 1 && ii < 2))){ std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; for (nnz_lno_t z = 0; z < block_row_size; ++z){ @@ -573,7 +572,6 @@ namespace KokkosSparse{ std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i, vec) << std::endl << std::endl<< std::endl; } -#endif #endif //row_begin += row_size * block_size; } @@ -737,31 +735,16 @@ namespace KokkosSparse{ timer.reset(); #endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - for (nnz_lno_t i = 0; i < numColors; ++i){ - nnz_lno_t color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - - if (color_index_begin + 1 >= color_index_end ) continue; - auto colorsubset = - subview(color_adj, Kokkos::pair (color_index_begin, color_index_end)); - MyExecSpace().fence(); - Kokkos::sort (colorsubset); - //TODO: MD 08/2017: If I remove the below fence, code fails on cuda. - //I do not see any reason yet it to fail. - MyExecSpace().fence(); - } - } -#endif - - MyExecSpace().fence(); + // TODO BMK: Why are the vertices in each color set only being sorted on GPU? + // Wouldn't it have a locality benefit on CPU too? 
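For reference, a short sketch of the effect of the per-color sort discussed in the TODO above and performed by the sort_crs_graph call below: the vertices of each color set, stored in CRS form, are put into ascending order. This host-side version is illustrative only, not the KokkosKernels implementation.

#include <algorithm>
#include <cstddef>
#include <vector>

// Sort the entries of each "row" (here, the vertices of each color set)
// into ascending order, given CRS-style offsets xadj.
void sort_each_row(const std::vector<int>& xadj, std::vector<int>& adj) {
  for (std::size_t row = 0; row + 1 < xadj.size(); ++row)
    std::sort(adj.begin() + xadj[row], adj.begin() + xadj[row + 1]);
}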
+ if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + KokkosKernels::Impl::sort_crs_graph(color_xadj, color_adj); + MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - std::cout << "SORT_TIME:" << timer.seconds() << std::endl; - timer.reset(); - //std::cout << "sort" << std::endl; + std::cout << "SORT_TIME:" << timer.seconds() << std::endl; + timer.reset(); #endif + } row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1); nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows ); @@ -843,8 +826,7 @@ namespace KokkosSparse{ nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); - if (ex_sp != KokkosKernels::Impl::Exec_CUDA){ + if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; while(l1mem < level_1_mem){ @@ -882,12 +864,11 @@ namespace KokkosSparse{ num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size)); //std::cout << "num_big_rows:" << num_big_rows << std::endl; -#if defined( KOKKOS_ENABLE_CUDA ) - if (ex_sp == KokkosKernels::Impl::Exec_CUDA) { + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //check if we have enough memory for this. lower the concurrency if we do not have enugh memory. size_t free_byte ; size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte){ num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem; @@ -900,7 +881,6 @@ namespace KokkosSparse{ num_big_rows = min_chunk_size; } } -#endif } } @@ -1165,7 +1145,7 @@ namespace KokkosSparse{ // change fill_matrix_numeric so that they store the internal matrix as above. // the rest will wok fine. 
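For reference, a minimal sketch of the kind of compile-time check that kk_is_gpu_exec_space<ExecSpace>() performs in these hunks, which lets call sites drop backend-specific preprocessor guards. The trait name and specializations below are assumptions for illustration, not the KokkosKernels implementation.

#include <Kokkos_Core.hpp>

// Default: not a GPU execution space.
template <typename ExecSpace>
constexpr bool is_gpu_exec_space() { return false; }

#ifdef KOKKOS_ENABLE_CUDA
template <>
constexpr bool is_gpu_exec_space<Kokkos::Cuda>() { return true; }
#endif

#ifdef KOKKOS_ENABLE_HIP
template <>
constexpr bool is_gpu_exec_space<Kokkos::Experimental::HIP>() { return true; }
#endif

// At a call site the branch is a compile-time constant, so no backend #ifdefs are needed:
//   if (is_gpu_exec_space<MyExecSpace>()) { /* GPU path */ } else { /* CPU path */ }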
- if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), fill_matrix_numeric( @@ -1209,7 +1189,7 @@ namespace KokkosSparse{ block_size, block_matrix_size); - if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA || block_size > 1){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1){ Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd ); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index ddfcb70f92..af10787c46 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -74,555 +74,6 @@ struct IotaFunctor View v; }; -template -struct RCM -{ - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef typename HandleType::size_type size_type; - typedef typename HandleType::nnz_lno_t nnz_lno_t; - - typedef typename lno_row_view_t::const_type const_lno_row_view_t; - typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; - typedef typename non_const_lno_row_view_t::value_type offset_t; - - typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; - typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; - typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - - typedef nnz_lno_persistent_work_view_t nnz_view_t; - typedef Kokkos::View> single_view_t; - typedef Kokkos::View> single_view_host_t; - - typedef Kokkos::RangePolicy my_exec_space; - - typedef Kokkos::RangePolicy range_policy_t ; - typedef Kokkos::TeamPolicy team_policy_t ; - typedef typename team_policy_t::member_type team_member_t ; - - typedef nnz_lno_t LO; - - RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) - : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) - {} - - nnz_lno_t numRows; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - - template - struct MaxDegreeFunctor - { - typedef typename std::remove_cv::type size_type; - MaxDegreeFunctor(Rowmap& rowmap_) : r(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, size_type& lmax) const - { - size_type ideg = r(i + 1) - r(i); - if(ideg > lmax) - lmax = ideg; - } - Rowmap r; - }; - - //simple parallel reduction to find max degree in graph - size_type find_max_degree() - { - size_type maxDeg = 0; - Kokkos::parallel_reduce(range_policy_t(0, numRows), MaxDegreeFunctor(rowmap), Kokkos::Max(maxDeg)); - //max degree should be computed as 
an offset_t, - //but must fit in a nnz_lno_t - return maxDeg; - } - - //radix sort keys according to their corresponding values ascending. - //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting - template - KOKKOS_INLINE_FUNCTION static void - radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) - { - if(n <= 1) - return; - //sort 4 bits at a time - KeyType mask = 0xF; - bool inAux = false; - //maskPos counts the low bit index of mask (0, 4, 8, ...) - IndexType maskPos = 0; - IndexType sortBits = 0; - KeyType minKey = Kokkos::ArithTraits::max(); - KeyType maxKey = 0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lminkey) - { - if(keys[i] < lminkey) - lminkey = keys[i]; - }, Kokkos::Min(minKey)); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lmaxkey) - { - if(keys[i] > lmaxkey) - lmaxkey = keys[i]; - }, Kokkos::Max(maxKey)); - //apply a bias so that key range always starts at 0 - //also invert key values here for a descending sort - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - keys[i] -= minKey; - }); - KeyType upperBound = maxKey - minKey; - while(upperBound) - { - upperBound >>= 1; - sortBits++; - } - for(IndexType s = 0; s < (sortBits + 3) / 4; s++) - { - //Count the number of elements in each bucket - IndexType count[16] = {0}; - IndexType offset[17]; - if(!inAux) - { - for(IndexType i = 0; i < n; i++) - { - count[(keys[i] & mask) >> maskPos]++; - } - } - else - { - for(IndexType i = 0; i < n; i++) - { - count[(keysAux[i] & mask) >> maskPos]++; - } - } - offset[0] = 0; - //get offset as the prefix sum for count - for(IndexType i = 0; i < 16; i++) - { - offset[i + 1] = offset[i] + count[i]; - } - //now for each element in [lo, hi), move it to its offset in the other buffer - //this branch should be ok because whichBuf is the same on all threads - if(!inAux) - { - //copy from *Over to *Aux - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keys[i] & mask) >> maskPos; - keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; - valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; - count[bucket]--; - } - } - else - { - //copy from *Aux to *Over - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keysAux[i] & mask) >> maskPos; - keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; - values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; - count[bucket]--; - } - } - inAux = !inAux; - mask = mask << 4; - maskPos += 4; - } - //move keys/values back from aux if they are currently in aux, - //and remove bias - if(inAux) - { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - //TODO: when everything works, is safe to remove next line - //since keys (BFS visit scores) will never be needed again - keys[i] = keysAux[i]; - values[i] = valuesAux[i]; - }); - } - } - - //Functor that does breadth-first search on a sparse graph. 
- struct BfsFunctor - { - typedef Kokkos::View> WorkView; - - BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) - : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - const nnz_lno_t QUEUED = NOT_VISITED - 1; - int nthreads = mem.team_size(); - nnz_lno_t tid = mem.team_rank(); - auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); - //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively - //active, next and visitCounter are thread-local, but always kept consistent across threads - int active = 0; - int next = 1; - nnz_lno_t visitCounter = 0; - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - workQueue(active, 0) = start; - visit(start) = QUEUED; - }); - nnz_lno_t activeQSize = 1; - nnz_lno_t nextQSize = 0; - //KK create_reverse_map() expects incoming values to start at 1 - nnz_lno_t level = 1; - //do this until all nodes have been visited and added to a level - while(visitCounter < numRows) - { - mem.team_barrier(); - //each thread works on a contiguous block of nodes in queue (for locality) - //compute in size_t to avoid possible 32-bit overflow - nnz_lno_t workStart = tid * activeQSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; - //the maximum work batch size (among all threads) - //the following loop contains barriers so all threads must iterate same # of times - nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; - for(nnz_lno_t loop = 0; loop < maxBatch; loop++) - { - //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) - bool busy = loop < workEnd - workStart; - nnz_lno_t neiCount = 0; - nnz_lno_t process = LNO_MAX; - if(busy) - { - process = workQueue(active, workStart + loop); - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - //build a list of all non-visited neighbors - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t col = colinds(j); - //use atomic here to guarantee neighbors are added to neighborList exactly once - if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) - { - //this thread is the first to see that col needs to be queued - neighborList(neiCount) = col; - neiCount++; - } - } - } - threadNeighborCounts(tid) = neiCount; - mem.team_barrier(); - size_type queueUpdateOffset = 0; - for(nnz_lno_t i = 0; i < tid; i++) - { - queueUpdateOffset += threadNeighborCounts(i); - } - //write out all updates to next queue in parallel - if(busy) - { - nnz_lno_t nextQueueIter = 0; - for(nnz_lno_t i = 0; i < neiCount; i++) - { - nnz_lno_t toQueue = neighborList(i); - visit(toQueue) = QUEUED; - workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; - nextQueueIter++; - } - //assign level to to process - visit(process) = level; - } - nnz_lno_t totalAdded = 0; - for(nnz_lno_t i = 0; i < nthreads; i++) - { - totalAdded += threadNeighborCounts(i); - } - nextQSize += totalAdded; - mem.team_barrier(); - } - 
//swap queue buffers - active = next; - next = 1 - next; - //all threads have a consistent value of qSize here. - //update visitCounter in preparation for next frontier - visitCounter += activeQSize; - activeQSize = nextQSize; - nextQSize = 0; - if(visitCounter < numRows && activeQSize == 0) - { - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - //Some nodes are unreachable from start (graph not connected) - //Find an unvisited node to resume BFS - for(nnz_lno_t search = numRows - 1; search >= 0; search--) - { - if(visit(search) == NOT_VISITED) - { - workQueue(active, 0) = search; - visit(search) = QUEUED; - break; - } - } - }); - activeQSize = 1; - } - level++; - } - Kokkos::single(Kokkos::PerTeam(mem), - [=] - { - numLevels() = level - 1; - }); - } - - WorkView workQueue; - WorkView scratch; - nnz_view_t visit; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - single_view_t numLevels; - nnz_view_t threadNeighborCounts; - nnz_lno_t start; - nnz_lno_t numRows; - }; - - //Parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins. - //Returns the total number of levels, and sets xadj, adj and maxDeg. - nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) - { - //need to know maximum degree to allocate scratch space for threads - maxDeg = find_max_degree(); - //view for storing the visit timestamps - nnz_view_t visit("BFS visited nodes", numRows); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - KokkosBlas::fill(visit, NOT_VISITED); - //the visit queue - //one of q1,q2 is active at a time and holds the nodes to process in next BFS level - //elements which are LNO_MAX are just placeholders (nothing to process) - Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); - nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); - single_view_t numLevels("# of BFS levels"); - single_view_host_t numLevelsHost("# of BFS levels"); - Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); - Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); - Kokkos::deep_copy(numLevelsHost, numLevels); - //now that level structure has been computed, construct xadj/adj - KokkosKernels::Impl::create_reverse_map - (numRows, numLevelsHost(), visit, xadj, adj); - return numLevelsHost(); - } - - struct CuthillMcKeeFunctor - { - typedef Kokkos::View> ScoreView; - - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) - : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - { - numRows = rowmap.extent(0) - 1; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - int tid = mem.team_rank(); - int nthreads = mem.team_size(); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - nnz_lno_t visitCounter = 0; - for(nnz_lno_t level = 0; level < numLevels; level++) - { - //iterate over vertices in this level and compute - //min predecessors (minimum-labeled vertices from 
previous level) - nnz_lno_t levelOffset = xadj(level); - nnz_lno_t levelSize = xadj(level + 1) - levelOffset; - //compute as offset_t to avoid overflow, but the upper bound on - //the scores is approx. numRows * maxDegree, which should be representable - nnz_lno_t workStart = tid * levelSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - nnz_lno_t minNeighbor = LNO_MAX; - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t neighbor = colinds(j); - if(neighbor < numRows) - { - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; - } - } - scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); - } - mem.team_barrier(); - Kokkos::single(Kokkos::PerTeam(mem), - [=]() - { - radixSortKeysAndValues - (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); - }); - mem.team_barrier(); - //label all vertices (which are now in label order within their level) - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - //visit counter increases with levels, so flip the range for the "reverse" in RCM - visit(process) = visitCounter + i; - } - visitCounter += levelSize; - } - } - - nnz_lno_t numRows; - nnz_lno_t numLevels; - nnz_lno_t maxDegree; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - ScoreView scores; - ScoreView scoresAux; - nnz_view_t visit; - //The levels, stored in CRS format. - //xadj stores offsets for each level, and adj stores the rows in each level. - nnz_view_t xadj; - nnz_view_t adj; - nnz_view_t adjAux; - }; - - //Does the reversing in "reverse Cuthill-McKee") - struct OrderReverseFunctor - { - OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) - : visit(visit_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - visit(i) = numRows - visit(i) - 1; - } - nnz_view_t visit; - nnz_lno_t numRows; - }; - - //breadth-first search, producing a reverse Cuthill-McKee ordering - nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) - { - size_type nthreads = MyExecSpace::concurrency(); - if(nthreads > 64) - nthreads = 64; - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - { - nthreads = 256; - } - #endif - nnz_view_t xadj, adj; - nnz_lno_t maxDegree = 0; - //parallel_bfs will compute maxDegree - auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - nnz_lno_t maxLevelSize = 0; - Kokkos::parallel_reduce(range_policy_t(0, numLevels), MaxDegreeFunctor(xadj), Kokkos::Max(maxLevelSize)); - //visit (to be returned) contains the RCM numberings of each row - nnz_view_t visit("RCM labels", numRows); - //Populate visit wth LNO_MAX so that the "min-labeled neighbor" - //is always a node in the previous level - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - KokkosBlas::fill(visit, LNO_MAX); - //the "score" of a node is a single value that provides an ordering equivalent - //to sorting by min predecessor and then by min degree - //reduce nthreads to be a power of 2 - Kokkos::View> scores("RCM scores for sorting", maxLevelSize); - Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - nnz_view_t adjAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - Kokkos::parallel_for(team_policy_t(1, 
nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(visit, numRows)); - return visit; - } - - template - struct MinDegreeRowFunctor - { - typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const - { - size_type ideg = rowmap(i + 1) - rowmap(i); - if(ideg < lval.val) - { - lval.val = ideg; - lval.loc = i; - } - } - const_lno_row_view_t rowmap; - }; - - //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) - struct OrderToClusterFunctor - { - OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) - : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - vertClusters(i) = ordering(i) / clusterSize; - } - - const nnz_view_t ordering; - nnz_view_t vertClusters; - nnz_lno_t clusterSize; - }; - - //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS - nnz_lno_t find_peripheral() - { - typedef Kokkos::MinLoc MinLocReducer; - typedef typename MinLocReducer::value_type MinLocVal; - MinLocVal v; - Kokkos::parallel_reduce(range_policy_t(0, numRows), - MinDegreeRowFunctor(rowmap), MinLocReducer(v)); - return v.loc; - } - - nnz_view_t cuthill_mckee() - { - nnz_lno_t periph = find_peripheral(); - //run Cuthill-McKee BFS from periph - auto ordering = parallel_cuthill_mckee(periph); - return ordering; - } - - nnz_view_t rcm() - { - nnz_view_t cm = cuthill_mckee(); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(cm, numRows)); - return cm; - } - - nnz_view_t cm_cluster(nnz_lno_t clusterSize) - { - nnz_view_t cm = cuthill_mckee(); - nnz_view_t vertClusters("Vert to cluster", numRows); - OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); - Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); - return vertClusters; - } -}; - template struct BalloonClustering { diff --git a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp index 6ed2d1be38..420e622c8f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp @@ -47,6 +47,7 @@ //#define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "KokkosKernels_Controls.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" #endif @@ -78,10 +79,10 @@ namespace Impl{ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - typedef typename ain_row_index_view_type::device_type device1; - typedef typename ain_nonzero_index_view_type::device_type device2; - - typedef typename KernelHandle::nnz_lno_t idx; + using device1 = typename ain_row_index_view_type::device_type; + using device2 = typename ain_nonzero_index_view_type::device_type; + using idx = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; //TODO this is not correct, check memory space. 
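For reference, a small Kokkos sketch of the minimum-degree search pattern used by the find_peripheral() routine removed above: a parallel_reduce with a MinLoc reducer returns both the smallest row degree and a row that attains it. Types are simplified for illustration.

#include <Kokkos_Core.hpp>

using Reducer   = Kokkos::MinLoc<size_t, int>;
using MinLocVal = Reducer::value_type;  // has members .val (degree) and .loc (row)

// Return the index of a row with minimal degree, given a CRS rowmap of length numRows + 1.
int min_degree_row(const Kokkos::View<const size_t*>& rowmap, int numRows) {
  MinLocVal result;
  Kokkos::parallel_reduce(
      "find_min_degree_row", numRows,
      KOKKOS_LAMBDA(const int i, MinLocVal& lval) {
        const size_t deg = rowmap(i + 1) - rowmap(i);
        if (deg < lval.val) {
          lval.val = deg;
          lval.loc = i;
        }
      },
      Reducer(result));
  return result.loc;
}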
@@ -94,11 +95,14 @@ namespace Impl{ //return; } - if (std::is_same::value){ +#if defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION) + throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); +#else - const idx *a_xadj = (int *)row_mapA.data(); - const idx *b_xadj = (int *)row_mapB.data(); - idx *c_xadj = (int *)row_mapC.data(); + if (std::is_same::value && std::is_same::value){ + const idx *a_xadj = (const idx*) row_mapA.data(); + const idx *b_xadj = (const idx*) row_mapB.data(); + idx *c_xadj = (idx*) row_mapC.data(); const idx *a_adj = entriesA.data(); const idx *b_adj = entriesB.data(); @@ -143,6 +147,7 @@ namespace Impl{ throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; @@ -186,6 +191,9 @@ namespace Impl{ cin_nonzero_value_view_type valuesC){ #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION) + throw std::runtime_error ("SpGEMM cuSPARSE backend is not yet supported for this CUDA version\n"); +#else typedef typename KernelHandle::nnz_lno_t idx; typedef typename KernelHandle::nnz_scalar_t value_type; @@ -289,6 +297,7 @@ namespace Impl{ throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); //return; } +#endif #else (void)handle; (void)m; (void)n; (void)k; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index a8a539ef10..06a3153ad9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -787,9 +787,35 @@ class KokkosSPGEMM{ typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType my_exec_space); + //Utility to compute the number of pool chunks for L2 hashmap accumulators. + //Uses free memory query for accelerators/GPUs but assumes infinite available host memory. 
+ // + //chunk_bytes: bytes in each chunk + //ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) + { + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + size_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; + } }; - } } #include "KokkosSparse_spgemm_imp_outer.hpp" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 5d98e28b98..35f00201a2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -227,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } @@ -769,6 +765,7 @@ bool KokkosSPGEMM { //get the execution space type. KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. @@ -799,7 +796,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -812,8 +809,9 @@ bool KokkosSPGEMM } //if compressing in single step, allocate the memory as upperbound. - //TODO: two step is not there for cuda. - if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + //TODO: two step is not there for GPU. 
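For reference, the sizing policy of the compute_num_pool_chunks helper introduced above, restated as a standalone sketch: on a GPU, if the ideal pool does not fit, cap it at half of the free device memory and round the chunk count down to a power of two. The free-memory value is assumed to come from a query such as kk_get_free_total_memory; the function name below is illustrative.

#include <cstddef>

// chunk_bytes: size of one chunk; ideal_num_chunks: one chunk per thread/team.
size_t pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks,
                   size_t free_bytes, bool on_gpu) {
  if (!on_gpu)
    return ideal_num_chunks;  // assume ample host memory
  size_t num_chunks = ideal_num_chunks;
  if (num_chunks * chunk_bytes > free_bytes / 2)
    num_chunks = (free_bytes / 2) / chunk_bytes;  // use at most half the free memory
  size_t po2 = 1;
  while (po2 * 2 < num_chunks)
    po2 *= 2;  // largest power of two below num_chunks
  return po2;
}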
+ + if (compress_in_single_step || exec_gpu) { out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -842,7 +840,8 @@ bool KokkosSPGEMM timer1.reset(); //bool compression_applied = false; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -861,30 +860,9 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - size_t num_chunks = concurrency / suggested_vector_size; + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 595e216700..8fdf276e61 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -99,7 +99,7 @@ void KokkosSPGEMM Kokkos::Impl::Timer timer1; auto new_row_mapB_begin = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(0), b_row_cnt)); auto new_row_mapB_end = Kokkos::subview (row_mapB, std::make_pair (nnz_lno_t(1), b_row_cnt + 1)); - row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("origianal row flops"), a_row_cnt); + row_lno_persistent_work_view_t flops_per_row(Kokkos::ViewAllocateWithoutInitializing("original row flops"), a_row_cnt); //get maximum row flops. maxNumRoughZeros = this->getMaxRoughRowNNZ(a_row_cnt, row_mapA, entriesA, @@ -121,13 +121,11 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); - //compress in single step if it is cuda execution space. - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { + //compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; - } //compressed B fields. 
row_lno_temp_work_view_t new_row_mapB(Kokkos::ViewAllocateWithoutInitializing("new row map"), n+1); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 5303a46c40..a5fc298e2c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -221,19 +221,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -242,6 +234,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -662,7 +658,7 @@ struct KokkosSPGEMM if (c_row_size > max_first_level_hash_size){ { while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } @@ -1252,7 +1248,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //then chose the best method and parameters. 
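For reference, a sketch of the chunk-acquisition pattern appearing in the hash-map accumulator kernels above: one thread per team takes a chunk from the shared memory pool and Kokkos::single broadcasts the pointer to the rest of the team. pool_t and member_t are placeholders for this illustration; pool_t is assumed to return int* from allocate_chunk(index), in the spirit of the UniformMemoryPool used here.

#include <Kokkos_Core.hpp>

template <typename member_t, typename pool_t>
KOKKOS_INLINE_FUNCTION int* acquire_chunk(const member_t& team, const pool_t& pool, int owner) {
  int* chunk = nullptr;
  while (chunk == nullptr) {  // retry until the pool hands out a chunk
    // One thread allocates; the third argument is broadcast to all team members.
    Kokkos::single(
        Kokkos::PerTeam(team),
        [&](int*& ptr) { ptr = pool.allocate_chunk(owner); },
        chunk);
  }
  return chunk;
}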
size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1382,7 +1378,7 @@ void //required memory for L2 - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1425,29 +1421,9 @@ void chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL @@ -1463,7 +1439,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1513,7 +1489,7 @@ void } timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; @@ -1625,7 +1601,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1675,7 +1651,7 @@ void } timer1.reset(); - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 3ba3d4e443..e3a4f492a6 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -130,19 +130,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - 
#endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -151,6 +143,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -489,7 +485,7 @@ struct KokkosSPGEMM // // Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp // -// if Cuda enabled : +// if GPU: // "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. GPUTag // // else : @@ -527,7 +523,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 36afa46eef..f6f4e8e3a8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -197,19 +197,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -218,6 +210,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -780,19 +776,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -801,6 +789,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1509,13 +1501,14 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; @@ 
-1527,7 +1520,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1538,7 +1531,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu){ //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1649,33 +1642,13 @@ void KokkosSPGEMM } //initizalize value for the mem pool - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1721,8 +1694,8 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); + if (exec_gpu) { + Kokkos::parallel_for("StructureC_NC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { if (current_spgemm_algorithm == SPGEMM_KK_DENSE){ @@ -1807,8 +1780,9 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1816,7 +1790,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1826,7 +1800,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1837,7 +1811,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { //then chose the best method and parameters. 
current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1967,7 +1941,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && !exec_gpu) { nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1979,34 +1953,13 @@ void KokkosSPGEMM std::cout << "\tDense Acc - COLS:" << col_size << " max_row_size:" << max_row_size << std::endl; } } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; - KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -2051,7 +2004,7 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2587,19 +2540,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -2608,6 +2553,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index c53f8b461c..c06d4c4cb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -206,19 +206,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -227,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1330,17 +1326,17 @@ void KokkosSPGEMM ){ bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; - int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -1420,31 +1416,14 @@ void KokkosSPGEMM } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << @@ -1494,8 +1473,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1690,6 +1668,7 @@ void KokkosSPGEMM b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>:: KokkosSPGEMM_symbolic_triangle_setup(){ + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); @@ -1741,7 +1720,7 @@ void KokkosSPGEMM } size_type bnnz = set_index_entries.extent(0); - if (this->MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kkp_reduce_diff_view (this->b_row_cnt, p_rowmapB_begins, p_rowmapB_ends, bnnz); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index 119e6cddc6..6a9b67c0b2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -202,19 +202,11 @@ struct KokkosSPGEMM #endif #if defined( KOKKOS_ENABLE_OPENMP ) case KokkosKernels::Impl::Exec_OMP: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::OpenMP::hardware_thread_id(); - #else return Kokkos::OpenMP::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_THREADS ) case KokkosKernels::Impl::Exec_PTHREADS: - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE - return Kokkos::Threads::hardware_thread_id(); - #else return Kokkos::Threads::impl_hardware_thread_id(); - #endif #endif #if defined( KOKKOS_ENABLE_QTHREAD) case KokkosKernels::Impl::Exec_QTHREADS: @@ -223,6 +215,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case 
KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -900,12 +896,13 @@ void KokkosSPGEMM const int num_left_side_nnz_per_row = 2; const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -966,31 +963,13 @@ void KokkosSPGEMM pool_init_val = 0; } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1040,9 +1019,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 70b1d05391..d4c2c98a6f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -219,6 +219,10 @@ namespace KokkosSparse{ #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -816,7 +820,7 @@ namespace KokkosSparse{ // Initialize hashmaps if (c_row_size > max_first_level_hash_size){ while (tmp == NULL){ - Kokkos::single(Kokkos::PerTeam(teamMember),[=] (volatile nnz_lno_t * &memptr) { + Kokkos::single(Kokkos::PerTeam(teamMember),[&] (volatile nnz_lno_t * &memptr) { memptr = (volatile nnz_lno_t * )( memory_space.allocate_chunk(row_index)); }, tmp); } @@ -1181,6 +1185,8 @@ namespace KokkosSparse{ dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + using pool_memory_space = KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t>; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPARSE ACC MODE" << std::endl; } @@ -1238,7 +1244,7 @@ namespace KokkosSparse{ // Choose the SpGEMM algorithm and corresponding parameters if (this->spgemm_algorithm == SPGEMM_KK || this->spgemm_algorithm == SPGEMM_KK_LP){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1310,7 +1316,7 @@ namespace KokkosSparse{ } } } - // If CUDA is not enabled, we decide whether we want to use a sparse or a dense acumulator + // If non-GPU, we decide whether we want to use a sparse or a dense acumulator else { bool run_dense = false; @@ -1364,7 +1370,7 @@ namespace KokkosSparse{ // Compute the memory pool size - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; } @@ -1395,26 +1401,9 @@ namespace KokkosSparse{ chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } -#endif + nnz_lno_t num_chunks = this->template compute_num_pool_chunks + 
(chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz @@ -1428,11 +1417,10 @@ namespace KokkosSparse{ // Allocate the memory pool KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; Kokkos::Impl::Timer timer; pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); MyExecSpace().fence(); @@ -1470,7 +1458,7 @@ namespace KokkosSparse{ } timer.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_jacobi_sparseacc SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3ea7d150b6..7b91f95e09 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -48,6 +48,7 @@ #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" @@ -80,7 +81,6 @@ struct GetCoeffView,DeviceType> { template struct SPMV_Transpose_Functor { typedef typename AMatrix::execution_space execution_space; @@ -95,55 +95,57 @@ struct SPMV_Transpose_Functor { const coefficient_type alpha; AMatrix m_A; XVector m_x; - const coefficient_type beta; YVector m_y; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), m_A (m_A_), m_x (m_x_), - beta (beta_), m_y (m_y_), - rows_per_thread (rows_per_thread_) + const YVector& m_y_) : + alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type iRow) const + { + const auto row = m_A.rowConst (iRow); + const ordinal_type row_length = row.length; + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; - -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -184,10 +186,38 @@ struct SPMV_Functor { "YVector must be a rank 1 View."); } + KOKKOS_INLINE_FUNCTION + void operator() (const ordinal_type iRow) const + { + using y_value_type = typename YVector::non_const_value_type; + if (iRow >= m_A.numRows ()) { + return; + } + const KokkosSparse::SparseRowViewConst row = m_A.rowConst(iRow); + const ordinal_type row_length = static_cast (row.length); + y_value_type sum = 0; + + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry)); + } + + sum *= alpha; + + if (dobeta == 0) { + m_y(iRow) = sum ; + } else { + m_y(iRow) = beta * m_y(iRow) + sum; + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - typedef typename YVector::non_const_value_type y_value_type; + using y_value_type = typename YVector::non_const_value_type; Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& loop) { @@ -226,19 +256,27 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th if(nnz_per_row < 1) nnz_per_row = 1; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(vector_length < 1) { vector_length = 1; - while(vector_length<32 && vector_length*6 < nnz_per_row) + while(vector_length < max_vector_length && vector_length * 6 < nnz_per_row) vector_length*=2; } // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && nnz > 5000000 ) { rows_per_thread = 256; @@ -247,14 +285,12 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 256/vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -297,21 +333,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, ((int) A.graph.row_block_offsets.extent(0) == (int) omp_get_max_threads()+1) && (((uintptr_t)(const void*)(x.data())%64)==0) && (((uintptr_t)(const void*)(y.data())%64)==0) ) { + //Note BMK: this case is typically not called in practice even for OpenMP, since + //it requires row_block_offsets to have been computed in the graph. 
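// Illustrative sketch, not part of the patch: spmv_launch_parameters above now caps
// the automatically chosen vector length at the SIMD width of the backend: 32 lanes
// for a CUDA warp, 64 for a HIP wavefront, 1 otherwise. The same selection written as
// a compile-time helper (the name is illustrative) would be:
#include <Kokkos_Core.hpp>
#include <type_traits>

template <typename ExecSpace>
constexpr int max_vector_length_sketch() {
  return
#if defined(KOKKOS_ENABLE_CUDA)
      std::is_same<ExecSpace, Kokkos::Cuda>::value ? 32 :
#endif
#if defined(KOKKOS_ENABLE_HIP)
      std::is_same<ExecSpace, Kokkos::Experimental::HIP>::value ? 64 :
#endif
      1;
}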
spmv_raw_openmp_no_transpose(alpha,A,x,beta,y); return; } #endif - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. - if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} - if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} - if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule if(controls.isParameter("schedule")) { @@ -321,26 +350,45 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, use_static_schedule = true; } } - - int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); - int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; - - SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); - - if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); - } else { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + if(use_teams) { + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + // Note on 03/24/20, lbv: We can use the controls + // here to allow the user to pass in some tunning + // parameters. 
+ if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} + if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} + if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + + int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); + int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; + + SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); + + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } else { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } + } + else { + SPMV_Functor func (alpha,A,x,beta,y,1); + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); } } @@ -356,7 +404,9 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using execution_space = typename AMatrix::execution_space; if (A.numRows () <= static_cast (0)) { return; @@ -368,33 +418,46 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. 
- const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - const int rows_per_thread = RowsPerThread (NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy::team_size_recommended (op, vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, y); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > + ( 0 , nrow ) , op ); + } } template::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + + if (doalpha != 1) { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (alpha * val * m_x(iRow, k))); + } + } else { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (val * m_x(iRow, k))); + } + } + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -486,15 +581,8 @@ struct SPMV_MV_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -518,8 +606,8 @@ struct SPMV_MV_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; @@ -531,7 +619,7 @@ template struct SPMV_MV_LayoutLeft_Functor { typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -546,21 +634,23 @@ struct SPMV_MV_LayoutLeft_Functor { //! The number of columns in the input and output MultiVectors. ordinal_type n; ordinal_type rows_per_thread; + int vector_length; SPMV_MV_LayoutLeft_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const ordinal_type rows_per_thread_, + int vector_length_) : alpha (alpha_), m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + rows_per_thread (rows_per_thread_), vector_length(vector_length_) {} template KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& /* dev */, const ordinal_type& iRow, const ordinal_type& kk) const + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const { y_value_type sum[UNROLL]; @@ -581,142 +671,137 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { sum[k] += val * m_x(ind, kk + k); } - } + }); if (doalpha == -1) { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } } else { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) + sum[ii] = sumt; + else + sum[ii] = sumt * alpha; } } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + } + + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } + for (int k = 0; k < UNROLL; ++k) { + if(doalpha == 1) + sum[k] += val * m_x(ind, kk + k); + else if(doalpha == -1) + sum[k] -= val * m_x(ind, kk + k); + else + sum[k] += alpha * val * m_x(ind, kk + k); } } + + if (dobeta == 0) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = sum[k]; + } else if (dobeta == 1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + } else if (dobeta == -1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + } else { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + } } KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& /* dev */, const ordinal_type& iRow) const + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - const auto row = m_A.rowConst (iRow); // The correct type of iEntry is ordinal_type, the type of the @@ -724,48 +809,17 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + Kokkos::single(Kokkos::PerThread(dev), + [&]() { if (doalpha == -1) { sum = -sum; @@ -782,9 +836,144 @@ struct SPMV_MV_LayoutLeft_Functor { } else { m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } + }); + } + + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. 
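// Illustrative sketch, not part of the patch: strip_mine_1 above trades the manual
// __CUDA_ARCH__ / shfl_down warp reduction for Kokkos::parallel_reduce over a
// ThreadVectorRange, followed by Kokkos::single(PerThread) for the write-back, which
// is portable across backends. In isolation (names illustrative, device-accessible
// pointers assumed), the pattern is:
#include <Kokkos_Core.hpp>

using team_member_sketch = Kokkos::TeamPolicy<>::member_type;

KOKKOS_INLINE_FUNCTION
void row_dot_and_store_sketch(const team_member_sketch& dev, const int row_length,
                              const double* vals, const double* x, double& y_out) {
  double sum;
  // Each vector lane accumulates a partial sum; the reduction combines them and
  // broadcasts the result, so 'sum' is complete on every lane of this thread.
  Kokkos::parallel_reduce(
      Kokkos::ThreadVectorRange(dev, row_length),
      [&](const int i, double& lsum) { lsum += vals[i] * x[i]; }, sum);
  // Only one lane per thread writes the result back.
  Kokkos::single(Kokkos::PerThread(dev), [&]() { y_out = sum; });
}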
+ + y_value_type sum = y_value_type(); + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry),0); + } + if (doalpha == -1) { + sum = -sum; + } else if (doalpha != 1) { + sum *= alpha; + } + + if (dobeta == 0) { + m_y(iRow, 0) = sum ; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum ; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } } + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type& iRow) const + { + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. + ordinal_type kk = 0; + +#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } +#else +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(iRow, kk); + break; + + case 14: + strip_mine<14>(iRow, kk); + break; + + case 13: + strip_mine<13>(iRow, kk); + break; + + case 12: + strip_mine<12>(iRow, kk); + break; + + case 11: + strip_mine<11>(iRow, kk); + break; + + case 10: + strip_mine<10>(iRow, kk); + break; + + case 9: + strip_mine<9>(iRow, kk); + break; + + case 8: + strip_mine<8>(iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(iRow, kk); + break; + + case 6: + strip_mine<6>(iRow, kk); + break; + + case 5: + strip_mine<5>(iRow, kk); + break; + + case 4: + strip_mine<4>(iRow, kk); + break; + + case 3: + strip_mine<3>(iRow, kk); + break; + + case 2: + strip_mine<2>(iRow, kk); + break; + + case 1: + strip_mine_1(iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const @@ -812,91 +1001,92 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; } for(; kk + 8 <= n; kk += 8) strip_mine<8>(dev, iRow, kk); - if(kk < n) + if(kk < n) { switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; 
- - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; } + } +#endif // KOKKOS_FAST_COMPILE } - }; + } +}; template (0)) { return; @@ -924,39 +1115,38 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a return; } else { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + ordinal_type vector_length = 1; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length *= 2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space >( 0, nrow ), op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -965,24 +1155,20 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } } @@ -1000,7 +1186,8 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -1013,39 +1200,40 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph } if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + ordinal_type vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + //Transpose functor uses atomics which can't be vectorized on CPU + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length*=2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - typename AMatrix::const_ordinal_type nrow = A.numRows(); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + OpType op (alpha, A, x, beta, y); + + const ordinal_type nrow = A.numRows(); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::RangePolicy < typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1053,24 +1241,21 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph 2, 2, conjugate, SizeType> OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
- const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + OpType op (alpha, A, x, beta, y); + + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } @@ -1135,7 +1320,6 @@ spmv_alpha_mv (const char mode[], } } -} -} +}} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index a4f1c07258..72c8a969fe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -47,7 +47,6 @@ namespace Impl { #ifdef KOKKOS_ENABLE_OPENMP template void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatrix A, XVector x, typename YVector::const_value_type& s_b, YVector y) { - typedef typename YVector::non_const_value_type value_type; typedef typename AMatrix::ordinal_type ordinal_type; typedef typename AMatrix::non_const_size_type size_type; diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp index 9d1f44bd2a..b678142dbe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -257,8 +257,6 @@ struct SPMV < AT, AO, AD, AM, AS, { typedef Kokkos::Details::ArithTraits KAT; - typedef Kokkos::Details::ArithTraits KAT; - if (alpha == KAT::zero ()) { if (beta != KAT::one ()) { KokkosBlas::scal (y, beta, y); diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 51d2189c5c..3179a0cc31 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #include "Kokkos_InnerProductSpaceTraits.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -91,12 +92,13 @@ struct SPMV_Struct_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } @@ -104,15 +106,8 @@ struct SPMV_Struct_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : @@ -120,8 +115,8 @@ struct SPMV_Struct_Transpose_Functor { const ordinal_type ind = row.colidx(iEntry); Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -302,7 +297,7 @@ struct SPMV_Struct_Functor { }); dev.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&] (const ordinal_type& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),[&] (const ordinal_type& loop) { const ordinal_type interiorIdx = static_cast ( dev.league_rank() ) * rows_per_team + loop; if(interiorIdx >= numInterior) { return; } @@ -665,11 +660,9 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && numInterior*nnz_per_row > 5000000 ) { rows_per_thread = 256; @@ -678,14 +671,12 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 128 / vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -903,27 +894,19 @@ struct SPMV_MV_Struct_Transpose_Functor { operator() (const team_member& dev) const { // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow represents a row of the matrix, so its correct type is - // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { + const ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } - const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -947,429 +930,334 @@ struct SPMV_MV_Struct_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; - template - struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::non_const_value_type A_value_type; - typedef typename YVector::non_const_value_type y_value_type; - typedef typename Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - typedef typename YVector::non_const_value_type coefficient_type; - - const coefficient_type alpha; - AMatrix m_A; - XVector m_x; - const coefficient_type beta; - YVector m_y; - //! The number of columns in the input and output MultiVectors. - ordinal_type n; - ordinal_type rows_per_thread; - - SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, - const AMatrix& m_A_, - const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) - {} - - template - KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const - { - y_value_type sum[UNROLL]; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero (); - } +template +struct SPMV_MV_Struct_LayoutLeft_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_value_type A_value_type; + typedef typename YVector::non_const_value_type y_value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef typename YVector::non_const_value_type coefficient_type; - const auto row = m_A.rowConst (iRow); + const coefficient_type alpha; + AMatrix m_A; + XVector m_x; + const coefficient_type beta; + YVector m_y; + //! The number of columns in the input and output MultiVectors. + ordinal_type n; + ordinal_type rows_per_thread; + int vector_length; + + SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, + const AMatrix& m_A_, + const XVector& m_x_, + const coefficient_type& beta_, + const YVector& m_y_, + const ordinal_type rows_per_thread_, + int vector_length_) : + alpha (alpha_), + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), + rows_per_thread (rows_per_thread_), vector_length(vector_length_) + {} - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. 
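// [Editor's note] The hunks around this point all make the same change: raw
// CUDA constructs (threadIdx.x / blockDim.x strided loops and shfl_down
// reductions guarded by __CUDA_ARCH__) are replaced by Kokkos hierarchical
// parallelism, which is what lets these functors run on HIP as well as CUDA.
// A minimal stand-alone sketch of that pattern follows; the view names and
// the row-sum computation are hypothetical and not part of the patch.
#include <Kokkos_Core.hpp>

void team_vector_sketch(Kokkos::View<const double**> a,
                        Kokkos::View<double*> row_sums,
                        int rows_per_team) {
  using policy_type = Kokkos::TeamPolicy<>;
  using member_type = policy_type::member_type;
  const int nrows  = static_cast<int>(a.extent(0));
  const int ncols  = static_cast<int>(a.extent(1));
  const int nteams = (nrows + rows_per_team - 1) / rows_per_team;
  Kokkos::parallel_for("row_sums_sketch", policy_type(nteams, Kokkos::AUTO),
    KOKKOS_LAMBDA(const member_type& dev) {
      // Threads of a team split the team's block of rows...
      Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team),
        [&](const int loop) {
          const int iRow = dev.league_rank() * rows_per_team + loop;
          if (iRow >= nrows) return;
          double sum = 0.0;
          // ...and the vector lanes of each thread cooperate on one row;
          // the parallel_reduce replaces the explicit shfl_down reductions
          // removed in the hunks above.
          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, ncols),
            [&](const int j, double& lsum) { lsum += a(iRow, j); }, sum);
          Kokkos::single(Kokkos::PerThread(dev),
            [&]() { row_sums(iRow) = sum; });
        });
    });
}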
+ template + KOKKOS_INLINE_FUNCTION void + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - const ordinal_type ind = row.colidx(iEntry); + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] += val * m_x(ind, kk + k); - } - } - - if (doalpha == -1) { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; - } + for (int k = 0; k < UNROLL; ++k) { + sum[k] += val * m_x(ind, kk + k); } - else { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + }); + + if (doalpha == -1) { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type , y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane + } + } + else { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) sum[ii] = sumt; - } + else + sum[ii] = sumt * alpha; } - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } - } } - KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + Kokkos::single(Kokkos::PerThread(dev), + [&]() { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - - const auto row = m_A.rowConst (iRow); + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) += sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + }); + } - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? 
- Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha == -1) { - sum = -sum; - } else if (doalpha * doalpha != 1) { - sum *= alpha; - } + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + + Kokkos::single(Kokkos::PerThread(dev), + [&]() + { + if (doalpha == -1) { + sum = -sum; + } else if (doalpha * doalpha != 1) { + sum *= alpha; + } - if (dobeta == 0) { - m_y(iRow, 0) = sum ; - } else if (dobeta == 1) { - m_y(iRow, 0) += sum ; - } else if (dobeta == -1) { - m_y(iRow, 0) = -m_y(iRow, 0) + sum; - } else { - m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; - } - } - } + if (dobeta == 0) { + m_y(iRow, 0) = sum; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + }); + } - KOKKOS_INLINE_FUNCTION void - operator() (const team_member& dev) const - { - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + KOKKOS_INLINE_FUNCTION void + operator() (const team_member& dev) const + { + for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow indexes over (local) rows of the matrix, so its correct - // type is ordinal_type. + // iRow indexes over (local) rows of the matrix, so its correct + // type is ordinal_type. - const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) - * rows_per_thread + loop; - if (iRow >= m_A.numRows ()) { - return; - } + const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) + * rows_per_thread + loop; + if (iRow >= m_A.numRows ()) { + return; + } - // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it - // needs to have the same type as n. - ordinal_type kk = 0; + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
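// [Editor's note] strip_mine<UNROLL> above processes UNROLL columns of the
// multivector at once, so each matrix entry is loaded once per block of
// columns rather than once per column. The code just below walks the column
// index kk in blocks (4 under KOKKOS_FAST_COMPILE, 8 or 16 otherwise) and
// falls back to smaller blocks for the remainder. A reduced host-only sketch
// of that dispatch; process_block is a hypothetical stand-in for strip_mine.
#include <cstdio>

template <int UNROLL>
void process_block(int row, int first_col) {
  // Would update columns [first_col, first_col + UNROLL) of row 'row'.
  std::printf("row %d: columns [%d, %d)\n", row, first_col, first_col + UNROLL);
}

void process_all_columns(int row, int n) {
  int kk = 0;
  for (; kk + 4 <= n; kk += 4) process_block<4>(row, kk);  // full blocks of 4
  for (; kk < n; ++kk)         process_block<1>(row, kk);  // remaining columns
}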
+ ordinal_type kk = 0; #ifdef KOKKOS_FAST_COMPILE - for (; kk + 4 <= n; kk += 4) { - strip_mine<4>(dev, iRow, kk); - } - for( ; kk < n; ++kk) { - strip_mine<1>(dev, iRow, kk); - } + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } #else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { # else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; } - }; - - - template - static void - spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); } - if (doalpha == 0) { - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; } - return; } - else { - typedef typename AMatrix::size_type size_type; +#endif // KOKKOS_FAST_COMPILE + } + } +}; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. 
Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + template + static void + spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; + + if (A.numRows () <= static_cast (0)) { + return; + } + if (doalpha == 0) { + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } + return; + } + else { + typedef typename AMatrix::size_type size_type; + + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1377,11 +1265,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1389,12 +1273,12 @@ struct SPMV_MV_Struct_Transpose_Functor { #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1402,63 +1286,58 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. 
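// [Editor's note] A compact restatement of the launch heuristic used here:
// vector_length is doubled while 2*3*vector_length <= NNZPerRow (capped at 8),
// and the number of teams is the ceiling of
// nrow / (rows_per_thread * team_size). Stand-alone integer sketch; the
// function names are illustrative only.
#include <cstdint>

inline int pick_vector_length(std::int64_t nnz_per_row) {
  int vector_length = 1;
  while (std::int64_t(vector_length) * 2 * 3 <= nnz_per_row && vector_length < 8)
    vector_length *= 2;
  return vector_length;  // e.g. nnz_per_row = 20 gives 4, nnz_per_row = 100 gives 8
}

inline std::int64_t pick_team_count(std::int64_t nrow, int rows_per_thread, int team_size) {
  const std::int64_t rows_per_team = std::int64_t(rows_per_thread) * team_size;
  return (nrow + rows_per_team - 1) / rows_per_team;  // ceiling division
}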
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); - #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } + if (A.numRows () <= static_cast (0)) { + return; + } - // We need to scale y first ("scaling" by zero just means filling - // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } - if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; + if (doalpha != 0) { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_Transpose_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1466,11 +1345,7 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. 
const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -1478,12 +1353,12 @@ struct SPMV_MV_Struct_Transpose_Functor { #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor OpType; + typedef SPMV_MV_Struct_Transpose_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, @@ -1491,80 +1366,74 @@ struct SPMV_MV_Struct_Transpose_Functor { // team_size is a hardware resource thing so it might legitimately // be int. const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else { - Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); - } + template + static void + spmv_alpha_beta_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + if (mode[0] == NoTranspose[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } - - template - void - spmv_alpha_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; - - if (beta == KAT::zero ()) { - 
spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } + else if (mode[0] == Conjugate[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == Transpose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == ConjugateTranspose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else { + Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); } + } + template + void + spmv_alpha_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename YVector::non_const_value_type coefficient_type; + typedef Kokkos::Details::ArithTraits KAT; + if (beta == KAT::zero ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == -KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + } } } diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 25e9844940..623df284ea 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -95,7 +95,7 @@ namespace Impl{ if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); const int* rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); if (std::is_same::value) { @@ -297,7 +297,7 @@ namespace Impl{ int nnz = entries.extent_int(0); const int* rm = !std::is_same::value ? 
sptrsv_handle->get_int_rowmap_ptr() : (const int*)row_map.data(); - const int* ent = entries.data(); + const int* ent = (const int*) entries.data(); const scalar_type* vals = values.data(); const scalar_type* bv = rhs.data(); scalar_type* xv = lhs.data(); diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a9ffcd282a..271d8b2396 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2464,6 +2464,23 @@ struct ReturnRangePolicyType { } }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 295b46df9b..c069c618e6 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -72,6 +72,7 @@ struct Parameters{ int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; @@ -121,6 +122,7 @@ struct Parameters{ use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8a9306325f..bf86768d16 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -105,6 +105,92 @@ namespace Test { EXPECT_NEAR_KK(h_v1(i), h_v2(i), tol); } } -} + #if defined(KOKKOS_HALF_T_IS_FLOAT) + using halfScalarType = Kokkos::Experimental::half_t; + #endif // KOKKOS_HALF_T_IS_FLOAT + + template + struct SharedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + int C_rows, C_cols, A_cols; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + typedef typename ViewTypeA::value_type ScalarA; + typedef typename ViewTypeB::value_type ScalarB; + typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::Details::ArithTraits APT; + typedef typename APT::mag_type mag_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { + // Give each kokkos thread a vector of A + auto a_vec = A_t ? Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + + // Have all vector lanes perform the dot product + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { + auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + ScalarC ab = ScalarC(0); + for (int k = 0; k < A_cols; k++) { + auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); + auto b = B_c ? 
APT::conj(b_vec(k)) : b_vec(k); + ab += a * b; + } + C(i,j) = beta * C(i,j) + alpha * ab; + }); + }); + } + }; + // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) + template + struct Functor_BatchedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using ScalarC = typename ViewTypeC::value_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + int i = team.league_rank(); + + auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + using SubviewTypeA = decltype(_A); + using SubviewTypeB = decltype(_B); + using SubviewTypeC = decltype(_C); + struct SharedVanillaGEMM vgemm; + vgemm.A_t = A_t; vgemm.B_t = B_t; + vgemm.A_c = A_c; vgemm.B_c = B_c; + vgemm.C_rows = C.extent(1); + vgemm.C_cols = C.extent(2); + vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.A = _A; + vgemm.B = _B; + vgemm.C = _C; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm(team); + } + + inline + void run() { + Kokkos::parallel_for( + "Test::VanillaGEMM", + Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + *this); + } + }; +} #endif diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 5e253a1820..bba54ff6f0 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -63,6 +63,13 @@ #include // typeid (T) #include +#define FAILURE() {printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0;} + +#if 0 +#define TRACE() printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#else +#define TRACE() +#endif namespace { // Whether Kokkos::Details::ArithTraits implements @@ -183,6 +190,7 @@ class ArithTraitsTesterBase { KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // not using this argument int success = 1; @@ -203,7 +211,7 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { printf ("! AT::is_specialized\n"); - success = 0; + FAILURE(); } // It's OK to refer to std::numeric_limits constants in a device @@ -211,11 +219,11 @@ class ArithTraitsTesterBase { // as device functions). if (AT::is_integer != std::numeric_limits::is_integer) { printf ("AT::is_integer not same as numeric_limits\n"); - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { printf ("AT::is_exact not same as numeric_limits\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -224,34 +232,34 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. if (zero + zero != zero) { printf ("0 + 0 != 0\n"); - success = 0; + FAILURE(); } if (zero + one != one) { printf ("0 + 1 != 1\n"); - success = 0; + FAILURE(); } if (one - one != zero) { printf ("1 - 1 != 0\n"); - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). 
if ((one + one) - one != one) { printf ("(1 + 1) - 1 != 1\n"); - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { printf ("AT::abs(0) != 0\n"); - success = 0; + FAILURE(); } if (AT::abs (one) != one) { printf ("AT::abs(1) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_signed && AT::abs (-one) != one) { printf ("AT::is_signed and AT::abs(-1) != 1\n"); - success = 0; + FAILURE(); } // Need enable_if to test whether T can be compared using <=. // However, mag_type should always be comparable using <=. @@ -260,7 +268,7 @@ class ArithTraitsTesterBase { // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { printf ("AT::abs(0) > AT::abs (AT::max ())\n"); - success = 0; + FAILURE(); } dst = dst && success; @@ -312,17 +320,17 @@ class ArithTraitsTesterBase { // std::numeric_limits. if (! AT::is_specialized) { out << "ArithTraits is not specialized for T" << endl; - success = 0; + FAILURE(); } if (AT::is_integer != std::numeric_limits::is_integer) { out << "AT::is_integer != std::numeric_limits::is_integer" << endl; - success = 0; + FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { out << "AT::is_exact != std::numeric_limits::is_exact" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -331,35 +339,35 @@ class ArithTraitsTesterBase { if (zero + zero != zero) { out << "zero + zero != zero" << endl; - success = 0; + FAILURE(); } if (zero + one != one) { out << "zero + one != one" << endl; - success = 0; + FAILURE(); } if (one - one != zero) { out << "one - one != zero" << endl; - success = 0; + FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). if ((one + one) - one != one) { out << "(one + one) - one != one" << endl; - success = 0; + FAILURE(); } if (AT::abs (zero) != zero) { out << "AT::abs (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::abs (one) != one) { out << "AT::abs (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::is_signed) { if (AT::abs (-one) != one) { out << "AT::abs (-one) != one" << endl; - success = 0; + FAILURE(); } } // Need enable_if to test whether T can be compared using <=. @@ -369,19 +377,19 @@ class ArithTraitsTesterBase { // // They should work even for a set only containing zero. if (AT::abs (zero) > AT::abs (AT::max ())) { out << "AT::abs (zero) > AT::abs (AT::max ())" << endl; - success = 0; + FAILURE(); } if (AT::has_infinity) { if (! AT::isInf (AT::infinity())) { out << "AT::isInf (inf) != true" << endl; - success = 0; + FAILURE(); } } if ( ! std::is_same< ScalarType, decltype(AT::infinity()) >::value ) { std::cout << "AT::infinity() return value has wrong type" << endl; - success = 0; + FAILURE(); } // Run the parent class' remaining tests, if any. @@ -462,12 +470,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); //typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (HasTranscendentals::value) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -488,7 +497,7 @@ class ArithTraitsTesterTranscendentalBase : if (HasTranscendentals::value) { out << "HasTranscendentals::value is true" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' @@ -542,12 +551,13 @@ class ArithTraitsTesterTranscendentalBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! HasTranscendentals::value) { - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -576,20 +586,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (!equal(result,eight)) { printf ("AT::pow(2,3) != 8\n"); - success = 0; + FAILURE(); } } if (!equal(AT::pow (three, zero) , one)) { printf ("AT::pow(3,0) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, one) , three)) { printf ("AT::pow(3,1) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::pow (three, two) , nine)) { printf ("AT::pow(3,2) != 9\n"); - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. @@ -597,7 +607,7 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (three, three); if (!equal(result , twentySeven)) { printf ("AT::pow(3,3) != 27\n"); - success = 0; + FAILURE(); } } @@ -606,93 +616,93 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (-three, one); if (!equal(result , -three)) { printf ("AT::pow(-3,1) != -3\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (!equal(result , nine)) { printf ("AT::pow(-3,2) != 9\n"); - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (!equal(result , -twentySeven)) { printf ("AT::pow(-3,3) != 27\n"); - success = 0; + FAILURE(); } } if (!equal(AT::sqrt (zero) , zero)) { printf ("AT::sqrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (one) , one)) { printf ("AT::sqrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (thirtySix) , six)) { printf ("AT::sqrt(36) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (sixtyFour) , eight)) { printf ("AT::sqrt(64) != 8\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt (fortyTwo) , six)) { printf ("AT:sqrt(42) != 6\n"); - success = 0; + FAILURE(); } if (!equal(AT::sqrt (oneTwentySeven) , eleven)) { printf ("AT::sqrt(127) != 11\n"); - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); - success = 0; + FAILURE(); } } if (!equal(AT::log (one) , zero)) { printf ("AT::log(1) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::log10 (one) , zero)) { printf ("AT::log10(1) != 0\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -701,11 +711,11 @@ class ArithTraitsTesterTranscendentalBase : 
const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { ScalarType val = three; @@ -713,25 +723,25 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (one)), one)) { printf ("AT::asin(sin(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (one)), one)) { printf ("AT::acos(cos(1)) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (one)), one)) { printf ("AT::atan(tan(1)) != 1\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -752,7 +762,7 @@ class ArithTraitsTesterTranscendentalBase : if (! HasTranscendentals::value) { out << "HasTranscendentals::value is false" << endl; - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -781,20 +791,20 @@ class ArithTraitsTesterTranscendentalBase : result = AT::pow (two, three); if (result != eight) { out << "AT::pow (two, three) != eight" << endl; - success = 0; + FAILURE(); } } if (AT::pow (three, zero) != one) { out << "AT::pow (three, zero) != one" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, one) != three) { out << "AT::pow (three, one) != three" << endl; - success = 0; + FAILURE(); } if (AT::pow (three, two) != nine) { out << "AT::pow (three, two) != nine" << endl; - success = 0; + FAILURE(); } // This fails inexplicably for complex numbers on gcc 4.2.1 on Mac. 
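// [Editor's note] Every "- success = 0;" / "+ FAILURE();" hunk in
// Test_Common_ArithTraits.hpp uses the FAILURE() macro added near the top of
// that file, so a failing check now reports the file, function, and line
// instead of silently clearing the flag. Reduced stand-alone sketch of the
// same idiom; the identity checks shown are hypothetical.
#include <cstdio>

#define FAILURE() { std::printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); success = 0; }

int check_basic_identities() {
  int success = 1;
  const double zero = 0.0, one = 1.0;
  if (zero + one != one) FAILURE();  // prints the location, keeps running
  if (one - one != zero) FAILURE();
  return success;
}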
@@ -803,7 +813,7 @@ class ArithTraitsTesterTranscendentalBase : if (result != twentySeven) { out << "AT::pow (three, three) = " << result << " != twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } @@ -813,95 +823,95 @@ class ArithTraitsTesterTranscendentalBase : if (result != -three) { out << "AT::pow (-three, one) = " << result << " != -three = " << -three << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, two); if (result != nine) { out << "AT::pow (-three, two) = " << result << " != nine = " << nine << endl; - success = 0; + FAILURE(); } result = AT::pow (-three, three); if (result != -twentySeven) { out << "AT::pow (-three, three) = " << result << " != -twentySeven = " << twentySeven << endl; - success = 0; + FAILURE(); } } if (AT::sqrt (zero) != zero) { out << "AT::sqrt (zero) != zero" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (one) != one) { out << "AT::sqrt (one) != one" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (thirtySix) != six) { out << "AT::sqrt (thirtySix) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (sixtyFour) != eight) { out << "AT::sqrt (sixtyFour) != eight" << endl; - success = 0; + FAILURE(); } if (AT::is_integer) { if (AT::sqrt (fortyTwo) != six) { out << "AT::sqrt (fortyTwo) != six" << endl; - success = 0; + FAILURE(); } if (AT::sqrt (oneTwentySeven) != eleven) { out << "AT::sqrt (oneTwentySeven) != eleven" << endl; - success = 0; + FAILURE(); } } if (!equal(AT::cbrt (zero) , zero)) { printf ("AT::cbrt(0) != 0\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (one) , one)) { printf ("AT::cbrt(1) != 1\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (twentySeven) , three)) { printf ("AT::cbrt(27) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (sixtyFour) , four)) { printf ("AT::cbrt(64) != 4\n"); - success = 0; + FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt (fortyTwo) , three)) { printf ("AT:cbrt(42) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::cbrt (oneTwentySeven) , five)) { printf ("AT::cbrt(127) != 5\n"); - success = 0; + FAILURE(); } } if (!equal(AT::exp (zero) , one)) { printf ("AT::cbrt(0) != 1\n"); - success = 0; + FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj (AT::exp (val)) , AT::exp (AT::conj (val)))) { printf ("AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); - success = 0; + FAILURE(); } } if (AT::log (one) != zero) { out << "AT::log (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::log10 (one) != zero) { out << "AT::log10 (one) != zero" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { @@ -910,11 +920,11 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } else { const ScalarType val = three; @@ -922,25 +932,25 @@ class ArithTraitsTesterTranscendentalBase : const auto val_cos = AT::cos (val); if (!equal(val_sin*val_sin + val_cos*val_cos , one)) { printf ("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); - success = 0; + FAILURE(); } if (!equal(val_sin/val_cos , AT::tan(val))) { printf ("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); - success = 0; + FAILURE(); } } if (!equal(AT::asin (AT::sin (three)), 
three)) { printf ("AT::asin(sin(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::acos (AT::cos (three)), three)) { printf ("AT::acos(cos(3)) != 3\n"); - success = 0; + FAILURE(); } if (!equal(AT::atan (AT::tan (three)), three)) { printf ("AT::atan(tan(3)) != 3\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1020,17 +1030,32 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; // Apparently, std::numeric_limits::is_signed is 1 // only for real numbers. - if (AT::is_signed != std::numeric_limits::is_signed) { - success = 0; +#if defined(KOKKOS_HALF_T_IS_FLOAT) + if (std::is_same::value) { + if (AT::is_signed != 0x1) + FAILURE(); + } else +#else + { + if (AT::is_signed != std::numeric_limits::is_signed) { + printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); + FAILURE(); + } } +#endif // KOKKOS_HALF_T_IS_FLOAT + if (AT::is_complex) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1052,11 +1077,11 @@ class ArithTraitsTesterComplexBase : // Apparently, std::numeric_limits::is_signed is 1 only for real numbers. if (AT::is_signed != std::numeric_limits::is_signed) { out << "ArithTraits::is_signed != std::numeric_limits::is_signed" << endl; - success = 0; + FAILURE(); } if (AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1090,12 +1115,13 @@ class ArithTraitsTesterComplexBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! AT::is_complex) { - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1108,7 +1134,7 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne || AT::conj (onePlusOne) != oneMinusOne) { - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1129,7 +1155,7 @@ class ArithTraitsTesterComplexBase : if (! AT::is_complex) { out << "ArithTraits::is_complex is wrong" << endl; - success = 0; + FAILURE(); } typedef typename AT::mag_type mag_type; const mag_type one = Kokkos::Details::ArithTraits::one (); @@ -1142,11 +1168,11 @@ class ArithTraitsTesterComplexBase : // Test conjugation. if (AT::conj (oneMinusOne) != onePlusOne) { out << "AT::conj ((1, -1)) != (1, 1)" << endl; - success = 0; + FAILURE(); } if (AT::conj (onePlusOne) != oneMinusOne) { out << "AT::conj ((1, 1)) != (1, -1)" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1232,17 +1258,19 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (AT::is_exact) { printf ("AT::is_exact is 1\n"); - success = 0; + FAILURE(); } + if (! 
AT::isNan (AT::nan ())) { printf ("NaN is not NaN\n"); - success = 0; + FAILURE(); } const ScalarType zero = AT::zero (); @@ -1250,19 +1278,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { printf ("0 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isInf (one)) { printf ("1 is Inf\n"); - success = 0; + FAILURE(); } if (AT::isNan (zero)) { printf ("0 is NaN\n"); - success = 0; + FAILURE(); } if (AT::isNan (one)) { printf ("1 is NaN\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1283,14 +1311,14 @@ class ArithTraitsTesterFloatingPointBase : if (AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } //if (std::numeric_limits::is_iec559) { //success = success && AT::isInf (AT::inf ()); if (! AT::isNan (AT::nan ())) { out << "isNan or nan failed" << endl; - success = 0; + FAILURE(); } //} @@ -1299,19 +1327,19 @@ class ArithTraitsTesterFloatingPointBase : if (AT::isInf (zero)) { out << "isInf(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isInf (one)) { out << "isInf(one) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (zero)) { out << "isNan(zero) is 1" << endl; - success = 0; + FAILURE(); } if (AT::isNan (one)) { out << "isNan(one) is 1" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1351,13 +1379,14 @@ class ArithTraitsTesterFloatingPointBase : KOKKOS_INLINE_FUNCTION void operator () (size_type iwork, value_type& dst) const { + TRACE(); typedef Kokkos::Details::ArithTraits AT; (void) iwork; // forestall compiler warning for unused variable int success = 1; if (! AT::is_exact) { printf ("! AT:is_exact\n"); - success = 0; + FAILURE(); } // Call the base class' implementation. Every subclass' @@ -1378,7 +1407,7 @@ class ArithTraitsTesterFloatingPointBase : if (! AT::is_exact) { out << "AT::is_exact is wrong" << endl; - success = 0; + FAILURE(); } // Call the base class' implementation. 
Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1532,6 +1561,13 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) // Built-in real floating-point types // +#if defined(KOKKOS_HALF_T_IS_FLOAT) + TRACE(); + success = success && curSuccess; + curSuccess = + testArithTraitsOnDevice( + out, verbose); +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice (out, verbose); @@ -1542,7 +1578,7 @@ int runAllArithTraitsDeviceTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice, DeviceType> (out, verbose); - return success; + return success && curSuccess; } @@ -1598,7 +1634,7 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#ifndef KOKKOS_ENABLE_CUDA +#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); @@ -1609,11 +1645,17 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) // Kokkos' complex floating-point types // +#if defined(KOKKOS_HALF_T_IS_FLOAT) + success = success && curSuccess; + TRACE(); + curSuccess = testArithTraitsOnHost( + out, verbose); +#endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); //success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); - return success; + return success && curSuccess; } template @@ -1627,8 +1669,8 @@ void test_ArithTraits () int overflow(int c) { return c; } }; NullBuffer null_buffer; - //std::ostream &out = std::cout; - std::ostream out(&null_buffer); + std::ostream &out = std::cerr; + //std::ostream out(&null_buffer); bool success = true; diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index e610ded3f9..534782e590 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -39,19 +39,21 @@ IF (KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/cuda) APPEND_GLOB(CUDA_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_cuda SOURCES Test_Main.cpp ${CUDA_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(CUDA_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_cuda SOURCES Test_Main.cpp ${CUDA_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(CUDA_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse*.cpp) @@ -66,27 +68,29 @@ IF (KOKKOS_ENABLE_CUDA) "${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Sparse_Utils_cusparse.cpp") ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + 
KOKKOSKERNELS_ADD_UNIT_TEST( sparse_cuda SOURCES Test_Main.cpp ${CUDA_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(CUDA_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_cuda SOURCES Test_Main.cpp ${CUDA_GRAPH_SOURCES} + COMPONENTS graph ) #currently float 128 test is not working. So common tests are explicitly added. APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_cuda SOURCES Test_Main.cpp @@ -94,6 +98,64 @@ IF (KOKKOS_ENABLE_CUDA) ) ENDIF () +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/hip) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/hip) + + APPEND_GLOB(HIP_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Blas*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_hip + SOURCES + Test_Main.cpp + ${HIP_BLAS_SOURCES} + COMPONENTS blas + ) + + # APPEND_GLOB(HIP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Batched*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # batched_dla_hip + # SOURCES + # Test_Main.cpp + # ${HIP_BATCHED_DLA_SOURCES} + # COMPONENTS batched + # ) + + # APPEND_GLOB(HIP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse*.cpp) + # # HIP does not provide UVM, these two tests are henced remove permanently + # # IF (NOT KOKKOS_ENABLE_CUDA_UVM) + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_findRelOffset.cpp") + # LIST(REMOVE_ITEM HIP_SPARSE_SOURCES + # "${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Sparse_trsv.cpp") + # # ENDIF() + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # sparse_hip + # SOURCES + # Test_Main.cpp + # ${HIP_SPARSE_SOURCES} + # COMPONENTS sparse + # ) + + # APPEND_GLOB(HIP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Graph*.cpp) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # graph_hip + # SOURCES + # Test_Main.cpp + # ${HIP_GRAPH_SOURCES} + # COMPONENTS graph + # ) + + #currently float 128 test is not working. So common tests are explicitly added. 
+ APPEND_GLOB(HIP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip/Test_HIP_Common*.cpp) + KOKKOSKERNELS_ADD_UNIT_TEST( + common_hip + SOURCES + Test_Main.cpp + ${HIP_COMMON_SOURCES} + ) +ENDIF () + IF (KOKKOS_ENABLE_OPENMP) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/openmp) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/openmp) @@ -104,42 +166,46 @@ IF (KOKKOS_ENABLE_OPENMP) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-openmp.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_openmp SOURCES Test_Main.cpp ${OPENMP_BLAS_SOURCES} - ) + COMPONENTS blas + ) APPEND_GLOB(OPENMP_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_openmp SOURCES Test_Main.cpp ${OPENMP_BATCHED_DLA_SOURCES} - ) + COMPONENTS batched + ) APPEND_GLOB(OPENMP_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_openmp SOURCES Test_Main.cpp ${OPENMP_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(OPENMP_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_openmp SOURCES Test_Main.cpp ${OPENMP_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_openmp SOURCES Test_Main.cpp @@ -157,43 +223,47 @@ IF (KOKKOS_ENABLE_SERIAL) # SET(DISABLE_SLOW_DGEMM_DOUBLE_TEST "--gtest_filter=-serial.gemm_double") # ENDIF() - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_serial SOURCES Test_Main.cpp ${SERIAL_BLAS_SOURCES} # ARGS ${DISABLE_SLOW_DGEMM_DOUBLE_TEST} + COMPONENTS blas ) APPEND_GLOB(SERIAL_BATCHED_DLA_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Batched*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( batched_dla_serial SOURCES Test_Main.cpp ${SERIAL_BATCHED_DLA_SOURCES} + COMPONENTS batched ) APPEND_GLOB(SERIAL_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_serial SOURCES Test_Main.cpp ${SERIAL_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(SERIAL_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_serial SOURCES Test_Main.cpp ${SERIAL_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_serial SOURCES Test_Main.cpp @@ -207,35 +277,38 @@ IF (KOKKOS_ENABLE_PTHREAD) APPEND_GLOB(THREADS_BLAS_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Blas*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( blas_threads SOURCES Test_Main.cpp ${THREADS_BLAS_SOURCES} + COMPONENTS blas ) APPEND_GLOB(THREADS_SPARSE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Sparse*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( sparse_threads SOURCES 
Test_Main.cpp ${THREADS_SPARSE_SOURCES} + COMPONENTS sparse ) APPEND_GLOB(THREADS_GRAPH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Graph*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( graph_threads SOURCES Test_Main.cpp ${THREADS_GRAPH_SOURCES} + COMPONENTS graph ) APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common*.cpp) - KOKKOSKERNELS_ADD_UNIT_TEST_EXECUTABLE_AND_TEST( + KOKKOSKERNELS_ADD_UNIT_TEST( common_threads SOURCES Test_Main.cpp diff --git a/unit_test/batched/Test_Batched_SerialGemm.hpp b/unit_test/batched/Test_Batched_SerialGemm.hpp index 791c22d054..6b6109de47 100644 --- a/unit_test/batched/Test_Batched_SerialGemm.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm.hpp @@ -66,6 +66,97 @@ namespace Test { Kokkos::Profiling::popRegion(); } }; + +template + void impl_test_batched_gemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5); + ScalarType beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + Functor_TestBatchedSerialGemm(alpha, a1, b1, beta, c1).run(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
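+ // A rough sanity note on the tolerance used further below: assuming
+ // KOKKOSKERNELS_IMPL_FP16_EPSILON is the IEEE binary16 machine epsilon
+ // (2^-10 ~= 9.8e-4), the bound (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON
+ // is about 2^-9 ~= 2.0e-3. It is applied to the accumulated relative error
+ // (sum of |c1 - c_expected| over sum of |c_expected|); sum is seeded with 1,
+ // presumably to keep the denominator nonzero in the empty N = 0 case.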
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + Kokkos::fence(); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k ViewType; Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); @@ -187,3 +278,65 @@ int test_batched_gemm() { return 0; } + +template +int test_batched_gemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=1;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_gemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_gemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp index 24222cba2f..087c94f997 100644 --- a/unit_test/batched/Test_Batched_SerialGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm_Real.hpp @@ -1,3 +1,30 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, 
batched_scalar_serial_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +TEST_F( TestCategory, batched_scalar_serial_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_gemm_half(); + test_batched_gemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT + #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; diff --git a/unit_test/batched/Test_Batched_SerialTrmm.hpp b/unit_test/batched/Test_Batched_SerialTrmm.hpp index 8f8fd48758..3301f3cd42 100644 --- a/unit_test/batched/Test_Batched_SerialTrmm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrmm.hpp @@ -54,7 +54,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_SerialTrtri.hpp b/unit_test/batched/Test_Batched_SerialTrtri.hpp index c50e26ae35..f4f74d6b7c 100644 --- a/unit_test/batched/Test_Batched_SerialTrtri.hpp +++ b/unit_test/batched/Test_Batched_SerialTrtri.hpp @@ -56,7 +56,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_TeamGemm.hpp b/unit_test/batched/Test_Batched_TeamGemm.hpp index 7418361809..10f11d686d 100644 --- a/unit_test/batched/Test_Batched_TeamGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm.hpp @@ -78,7 +78,7 @@ namespace Test { typename ScalarType, typename ParamTagType, typename AlgoTagType> - void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + void impl_test_batched_teamgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { typedef typename ViewType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -130,63 +130,155 @@ namespace Test { } EXPECT_NEAR_KK( diff/sum, 0, eps); } + + template + void impl_test_batched_teamgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), 
b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamGemm(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. 
+ // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k -int test_batched_gemm() { +int test_batched_teamgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { typedef Kokkos::View ViewType; - Test::impl_test_batched_gemm(0, 10, 10, 10, 10, 10, 10); + Test::impl_test_batched_teamgemm(0, 10, 10, 10, 10, 10, 10); for (int i=0;i<10;++i) { //printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_gemm(1024, i, i, i, i, i, i); + Test::impl_test_batched_teamgemm(1024, i, i, i, i, i, i); } for (int i=0;i<10;++i) { //printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM=i; int dimN=2*i; int dimK=3*i; if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } if ((std::is_same::value) && (std::is_same::value)) { - Test::impl_test_batched_gemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + Test::impl_test_batched_teamgemm(1024, dimK, 
dimM, dimN, dimK, dimM, dimN); } } } #endif @@ -194,3 +286,64 @@ int test_batched_gemm() { return 0; } +template +int test_batched_teamgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp index abf7983966..2f66860ff4 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Complex.hpp @@ -6,32 +6,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - 
test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -39,32 +39,32 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex ) { TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm,double,param_tag_type,algo_tag_type>(); + test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::ParamTag param_tag_type; // typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp index 065fb68c97..327b1bcc21 100644 --- a/unit_test/batched/Test_Batched_TeamGemm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm_Real.hpp @@ -1,24 +1,50 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_gemm_t_t_half_half ) { + typedef 
::Test::ParamTag param_tag_type; + + test_batched_teamgemm_half(); + test_batched_teamgemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif @@ -26,22 +52,22 @@ TEST_F( TestCategory, batched_scalar_team_gemm_t_t_float_float ) { TEST_F( TestCategory, batched_scalar_team_gemm_nt_nt_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_nt_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_nt_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } TEST_F( TestCategory, batched_scalar_team_gemm_t_t_double_double ) { typedef ::Test::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_teamgemm(); } #endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp new file mode 100644 index 0000000000..09b2dfa89c --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm.hpp @@ -0,0 +1,346 @@ +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { + + template + struct ParamTag { + typedef TA transA; + typedef TB transB; + }; + + template + struct Functor_TestBatchedTeamVector { + ViewType _a, _b, _c; + + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVector(const ScalarType alpha, + const ViewType &a, + const ViewType &b, + const ScalarType beta, + const ViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int k = member.league_rank(); + + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamVectorGemm:: + invoke(member, _alpha, aa, bb, _beta, cc); + } + + inline + void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVector"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? 
"::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _c.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } + }; + + template + void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + ScalarType alpha = 1.5, beta = 3.0; + + ViewType + a0("a0", N, matAdim1,matAdim2), a1("a1", N, matAdim1,matAdim2), + b0("b0", N, matBdim1,matBdim2), b1("b1", N, matBdim1,matBdim2), + c0("c0", N, matCdim1,matCdim2), c1("c1", N, matCdim1,matCdim2); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(a0, random, value_type(1.0)); + Kokkos::fill_random(b0, random, value_type(1.0)); + Kokkos::fill_random(c0, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a1, a0); + Kokkos::deep_copy(b1, b0); + Kokkos::deep_copy(c1, c0); + + /// test body + Functor_TestBatchedTeamVector(alpha, a0, b0, beta, c0).run(); + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror c0_host = Kokkos::create_mirror_view(c0); + typename ViewType::HostMirror c1_host = Kokkos::create_mirror_view(c1); + + Kokkos::deep_copy(c0_host, c0); + Kokkos::deep_copy(c1_host, c1); + + /// check c0 = c1 ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int k=0;k + void impl_test_batched_teamvectorgemm_half(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, + const int matCdim1, const int matCdim2) { + using layout_type = typename ViewType::array_layout; + using transA = typename ParamTagType::transA; + using transB = typename ParamTagType::transB; + using execution_space = typename DeviceType::execution_space; + using host_value_type = float; + using ViewType_host_value_type = Kokkos::View; + using ats = Kokkos::Details::ArithTraits; + + /// randomized input testing views + ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); + + ViewType + a_expected("a_expected", N, matAdim1, matAdim2), a1("a1", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b1("b1", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c1("c1", N, matCdim1, matCdim2); + + // fill_random does not support half precision, so use float to + // generate random numbers and copy to half views with deep_copy + Kokkos::Random_XorShift64_Pool random(13718); + ViewType_host_value_type + a_expected_host_value_type("a_expected_host_value_type", N, matAdim1, matAdim2), + b_expected_host_value_type("b_expected_host_value_type", N, matBdim1, matBdim2), + c_expected_host_value_type("c_expected_host_value_type", N, matCdim1, matCdim2), + c1_host_value_type("c1_host_value_type", N, matCdim1, matCdim2); + + Kokkos::fill_random(a_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(b_expected_host_value_type, random, host_value_type(1.0)); + Kokkos::fill_random(c_expected_host_value_type, random, host_value_type(1.0)); + + 
Kokkos::fence(); + + Kokkos::deep_copy(a_expected, a_expected_host_value_type); + Kokkos::deep_copy(b_expected, b_expected_host_value_type); + Kokkos::deep_copy(c_expected, c_expected_host_value_type); + + Kokkos::deep_copy(a1, a_expected); + Kokkos::deep_copy(b1, b_expected); + Kokkos::deep_copy(c1, c_expected); + + //Functor_TestBatchedTeamVector(alpha, a_expected, b_expected, beta, c_expected).run(); + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = a_expected; + vgemm.B = b_expected; + vgemm.C = c_expected; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm.run(); // Compute c_expected + + Functor_TestBatchedTeamVector(alpha, a1, b1, beta, c1).run(); + + Kokkos::fence(); + + // Convert and copy half to host_value_type, on device + Kokkos::deep_copy(c_expected_host_value_type, c_expected); + Kokkos::deep_copy(c1_host_value_type, c1); + + // We may not have half precision on the host, use single precision here. + // For comparison send it to host, in host compatible type + typename ViewType_host_value_type::HostMirror c_expected_host_value_type_host = Kokkos::create_mirror_view(c_expected_host_value_type); + typename ViewType_host_value_type::HostMirror c1_host_value_type_host = Kokkos::create_mirror_view(c1_host_value_type); + + // Copy host_value_type on device to host_value_type on host + Kokkos::deep_copy(c_expected_host_value_type_host, c_expected_host_value_type); + Kokkos::deep_copy(c1_host_value_type_host, c1_host_value_type); + + // check c_expected = c1 ; this eps is about 2^-9 + // Set mag_type to host_value_type, we may not have half precision on host + using mag_type = host_value_type; + mag_type sum(1), diff(0); + + mag_type eps = (mag_type) (1 << 1) * KOKKOSKERNELS_IMPL_FP16_EPSILON; + + for (int k=0;k +int test_batched_teamvectorgemm() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + 
Test::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} + +template +int test_batched_teamvectorgemm_half() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_teamvectorgemm_half(0, 10, 10, 10, 10, 10, 10); + for (int i=0;i<10;++i) { + //printf("Testing: LayoutRight, Blksize %d\n", i); + Test::impl_test_batched_teamvectorgemm_half(1024, i, i, i, i, i, i); + } + for (int i=0;i<10;++i) { + //printf("Testing: LayoutLeft, Blksize %d\n", i); + int dimM=i; int dimN=2*i; int dimK=3*i; + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimM, dimK, dimN, dimK, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimK, dimN, dimM, dimN); } + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_teamvectorgemm_half(1024, dimK, dimM, dimN, dimK, dimM, dimN); } + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp new file mode 100644 index 0000000000..4926d20670 --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Complex.hpp @@ -0,0 +1,53 @@ +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, 
batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Unblocked>(); +} +#endif diff --git a/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp new file mode 100644 index 0000000000..de7748bd65 --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamVectorGemm_Real.hpp @@ -0,0 +1,80 @@ +#if defined(KOKKOS_HALF_T_IS_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_half_half ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm_half(); + test_batched_teamvectorgemm_half(); +} +#endif // KOKKOS_HALF_T_IS_FLOAT + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + 
//test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_float_float ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +TEST_F( TestCategory, batched_scalar_team_vector_gemm_t_t_double_double ) { + typedef ::Test::ParamTag param_tag_type; + + //test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); +} +#endif diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 55c71231f6..451b7fedac 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -25,7 +25,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp index 74fd49b988..9f72bd5e63 100644 --- a/unit_test/blas/Test_Blas3_trmm.hpp +++ b/unit_test/blas/Test_Blas3_trmm.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp index e6e98723c2..8fec44b637 100644 --- a/unit_test/blas/Test_Blas3_trsm.hpp +++ b/unit_test/blas/Test_Blas3_trsm.hpp @@ -49,7 +49,8 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas_trtri.hpp b/unit_test/blas/Test_Blas_trtri.hpp index f939b87b31..bcc6b842c8 100644 --- a/unit_test/blas/Test_Blas_trtri.hpp +++ b/unit_test/blas/Test_Blas_trtri.hpp @@ -49,7 +49,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git 
a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..8ac5c834bc --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..27e7b3b565 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/cuda/Test_Cuda_Graph_mis2.cpp b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp new file mode 100644 index 0000000000..00148fd653 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/cuda/Test_Cuda_Graph_rcm.cpp b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp new file mode 100644 index 0000000000..e7fb84820d --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 7dac558bff..6f60fc9d62 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_Distance2Color.hpp" +#include "KokkosGraph_MIS2.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" @@ -322,72 +323,12 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE -template -void test_old_d2(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth, lno_t row_size_variance) -{ - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using crsMat = KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename graph_type::row_map_type::non_const_type; - using entries_t = typename graph_type::entries_type::non_const_type; - using KernelHandle = KokkosKernelsHandle< - size_type, lno_t, double, - execution_space, memory_space, memory_space>; - //Generate graph - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); - auto G = A.graph; - rowmap_t t_rowmap("rowmap^T", numCols + 1); - entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosKernels::Impl::transpose_graph - - (numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); - auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); - auto t_rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); - auto t_entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); - std::vector algos = - {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; - for(auto algo : algos) - { - KernelHandle kh; - 
kh.create_distance2_graph_coloring_handle(algo); - // Compute the one-sided bipartite coloring. - graph_compute_distance2_color - (&kh, numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); - execution_space().fence(); - auto coloring_handle = kh.get_distance2_graph_coloring_handle(); - auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); - EXPECT_LE(numColors, numRows); - bool success = Test::verifyBipartitePartialColoring - - (numRows, numCols, rowmapHost, entriesHost, t_rowmapHost, t_entriesHost, colorsHost); - EXPECT_TRUE(success) << "Old dist-2 coloring: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; - kh.destroy_distance2_graph_coloring_handle(); - } -} -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - test_old_d2(2000, 4000, 3000 * 20, 800, 10); \ - test_old_d2(4000, 2000, 3000 * 20, 800, 10); -#else -#define DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) -#endif - #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_dist2_coloring(5000, 5000 * 20, 1000, 10); \ test_dist2_coloring(50, 50 * 10, 40, 10); \ } \ - TEST_F(TestCategory, graph##_##graph_color_deprecated_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ - { \ - DO_DEPRECATED_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - } \ TEST_F(TestCategory, graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ test_bipartite_symmetric(50, 50 * 5, 30, 1); \ diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp new file mode 100644 index 0000000000..30d32fb2dc --- /dev/null +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -0,0 +1,234 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +#include "KokkosGraph_MIS2.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" + +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +using namespace KokkosGraph; +using namespace KokkosGraph::Experimental; + +namespace Test { + +template +bool verifyD2MIS( + lno_t numVerts, + const rowmap_t& rowmap, const entries_t& entries, + const mis_t& misArray) +{ + //set a std::set of the mis, for fast membership test + std::set mis; + for(size_t i = 0; i < misArray.extent(0); i++) + mis.insert(misArray(i)); + for(lno_t i = 0; i < numVerts; i++) + { + //determine whether another vertex in the set is + //within 2 hops of i. + bool misIn2Hops = false; + for(size_type j = rowmap(i); j < rowmap(i + 1); j++) + { + lno_t nei1 = entries(j); + if(nei1 == i || nei1 >= numVerts) + continue; + if(mis.find(nei1) != mis.end()) + { + misIn2Hops = true; + break; + } + for(size_type k = rowmap(nei1); k < rowmap(nei1 + 1); k++) + { + lno_t nei2 = entries(k); + if(nei2 == i || nei2 >= numVerts) + continue; + if(mis.find(nei2) != mis.end()) + { + misIn2Hops = true; + break; + } + } + } + if(mis.find(i) == mis.end()) + { + //i is not in the set + if(!misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is not in the set,\n"; + std::cout << "but there are no vertices in the set within 2 hops.\n"; + return false; + } + } + else + { + //i is in the set + if(misIn2Hops) + { + std::cout << "INVALID D2 MIS: vertex " << i << " is in the set,\n"; + std::cout << "but there is another vertex within 2 hops which is also in the set.\n"; + return false; + } + } + } + return true; +} +} + +template +void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each 
algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_QUALITY}; + for(auto algo : algos) + { + auto mis = graph_d2_mis(symRowmap, symEntries, algo); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + bool success = Test::verifyD2MIS + + (numVerts, rowmapHost, entriesHost, misHost); + EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int) algo << ") produced invalid set."; + } +} + +template +void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) +{ + using execution_space = typename device::execution_space; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + //Generate graph, and add some out-of-bounds columns + crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + //Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numVerts, G.row_map, G.entries, symRowmap, symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + //For each algorithm, compute and verify the MIS + std::vector algos + = {MIS2_FAST, MIS2_QUALITY}; + for(auto algo : algos) + { + lno_t numClusters = 0; + auto labels = graph_mis2_coarsen(symRowmap, symEntries, numClusters, algo); + auto labelsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); + //Not a strong test, but sanity check the number of clusters returned + EXPECT_TRUE(numClusters >= 1 && numClusters <= numVerts); + //Check that every label is in the range [0, numClusters) + for(lno_t i = 0; i < numVerts; i++) + EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + } +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_mis2(5000, 5000 * 20, 1000, 10); \ + test_mis2(50, 50 * 10, 40, 10); \ + test_mis2(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ + { \ + test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ + test_mis2_coarsening(50, 50 * 10, 40, 10); \ + test_mis2_coarsening(5, 5 * 3, 5, 0); \ + } + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) \ + || (!defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif +#endif diff --git a/unit_test/graph/Test_Graph_rcm.hpp b/unit_test/graph/Test_Graph_rcm.hpp new file mode 100644 index 0000000000..eb3cd45a37 --- /dev/null +++ b/unit_test/graph/Test_Graph_rcm.hpp @@ -0,0 +1,197 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosGraph_RCM.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include + +//Generates a graph from 3D 7-pt stencil. Slices grid into 2 connected components near the middle of X dimension. 
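(Editorial aside, not part of the patch: the generate7pt helper declared just below maps grid point (x, y, z) to the row-major index x + y*gridX + z*gridX*gridY and connects each vertex to its at-most-six axis-aligned neighbours; the patch additionally drops the edges crossing a plane near the middle of the X dimension so the resulting graph has exactly two connected components. A minimal standalone sketch of just the indexing and neighbour-counting logic — names flatIndex and degree are illustrative only and do not appear in the patch:)

```cpp
// Standalone illustration of the 7-point-stencil indexing convention.
#include <cstdio>
#include <vector>

int main() {
  const int gridX = 3, gridY = 3, gridZ = 3;
  // Row-major linear index of grid point (x, y, z).
  auto flatIndex = [&](int x, int y, int z) {
    return x + y * gridX + z * gridX * gridY;
  };
  // Count each vertex's neighbours: interior vertices have 6, faces 5,
  // edges 4, corners 3 (the real generator also cuts a slice in X).
  std::vector<int> degree(gridX * gridY * gridZ, 0);
  for (int z = 0; z < gridZ; z++)
    for (int y = 0; y < gridY; y++)
      for (int x = 0; x < gridX; x++) {
        int v = flatIndex(x, y, z);
        if (x > 0)         degree[v]++;
        if (x < gridX - 1) degree[v]++;
        if (y > 0)         degree[v]++;
        if (y < gridY - 1) degree[v]++;
        if (z > 0)         degree[v]++;
        if (z < gridZ - 1) degree[v]++;
      }
  // The center of a 3x3x3 grid has all six neighbours.
  printf("center vertex %d has degree %d\n",
         flatIndex(1, 1, 1), degree[flatIndex(1, 1, 1)]);
  return 0;
}
```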
+template +void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, int gridY, int gridZ) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto getVertexID = + [=](lno_t x, lno_t y, lno_t z) -> lno_t + { + return x + y * gridX + z * gridX * gridY; + }; + lno_t numVertices = gridX * gridY * gridZ; + //Generate the graph on host (use std::vector to not need to know + //how many entries ahead of time) + std::vector rowmap(numVertices + 1); + std::vector entries; + rowmap[0] = 0; + lno_t xslice = gridX / 2; + for(lno_t k = 0; k < gridZ; k++) + { + for(lno_t j = 0; j < gridY; j++) + { + for(lno_t i = 0; i < gridX; i++) + { + lno_t v = getVertexID(i, j, k); + if(i != 0 && i != xslice + 1) + entries.push_back(getVertexID(i - 1, j, k)); + if(i != gridX - 1 && i != xslice) + entries.push_back(getVertexID(i + 1, j, k)); + if(j != 0) + entries.push_back(getVertexID(i, j - 1, k)); + if(j != gridY - 1) + entries.push_back(getVertexID(i, j + 1, k)); + if(k != 0) + entries.push_back(getVertexID(i, j, k - 1)); + if(k != gridZ - 1) + entries.push_back(getVertexID(i, j, k + 1)); + rowmap[v + 1] = entries.size(); + } + } + } + size_type numEdges = entries.size(); + //Now that the graph is formed, copy rowmap and entries to Kokkos::Views in device memory + //The nonowning host views just alias the std::vectors. + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> entriesHost(entries.data(), numEdges); + //Allocate owning views on device with the correct size. + rowmapView = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Rowmap"), numVertices + 1); + entriesView = entries_t(Kokkos::ViewAllocateWithoutInitializing("Colinds"), numEdges); + //Copy the graph from host to device + Kokkos::deep_copy(rowmapView, rowmapHost); + Kokkos::deep_copy(entriesView, entriesHost); +} + +template +int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + lno_t numVerts = rowmap.extent(0) - 1; + int bw = 0; + for(lno_t i = 0; i < numVerts; i++) + { + lno_t origRow = perm(i); + for(size_type j = rowmap(origRow); j < rowmap(origRow + 1); j++) + { + lno_t origNei = entries(j); + lno_t nei = invPerm(origNei); + if(nei > i) + { + lno_t thisBW = nei - i; + if(thisBW > bw) + bw = thisBW; + } + } + } + return bw; +} + +template +void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) +{ + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type rowmap_t; + typedef typename graph_t::entries_type entries_t; + lno_t numVerts = gridX * gridY * gridZ; + typename rowmap_t::non_const_type rowmap; + typename entries_t::non_const_type entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + auto rcm = KokkosGraph::Experimental::graph_rcm(rowmap, entries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + decltype(rcmHost) rcmPermHost(Kokkos::ViewAllocateWithoutInitializing("RCMPerm"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + rcmPermHost(rcmHost(i)) = i; + //make sure each row index shows up exactly once + { + std::vector counts(numVerts); + 
for(lno_t i = 0; i < numVerts; i++) + { + lno_t orig = rcmHost(i); + ASSERT_GE(orig, 0); + ASSERT_LT(orig, numVerts); + counts[orig]++; + } + for(lno_t i = 0; i < numVerts; i++) + ASSERT_EQ(counts[i], 1); + } + Kokkos::View identityOrder(Kokkos::ViewAllocateWithoutInitializing("Identity"), numVerts); + for(lno_t i = 0; i < numVerts; i++) + identityOrder(i) = i; + size_t origBW = maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +TEST_F( TestCategory, graph ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + test_rcm(6, 3, 3); \ + test_rcm(20, 20, 20); \ + test_rcm(100, 100, 1); \ +} + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ + && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif diff --git a/unit_test/hip/Test_HIP.hpp b/unit_test/hip/Test_HIP.hpp new file mode 100644 index 0000000000..cd4c49f16d --- /dev/null +++ b/unit_test/hip/Test_HIP.hpp @@ -0,0 +1,21 @@ +#include +#include +#include + +#if defined(KOKKOSKERNELS_TEST_ETI_ONLY) && !defined(KOKKOSKERNELS_ETI_ONLY) +#define KOKKOSKERNELS_ETI_ONLY +#endif + +class hip : public ::testing::Test { +protected: + static void SetUpTestCase() + { + } + + static void TearDownTestCase() + { + } +}; + +#define TestCategory hip +#define TestExecSpace Kokkos::Experimental::HIP diff --git a/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp new file mode 100644 index 0000000000..1aceff3e62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialEigendecomposition_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialEigendecomposition.hpp" +#include "Test_Batched_SerialEigendecomposition_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp new file mode 100644 index 0000000000..280d12eb89 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp new file mode 100644 index 0000000000..0a3425962a --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemm.hpp" +#include "Test_Batched_SerialGemm_Real.hpp" diff --git 
a/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp new file mode 100644 index 0000000000..1f405f4caa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp new file mode 100644 index 0000000000..98e69da8e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialGemv.hpp" +#include "Test_Batched_SerialGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp new file mode 100644 index 0000000000..7d0f3bcdea --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp new file mode 100644 index 0000000000..c147695515 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialInverseLU.hpp" +#include "Test_Batched_SerialInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp new file mode 100644 index 0000000000..ac11b50956 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp new file mode 100644 index 0000000000..b9bdbfb95a --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialLU.hpp" +#include "Test_Batched_SerialLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp new file mode 100644 index 0000000000..d7070fd0b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp new file mode 100644 index 0000000000..65674e04b9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialMatUtil.hpp" +#include "Test_Batched_SerialMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..059877ff2d --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 
0000000000..d09271a0e6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp new file mode 100644 index 0000000000..e10cb11259 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp new file mode 100644 index 0000000000..95b412a3a8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrmm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrmm.hpp" +#include "Test_Batched_SerialTrmm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp new file mode 100644 index 0000000000..b12b6fc203 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp new file mode 100644 index 0000000000..660293cfd2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsm.hpp" +#include "Test_Batched_SerialTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp new file mode 100644 index 0000000000..f82c94e5e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp new file mode 100644 index 0000000000..34c80371e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrsv.hpp" +#include "Test_Batched_SerialTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp new file mode 100644 index 0000000000..387aee1cc2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp new file mode 100644 index 0000000000..1f996ca4e1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_SerialTrtri_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_SerialTrtri.hpp" +#include "Test_Batched_SerialTrtri_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp new file mode 100644 index 0000000000..49b75ee6fa --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include 
"Test_Batched_TeamGemm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp new file mode 100644 index 0000000000..52cacfa3c8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemm.hpp" +#include "Test_Batched_TeamGemm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp new file mode 100644 index 0000000000..fed2bad261 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp new file mode 100644 index 0000000000..2d589ba4ef --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamGemv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamGemv.hpp" +#include "Test_Batched_TeamGemv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp new file mode 100644 index 0000000000..fa4ab4b3a1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp new file mode 100644 index 0000000000..9877053d34 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamInverseLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamInverseLU.hpp" +#include "Test_Batched_TeamInverseLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp new file mode 100644 index 0000000000..068f2faa3f --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp new file mode 100644 index 0000000000..0e09a25fb2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamLU.hpp" +#include "Test_Batched_TeamLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp new file mode 100644 index 0000000000..8a2b9d4c44 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp new file mode 100644 index 0000000000..8262c3c2eb --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamMatUtil_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamMatUtil.hpp" +#include "Test_Batched_TeamMatUtil_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..b5474a3a24 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..469fce62a9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp new file mode 100644 index 0000000000..e48617a7b6 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp new file mode 100644 index 0000000000..83ce8988d0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsm.hpp" +#include "Test_Batched_TeamTrsm_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp new file mode 100644 index 0000000000..ff75837fca --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Complex.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp new file mode 100644 index 0000000000..5fba12911e --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamTrsv_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamTrsv.hpp" +#include "Test_Batched_TeamTrsv_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp new file mode 100644 index 0000000000..e8ee97ffc7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR.hpp" +#include "Test_Batched_TeamVectorQR_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp new file mode 100644 index 0000000000..a55667f9d4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorQR_WithColumnPivoting_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp" +#include "Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp new file mode 100644 index 0000000000..aaa8ad4f91 --- /dev/null +++ b/unit_test/hip/Test_HIP_Batched_TeamVectorSolveUTV_Real.cpp @@ -0,0 +1,6 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorSolveUTV.hpp" +#include "Test_Batched_TeamVectorSolveUTV_Real.hpp" + +#include "Test_Batched_TeamVectorSolveUTV2.hpp" +#include "Test_Batched_TeamVectorSolveUTV2_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp new file mode 100644 index 0000000000..f60705ae07 --- /dev/null +++ 
b/unit_test/hip/Test_HIP_Batched_TeamVectorUTV_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_HIP.hpp" +#include "Test_Batched_TeamVectorUTV.hpp" +#include "Test_Batched_TeamVectorUTV_Real.hpp" diff --git a/unit_test/hip/Test_HIP_Blas1_abs.cpp b/unit_test/hip/Test_HIP_Blas1_abs.cpp new file mode 100644 index 0000000000..e175c8970e --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_abs.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_asum.cpp b/unit_test/hip/Test_HIP_Blas1_asum.cpp new file mode 100644 index 0000000000..c93f5f32fd --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_asum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_axpby.cpp new file mode 100644 index 0000000000..2814ecc583 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpby.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_axpy.cpp new file mode 100644 index 0000000000..8c7170d275 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_axpy.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_dot.cpp b/unit_test/hip/Test_HIP_Blas1_dot.cpp new file mode 100644 index 0000000000..2892b1e7e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_dot.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_iamax.cpp b/unit_test/hip/Test_HIP_Blas1_iamax.cpp new file mode 100644 index 0000000000..8fb34c05db --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_iamax.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_mult.cpp b/unit_test/hip/Test_HIP_Blas1_mult.cpp new file mode 100644 index 0000000000..e124061c58 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_mult.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm1.cpp b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp new file mode 100644 index 0000000000..fb292630e7 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm1.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp new file mode 100644 index 0000000000..cf2f9e7237 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp new file mode 100644 index 0000000000..4d91e62f85 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrm2_squared.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_nrminf.cpp b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp new file mode 100644 index 0000000000..67a07902f0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_nrminf.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp new file mode 100644 index 0000000000..892469cb7c --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_reciprocal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_scal.cpp b/unit_test/hip/Test_HIP_Blas1_scal.cpp new file mode 100644 index 0000000000..11df7e89b5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_scal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_sum.cpp b/unit_test/hip/Test_HIP_Blas1_sum.cpp new file mode 100644 index 0000000000..3be74c5d9a --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_sum.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/hip/Test_HIP_Blas1_team_abs.cpp b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp new file mode 100644 index 0000000000..d59b6a61de --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_abs.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp new file mode 100644 index 0000000000..0f3a2a5fec --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpby.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp new file mode 100644 index 0000000000..823154d5af --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_axpy.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_dot.cpp b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp new file mode 100644 index 0000000000..05987c8dd4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_dot.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_mult.cpp b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp new file mode 100644 index 0000000000..ca54d031f1 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_mult.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp new file mode 100644 index 0000000000..9994255a31 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_nrm2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_scal.cpp b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp new file mode 100644 index 0000000000..2f804c4dc5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_scal.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_team_update.cpp b/unit_test/hip/Test_HIP_Blas1_team_update.cpp new file mode 100644 index 0000000000..99cc8746ed --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_team_update.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas1_update.cpp b/unit_test/hip/Test_HIP_Blas1_update.cpp new file mode 100644 index 0000000000..f2388dbc9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas1_update.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_gemv.cpp new file mode 100644 index 0000000000..9df86cde64 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_gemv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp new file mode 100644 index 0000000000..da40621400 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas2_team_gemv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_gemm.cpp b/unit_test/hip/Test_HIP_Blas3_gemm.cpp new file mode 100644 index 0000000000..9fdd5004a4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_gemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trmm.cpp b/unit_test/hip/Test_HIP_Blas3_trmm.cpp new file mode 100644 index 0000000000..baaf52d8a5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trmm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas3_trsm.cpp b/unit_test/hip/Test_HIP_Blas3_trsm.cpp new file mode 100644 index 0000000000..fa4ce5e728 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas3_trsm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Blas_gesv.cpp b/unit_test/hip/Test_HIP_Blas_gesv.cpp new file mode 100644 index 
0000000000..7d4a4bb0c4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas_gesv.cpp @@ -0,0 +1,4 @@ +#include +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include +#endif diff --git a/unit_test/hip/Test_HIP_Blas_trtri.cpp b/unit_test/hip/Test_HIP_Blas_trtri.cpp new file mode 100644 index 0000000000..e5b58ad470 --- /dev/null +++ b/unit_test/hip/Test_HIP_Blas_trtri.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_ArithTraits.cpp b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp new file mode 100644 index 0000000000..6482ba2dba --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_ArithTraits.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Sorting.cpp b/unit_test/hip/Test_HIP_Common_Sorting.cpp new file mode 100644 index 0000000000..f01730e654 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_Transpose.cpp b/unit_test/hip/Test_HIP_Common_Transpose.cpp new file mode 100644 index 0000000000..d81855df62 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_Transpose.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Common_set_bit_count.cpp b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp new file mode 100644 index 0000000000..bd2fd76423 --- /dev/null +++ b/unit_test/hip/Test_HIP_Common_set_bit_count.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color.cpp b/unit_test/hip/Test_HIP_Graph_graph_color.cpp new file mode 100644 index 0000000000..01343e32c5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp new file mode 100644 index 0000000000..5ca8df65dc --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_d2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp new file mode 100644 index 0000000000..b24e4bf4b4 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_graph_color_deterministic.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Graph_rcm.cpp b/unit_test/hip/Test_HIP_Graph_rcm.cpp new file mode 100644 index 0000000000..652eb9ade5 --- /dev/null +++ b/unit_test/hip/Test_HIP_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp new file mode 100644 index 0000000000..782e8152a2 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_CrsMatrix.cpp @@ -0,0 +1,3 @@ +#include +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp new file mode 100644 index 0000000000..986460a37b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_block_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp new file mode 100644 index 0000000000..0d82182e9b --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_findRelOffset.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp new file mode 100644 index 0000000000..b63fee6a94 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_gauss_seidel.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git 
a/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp new file mode 100644 index 0000000000..72bf132cf0 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumInto.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp new file mode 100644 index 0000000000..daf96e433d --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_replaceSumIntoLonger.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spadd.cpp b/unit_test/hip/Test_HIP_Sparse_spadd.cpp new file mode 100644 index 0000000000..98736daebf --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spadd.cpp @@ -0,0 +1,3 @@ +#include +#include + diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp new file mode 100644 index 0000000000..2402f7596e --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp new file mode 100644 index 0000000000..6ab09e6743 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spgemm_jacobi.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spiluk.cpp b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp new file mode 100644 index 0000000000..83f2a59192 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spiluk.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_spmv.cpp b/unit_test/hip/Test_HIP_Sparse_spmv.cpp new file mode 100644 index 0000000000..18edf035e8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_spmv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp new file mode 100644 index 0000000000..cb18ff3ed8 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_sptrsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/hip/Test_HIP_Sparse_trsv.cpp b/unit_test/hip/Test_HIP_Sparse_trsv.cpp new file mode 100644 index 0000000000..c371d334e9 --- /dev/null +++ b/unit_test/hip/Test_HIP_Sparse_trsv.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..9adfd61517 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..e841dea6a5 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp new file mode 100644 index 0000000000..8622411b0f --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp new file mode 100644 index 0000000000..eb8164cb30 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Graph_rcm.cpp @@ 
-0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..991031d817 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..cc2041cefc --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/serial/Test_Serial_Graph_mis2.cpp b/unit_test/serial/Test_Serial_Graph_mis2.cpp new file mode 100644 index 0000000000..38db82cfc9 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_mis2.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/serial/Test_Serial_Graph_rcm.cpp b/unit_test/serial/Test_Serial_Graph_rcm.cpp new file mode 100644 index 0000000000..ac225ba858 --- /dev/null +++ b/unit_test/serial/Test_Serial_Graph_rcm.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 5645f1d2ce..9993d46e22 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -81,8 +81,8 @@ int run_gauss_seidel( bool is_symmetric_graph, int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. int cluster_size = 1, - ClusteringAlgorithm cluster_algorithm = CLUSTER_DEFAULT, - bool classic = false) // only with two-stage, true for sptrsv instead of richardson + bool classic = false, // only with two-stage, true for sptrsv instead of richardson + ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; @@ -101,7 +101,7 @@ int run_gauss_seidel( kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); if(gs_algorithm == GS_CLUSTER) - kh.create_gs_handle(cluster_algorithm, cluster_size); + kh.create_gs_handle(clusterAlgo, cluster_size); else if(gs_algorithm == GS_TWOSTAGE) { // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); @@ -282,9 +282,10 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; + std::vector clusteringAlgos = {CLUSTER_MIS2, CLUSTER_BALLOON}; for(int csize = 0; csize < 3; csize++) { - for(int algo = 0; algo < (int) NUM_CLUSTERING_ALGORITHMS; algo++) + for(auto clusterAlgo : clusteringAlgos) { for(int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -292,7 +293,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ //Zero out X before solving Kokkos::deep_copy(x_vector, zero); run_gauss_seidel( - input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], (ClusteringAlgorithm) algo); + input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, clusterAlgo); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -312,10 +313,9 @@ void 
test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ //*** Two-stage version (classic) **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -435,10 +435,9 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ for(int apply_type = 0; apply_type < apply_count; ++apply_type) { //Zero out X before solving - ClusteringAlgorithm cluster_algo = (ClusteringAlgorithm)0; Kokkos::deep_copy(x_vector, zero); run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, cluster_algo, true); + (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -455,43 +454,6 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ } } -template -void test_rcm(lno_t numRows, size_type nnzPerRow, lno_t bandwidth) -{ - using namespace Test; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_row_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef KokkosKernelsHandle - KernelHandle; - srand(245); - size_type nnzTotal = nnzPerRow * numRows; - lno_t nnzVariance = nnzPerRow / 4; - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numRows, nnzTotal, nnzVariance, bandwidth); - lno_row_view_t symRowmap; - lno_nnz_view_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numRows, A.graph.row_map, A.graph.entries, symRowmap, symEntries); - typedef KokkosSparse::Impl::RCM rcm_t; - rcm_t rcm(numRows, symRowmap, symEntries); - lno_nnz_view_t rcmOrder = rcm.rcm(); - //perm(i) = the node with timestamp i - //make sure that perm is in fact a permutation matrix (contains each row exactly once) - Kokkos::View rcmHost("RCM row ordering", numRows); - Kokkos::deep_copy(rcmHost, rcmOrder); - std::set rowSet; - for(lno_t i = 0; i < numRows; i++) - rowSet.insert(rcmHost(i)); - if((lno_t) rowSet.size() != numRows) - { - std::cerr << "Only got back " << rowSet.size() << " unique row IDs!\n"; - return; - } -} - template void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { const scalar_t zero = Kokkos::Details::ArithTraits::zero(); @@ -659,9 +621,6 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR TEST_F( TestCategory, sparse ## _ ## gauss_seidel_zero_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_sgs_zero_rows(); \ } \ -TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_rcm(10000, 50, 2000); \ -} \ TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_balloon_clustering(5000, 100, 2000); \ } \ diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index a7b42fa697..5a033fdf34 100644 --- 
a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -42,7 +42,7 @@ struct fSPMV { if(error > eps) { err++; - printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); + //printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); } } }; @@ -51,7 +51,8 @@ struct fSPMV { template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta){ + typename y_vector_type::non_const_value_type beta, + char mode = 'N'){ using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; @@ -61,7 +62,9 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using size_type = typename size_type_view_t::non_const_value_type; using lno_t = typename lno_view_t::non_const_value_type; using scalar_t = typename scalar_view_t::non_const_value_type; + using KAT = Kokkos::ArithTraits; + mode = toupper(mode); typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values,input_mat.values); @@ -84,15 +87,24 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, lno_t nr = input_mat.numRows(); - for (lno_t i = 0; i < nr; ++i){ - scalar_t result = 0; - for (size_type j = h_rowmap(i); j < h_rowmap(i+1); ++j){ + //first, scale y by beta + for(size_t i = 0; i < h_y.extent(0); i++) + h_y(i) *= beta; + + //then go through the matrix and accumulate the matrix-vector product + for (lno_t row = 0; row < nr; ++row) { + for (size_type j = h_rowmap(row); j < h_rowmap(row+1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - scalar_t vector_val = h_x(col); - result += val * vector_val; + if(mode == 'N') + h_y(row) += alpha * val * h_x(col); + else if(mode == 'C') + h_y(row) += alpha * KAT::conj(val) * h_x(col); + else if(mode == 'T') + h_y(col) += alpha * val * h_x(row); + else if(mode == 'H') + h_y(col) += alpha * KAT::conj(val) * h_x(row); } - h_y(i) = beta * h_y(i) + alpha * result; } KokkosKernels::Impl::safe_host_to_device_deep_copy (y.extent(0), h_y, y); Kokkos::fence(); @@ -102,7 +114,7 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { + typename y_vector_type::non_const_value_type beta, char mode) { //typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -114,15 +126,25 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // so let us use y_value_type to determine // the appropriate tolerance precision. const y_value_mag_type eps = std::is_same::value ? 2*1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); - y_vector_type expected_y("expected", nr); + bool transposed = (mode == 'T') || (mode == 'H'); + y_vector_type expected_y("expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); - sequential_spmv(input_mat, x, expected_y, alpha, beta); - //KokkosKernels::Impl::print_1Dview(expected_y); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); - //KokkosKernels::Impl::print_1Dview(y); + sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 1D, mode " << mode << ": threw exception:\n" << msg << '\n'; int num_errors = 0; Kokkos::parallel_reduce("KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), @@ -137,7 +159,7 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV) { + typename y_vector_type::non_const_value_type beta, int numMV, char mode) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -153,8 +175,19 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto Kokkos::fence(); - KokkosSparse::spmv("N", alpha, input_mat, x, beta, y); - + bool threw = false; + std::string msg; + try + { + KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + Kokkos::fence(); + } + catch(std::exception& e) + { + threw = true; + msg = e.what(); + } + ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 2D, mode " << mode << ": threw exception:\n" << msg << '\n'; for (int i = 0; i < numMV; ++i){ auto x_i = Kokkos::subview (x, Kokkos::ALL (), i); @@ -162,7 +195,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto auto y_i = Kokkos::subview (expected_y, Kokkos::ALL (), i); Kokkos::fence(); - sequential_spmv(input_mat, x_i, y_i, alpha, beta); + sequential_spmv(input_mat, x_i, y_i, alpha, beta, mode); auto y_spmv = Kokkos::subview (y, Kokkos::ALL (), i); int num_errors = 0; @@ -170,8 +203,9 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto my_exec_space(0,y_i.extent(0)), fSPMV(y_i, y_spmv, eps), num_errors); - if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", - num_errors, y_i.extent_int(0), i); + if(num_errors>0) + std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) + << " for mv " << i << " (alpha=" << alpha << ", beta=" << beta << ", mode = " << mode << ")\n"; EXPECT_TRUE(num_errors==0); } } @@ -306,6 +340,23 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls, } // namespace Test +template +scalar_t randomUpperBound(int mag) +{ + return (scalar_t) mag; +} + +template <> +Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} + +template <> +Kokkos::complex randomUpperBound>(int mag) +{ + return Kokkos::complex(mag, mag); +} template void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance){ @@ -326,18 +377,33 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari x_vector_type input_x ("x", nc); y_vector_type output_y ("y", nr); + x_vector_type input_xt ("x", nr); + y_vector_type output_yt ("y", nc); 
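(Editorial aside, not part of the patch: the mode-aware checks in this test compare KokkosSparse::spmv against the sequential reference above, which computes y = beta*y + alpha*op(A)*x with op(A) = A, conj(A), A^T, or A^H for modes 'N', 'C', 'T', 'H'. For 'T'/'H' the input vector has numRows entries and the output has numCols, which is why input_xt/output_yt are sized oppositely to input_x/output_y. A self-contained sketch of that reference semantics on std::vector-based CRS data — refSpmv and the tiny example matrix are illustrative only and do not appear in the patch:)

```cpp
// Reference SpMV semantics for the four modes, on plain host containers.
#include <complex>
#include <cstdio>
#include <vector>

using Scalar = std::complex<double>;

void refSpmv(char mode, Scalar alpha, Scalar beta,
             const std::vector<int>& rowmap, const std::vector<int>& entries,
             const std::vector<Scalar>& values,
             const std::vector<Scalar>& x, std::vector<Scalar>& y) {
  for (auto& yi : y) yi *= beta;  // scale y by beta first
  const int numRows = (int)rowmap.size() - 1;
  for (int row = 0; row < numRows; ++row)
    for (int j = rowmap[row]; j < rowmap[row + 1]; ++j) {
      const int col = entries[j];
      // 'C' and 'H' conjugate the matrix values.
      const Scalar val =
          (mode == 'C' || mode == 'H') ? std::conj(values[j]) : values[j];
      if (mode == 'N' || mode == 'C')
        y[row] += alpha * val * x[col];  // y := y + alpha*op(A)*x
      else
        y[col] += alpha * val * x[row];  // 'T'/'H': scatter into y[col]
    }
}

int main() {
  // 2x3 matrix [[1, 0, 2i], [0, 3, 0]] stored in CRS form.
  std::vector<int> rowmap{0, 2, 3}, entries{0, 2, 1};
  std::vector<Scalar> values{{1, 0}, {0, 2}, {3, 0}};
  std::vector<Scalar> x{{1, 0}, {1, 0}, {1, 0}}, y(2, Scalar{0, 0});
  refSpmv('N', Scalar{1, 0}, Scalar{0, 0}, rowmap, entries, values, x, y);
  printf("y[0] = (%g, %g)\n", y[0].real(), y[0].imag());  // expect (1, 2)
  return 0;
}
```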
Kokkos::Random_XorShift64_Pool rand_pool(13718); typedef typename x_vector_type::value_type ScalarX; typedef typename y_vector_type::value_type ScalarY; - Kokkos::fill_random(input_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(output_y,rand_pool,ScalarY(10)); + Kokkos::fill_random(input_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(input_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(output_yt,rand_pool,randomUpperBound(10)); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0); - Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0); + std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0, mode); + } + for(auto mode : transModes) + { + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 0.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 0.0, 1.0, mode); + Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 1.0, mode); + } } template @@ -353,21 +419,66 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v ViewTypeY b_y("B",numCols,numMV); ViewTypeY b_y_copy("B",numCols,numMV); + ViewTypeX b_xt("A",numCols,numMV); + ViewTypeY b_yt("B",numRows,numMV); + ViewTypeY b_yt_copy("B",numRows,numMV); + Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); - Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_x,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_y,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_xt,rand_pool,randomUpperBound(10)); + Kokkos::fill_random(b_yt,rand_pool,randomUpperBound(10)); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); Kokkos::deep_copy(b_y_copy, b_y); + Kokkos::deep_copy(b_yt_copy, b_yt); + std::vector nonTransModes = {'N', 'C'}; + std::vector transModes = {'T', 'H'}; + for(auto mode : nonTransModes) + { + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV, mode); + } + for(auto mode : transModes) + { + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, numMV, mode); + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 1.0, numMV, mode); + } +} - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV); +template +void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef Kokkos::View ViewTypeX; + typedef Kokkos::View ViewTypeY; + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numRows,nnz,row_size_variance, bandwidth); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + for(int nv = 1; nv <= numMV; nv++) { + ViewTypeX b_x("A",numRows,nv); + ViewTypeY b_y("B",numRows,nv); + ViewTypeY b_y_copy("B",numRows,nv); + + 
Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + + Kokkos::deep_copy(b_y_copy, b_y); + + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + } } template @@ -736,6 +847,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ diff --git a/unit_test/standalone/main.cpp b/unit_test/standalone/main.cpp index 68d336805e..259a572c7a 100644 --- a/unit_test/standalone/main.cpp +++ b/unit_test/standalone/main.cpp @@ -3,6 +3,9 @@ #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA #include #endif +#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP +#include +#endif #ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #include #endif diff --git a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp new file mode 100644 index 0000000000..02b4d3681f --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Complex.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Complex.hpp" diff --git a/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp new file mode 100644 index 0000000000..5c17d8df16 --- /dev/null +++ b/unit_test/threads/Test_Threads_Batched_TeamVectorGemm_Real.cpp @@ -0,0 +1,3 @@ +#include +#include "Test_Batched_TeamVectorGemm.hpp" +#include "Test_Batched_TeamVectorGemm_Real.hpp" diff --git a/unit_test/threads/Test_Threads_Graph_mis2.cpp b/unit_test/threads/Test_Threads_Graph_mis2.cpp new file mode 100644 index 0000000000..cbf15a7662 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_mis2.cpp @@ -0,0 +1,3 @@ +#include +#include + diff --git a/unit_test/threads/Test_Threads_Graph_rcm.cpp b/unit_test/threads/Test_Threads_Graph_rcm.cpp new file mode 100644 index 0000000000..37184bb806 --- /dev/null +++ b/unit_test/threads/Test_Threads_Graph_rcm.cpp @@ -0,0 +1,3 @@ +#include +#include +
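
(Editorial aside, not part of the patch: the new Test_Graph_mis2.hpp earlier in this diff verifies the distance-2 MIS property — no two set members lie within two hops of each other, and every non-member has a set member within two hops, otherwise the set would not be maximal. A host-only sketch of that check on a plain adjacency list — isValidD2MIS and adj are illustrative names that do not appear in the patch:)

```cpp
// Host-only check of the distance-2 maximal-independent-set property.
#include <cstdio>
#include <set>
#include <vector>

bool isValidD2MIS(const std::vector<std::vector<int>>& adj,
                  const std::set<int>& S) {
  const int n = (int)adj.size();
  for (int v = 0; v < n; ++v) {
    // Is some member of S within two hops of v (excluding v itself)?
    bool memberWithin2Hops = false;
    for (int n1 : adj[v]) {
      if (n1 != v && S.count(n1)) { memberWithin2Hops = true; break; }
      for (int n2 : adj[n1]) {
        if (n2 != v && S.count(n2)) { memberWithin2Hops = true; break; }
      }
      if (memberWithin2Hops) break;
    }
    if (S.count(v) && memberWithin2Hops) return false;    // independence violated
    if (!S.count(v) && !memberWithin2Hops) return false;  // set is not maximal
  }
  return true;
}

int main() {
  // Path graph 0-1-2-3-4: {0, 3} is a valid distance-2 MIS, {0, 2} is not
  // (0 and 2 are only two hops apart).
  std::vector<std::vector<int>> adj{{1}, {0, 2}, {1, 3}, {2, 4}, {3}};
  printf("{0,3}: %d  {0,2}: %d\n",
         isValidD2MIS(adj, {0, 3}), isValidD2MIS(adj, {0, 2}));
  return 0;
}
```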