From afd6f62d53b6c154701888d64187859b62e5bf6b Mon Sep 17 00:00:00 2001
From: cwschilly <132086024+cwschilly@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:20:49 -0400
Subject: [PATCH] * #128: set up GPU CI pipelines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #128: specify cuda dir
* #128: temporarily run GPU on every push; stop MPI builds
* #128: specify correct dockerfile
* #128: provide correct build/source dirs
* #128: rework MPI in build script
* #128: rework MPI in other gpu build script
* #128: CI build script try another configuration and fix invalid path
* #128: fix missing letter in path for a gpu build
* #128: add newlines to end of files
* #128: add spack find -p to find cuda root
* #128: only run one pipeline; add cuda paths
* #128: add kokkos variables
* #128: add Tpetra_INST_SERIAL:BOOL=ON
* #128: add CUDA root flag
* #128: use correct kokkos architecture
* #128: enable cusolver and cusparse
* #128: emulate local build
* #128: try with different docker image
* #128: update cuda path
* Try adding debug flag for Buildx
* Tpetra: Disable cudaMemcpyAsync for Intercept.cpp
* #128: lower -j
* #128: use default kokkos architecture
* #128: use fewer processes for GPU testing
* 128: re-enable Kokkos_ARCH_AMPERE86
* #128: add cuda sample build to CI to validate CUDA
* #128: run cuda test on NGA host
* #128: update jobs dependency on CI for cuda
* #128: add CUDA sample run
* #128: remove command not existing in CI
* #128: change cuda path
* #128: try to display information about driver
* #128: change bad command in CI
* #128: fix command
* #128: try install nvidia util in Docker container
* #128: remove commands
* #128: fix cuda path
* #128: fix dockerfile
* #128: add different cuda test images
* Run container in separate step
* Remove unneeded code
* Apply changes to Epetra=OFF
* Try to build and run docker within same step
* #128: remove unused old CI files
* #128: check both gpu pipelines
* #128: Tpetra_INST_SERIAL=ON
* #128: fix workflow name
* #128: rework with cuda 11.4 dockerfile
* #168: try to simplify CI shell script
* #128: try simplify shell script
* #128: remove libraries path for blas and lapack to check if resolved
* #128: try remove Lapack and blas lib paths from cmake call
* #128: try again changing path dynamically
* #128: fix another path
* #128: fix blas path
* #128: apply working configuration to other build scripts
* #128: restore triggering workflows on PR
* #128: disable GPU build job for PR having `EpetraMPI T1` label
* #128: enable GPU build only with EpetraMPI T2 and EpetraMPI T3 labels
* #128: upload test log
* #128: fix typo
* #128: fix artifacts
* #128: add junit report for tests
* #128: add junit reporting in CI and set
* #128: fix artifact name
* #128: fix artifacts missing
* #128: fix extra slash char in path
* #128: fix artifacts path
* #128: fix path in github action
* #128: try mounting artifacts folder into the host runner
* #128: use same logic for gpu or non-gpu pipelines
* 128: Finalize pipelines (GPU on push, MPI cancellations)
* 128: remove label requirements
* Revert "Tpetra: Disable cudaMemcpyAsync for Intercept.cpp"
  This reverts commit 5db2d5d5b18f722f79e75736399048c1c3592b31.
* #128: test intercept reversion
* Revert "Revert "Tpetra: Disable cudaMemcpyAsync for Intercept.cpp""
  This reverts commit de87a2248864cb8a2eaf03cba12b836e0bd4db7f.
* #128: fix underscore
* #128: run GPU pipeline on merge to fy23 develop

---------

Co-authored-by: Thomas Dutheillet-Lamonthézie
Co-authored-by: Jacob Domagala
---
 .github/workflows/ci-gpu-epetraOFF.yml        | 55 +++++++++++
 .github/workflows/ci-gpu-epetraON.yml         | 55 +++++++++++
 nga-ci/build-gpu-epetraOFF.sh                 | 95 +++++++++++++++++++
 nga-ci/build-gpu-epetraON.sh                  | 95 +++++++++++++++++++
 nga-ci/gpu-epetraOFF.dockerfile               | 11 +++
 nga-ci/gpu-epetraON.dockerfile                | 11 +++
 nga-ci/test-gpu.sh                            | 25 +++++
 nga-ci/test-mpi.sh                            | 25 +++++
 .../core/test/KokkosIntegration/Intercept.cpp | 22 ++---
 9 files changed, 383 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/ci-gpu-epetraOFF.yml
 create mode 100644 .github/workflows/ci-gpu-epetraON.yml
 create mode 100644 nga-ci/build-gpu-epetraOFF.sh
 create mode 100644 nga-ci/build-gpu-epetraON.sh
 create mode 100644 nga-ci/gpu-epetraOFF.dockerfile
 create mode 100644 nga-ci/gpu-epetraON.dockerfile
 create mode 100644 nga-ci/test-gpu.sh
 create mode 100644 nga-ci/test-mpi.sh

diff --git a/.github/workflows/ci-gpu-epetraOFF.yml b/.github/workflows/ci-gpu-epetraOFF.yml
new file mode 100644
index 000000000000..044be0b4b512
--- /dev/null
+++ b/.github/workflows/ci-gpu-epetraOFF.yml
@@ -0,0 +1,55 @@
+name: GPU-EpetraOFF
+
+# Trigger the workflow on merge to NGA-FY23-develop
+on:
+  push:
+    branches:
+      - NGA-FY23-develop
+  workflow_dispatch:
+
+# Cancel any existing jobs
+concurrency:
+  group: ${{ github.event.repository.name }}-${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  build:
+    runs-on: self-hosted
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v3
+      - name: CI Variables
+        run: echo "DOCKER_TAG=$(echo ${{ github.ref }} | cut -d'/' -f3- | sed 's/[^a-z0-9_-]/__/gi')" >> $GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          buildkitd-flags: --debug
+      - name: Inspect Builder
+        run: |
+          echo "Name: ${{ steps.buildx.outputs.name }}"
+          echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
+          echo "Status: ${{ steps.buildx.outputs.status }}"
+          echo "Flags: ${{ steps.buildx.outputs.flags }}"
+          echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
+          echo "DOCKER_TAG: ${{ env.DOCKER_TAG }}"
+      - name: Build and Run Docker Image
+        run: |
+          docker build -t ${{ env.DOCKER_TAG }} -f ./nga-ci/gpu-epetraOFF.dockerfile .
+          docker run -v /tmp/artifacts:/tmp/artifacts --gpus all ${{ env.DOCKER_TAG }} /opt/src/Trilinos/nga-ci/test-gpu.sh
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: Artifacts
+          path: /tmp/artifacts/*
+          if-no-files-found: ignore
+      - name: Report Test results
+        uses: phoenix-actions/test-reporting@v12
+        if: success() || failure()
+        with:
+          name: Tests report (GPU-EpetraOFF)
+          path: /tmp/artifacts/junit-tests-report.xml
+          reporter: java-junit
+          output-to: step-summary
+          fail-on-error: 'true'
diff --git a/.github/workflows/ci-gpu-epetraON.yml b/.github/workflows/ci-gpu-epetraON.yml
new file mode 100644
index 000000000000..943a0d712f21
--- /dev/null
+++ b/.github/workflows/ci-gpu-epetraON.yml
@@ -0,0 +1,55 @@
+name: GPU-EpetraON
+
+# Trigger the workflow on merge to NGA-FY23-develop
+on:
+  push:
+    branches:
+      - NGA-FY23-develop
+  workflow_dispatch:
+
+# Cancel any existing jobs
+concurrency:
+  group: ${{ github.event.repository.name }}-${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  build:
+    runs-on: self-hosted
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v3
+      - name: CI Variables
+        run: echo "DOCKER_TAG=$(echo ${{ github.ref }} | cut -d'/' -f3- | sed 's/[^a-z0-9_-]/__/gi')" >> $GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          buildkitd-flags: --debug
+      - name: Inspect Builder
+        run: |
+          echo "Name: ${{ steps.buildx.outputs.name }}"
+          echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
+          echo "Status: ${{ steps.buildx.outputs.status }}"
+          echo "Flags: ${{ steps.buildx.outputs.flags }}"
+          echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
+          echo "DOCKER_TAG: ${{ env.DOCKER_TAG }}"
+      - name: Build and Run Docker Image
+        run: |
+          docker build -t ${{ env.DOCKER_TAG }} -f ./nga-ci/gpu-epetraON.dockerfile .
+          docker run -v /tmp/artifacts:/tmp/artifacts --gpus all ${{ env.DOCKER_TAG }} /opt/src/Trilinos/nga-ci/test-gpu.sh
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: Artifacts
+          path: /tmp/artifacts/*
+          if-no-files-found: ignore
+      - name: Report Test results
+        uses: phoenix-actions/test-reporting@v12
+        if: success() || failure()
+        with:
+          name: Tests report (GPU-EpetraON)
+          path: /tmp/artifacts/junit-tests-report.xml
+          reporter: java-junit
+          output-to: step-summary
+          fail-on-error: 'true'
diff --git a/nga-ci/build-gpu-epetraOFF.sh b/nga-ci/build-gpu-epetraOFF.sh
new file mode 100644
index 000000000000..cf26f8b5c5f1
--- /dev/null
+++ b/nga-ci/build-gpu-epetraOFF.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+
+export MPI_ROOT="$(dirname $(which mpicc))"
+export MPICC="${MPI_ROOT}/mpicc"
+export MPICXX="${MPI_ROOT}/mpicxx"
+export MPIF90="${MPI_ROOT}/mpif90"
+export MPIRUN="${MPI_ROOT}/mpirun"
+
+export BLAS_ROOT="$(spack location -i openblas)"
+export LAPACK_ROOT="${BLAS_ROOT}"
+
+export CUDA_ROOT=/usr/local/cuda
+export PATH=${CUDA_ROOT}/bin:$PATH
+export OMPI_CXX=/opt/src/Trilinos/packages/kokkos/bin/nvcc_wrapper
+export LD_LIBRARY_PATH=${CUDA_ROOT}/lib64:$LD_LIBRARY_PATH
+export CUDA_LAUNCH_BLOCKING=1
+ENABLE_CUDA=ON
+
+cmake -G "${CMAKE_GENERATOR:-Ninja}" \
+  -D CMAKE_BUILD_TYPE=DEBUG \
+  -D Trilinos_ENABLE_DEBUG=ON \
+  -D Trilinos_PARALLEL_LINK_JOBS_LIMIT=2 \
+  -D Trilinos_ENABLE_ALL_PACKAGES=ON \
+  -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES=ON \
+  -D Trilinos_ALLOW_NO_PACKAGES=ON \
+  -D Trilinos_DISABLE_ENABLED_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_IGNORE_MISSING_EXTRA_REPOSITORIES=ON \
+  -D Trilinos_ENABLE_TESTS=ON \
+  -D Trilinos_TEST_CATEGORIES=BASIC \
+  -D Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_VERBOSE_CONFIGURE=ON \
+  -D BUILD_SHARED_LIBS=ON \
+  \
+  -D Trilinos_WARNINGS_AS_ERRORS_FLAGS="-Wno-error" \
+  -D Trilinos_ENABLE_SEACAS=OFF \
+  -D Trilinos_ENABLE_Sacado=OFF \
+  \
+  -D TPL_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Tpetra_INST_SERIAL=ON \
+  -D Tpetra_INST_CUDA=ON \
+  -D Trilinos_ENABLE_Kokkos=ON \
+  -D Kokkos_ARCH_AMPERE86=ON \
+  -D Kokkos_ENABLE_OPENMP=OFF \
+  -D Kokkos_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_LAMBDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_UVM=OFF \
+  \
+  -D TPL_ENABLE_CUSOLVER=ON \
+  -D TPL_ENABLE_CUSPARSE=ON \
+  \
+  -D TPL_ENABLE_BLAS=ON \
+  -D TPL_BLAS_LIBRARIES="${BLAS_ROOT}/lib/libopenblas.so" \
+  -D TPL_ENABLE_LAPACK=ON \
+  -D TPL_LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/libopenblas.so" \
+  \
+  -D TPL_ENABLE_Matio=OFF \
+  -D TPL_ENABLE_X11=OFF \
+  -D TPL_ENABLE_Pthread=OFF \
+  -D TPL_ENABLE_Boost=OFF \
+  -D TPL_ENABLE_BoostLib=OFF \
+  -D TPL_ENABLE_ParMETIS=OFF \
+  -D TPL_ENABLE_Zlib=OFF \
+  -D TPL_ENABLE_HDF5=OFF \
+  -D TPL_ENABLE_Netcdf=OFF \
+  -D TPL_ENABLE_SuperLU=OFF \
+  -D TPL_ENABLE_Scotch=OFF \
+  \
+  -D CMAKE_C_COMPILER=${MPICC} \
+  -D CMAKE_CXX_COMPILER=${MPICXX} \
+  -D CMAKE_Fortran_COMPILER=${MPIF90} \
+  -D TPL_ENABLE_MPI=ON \
+  -D MPI_BIN_DIR=${MPIRUN} \
+  -D MPI_EXEC=${MPIRUN} \
+  \
+  -D Trilinos_ENABLE_Rythmos=OFF \
+  -D Trilinos_ENABLE_Pike=OFF \
+  -D Trilinos_ENABLE_Komplex=OFF \
+  -D Trilinos_ENABLE_TriKota=OFF \
+  -D Trilinos_ENABLE_Moertel=OFF \
+  -D Trilinos_ENABLE_Domi=OFF \
+  -D Trilinos_ENABLE_FEI=OFF \
+  \
+  -D Trilinos_ENABLE_PyTrilinos=OFF \
+  \
+  -D Trilinos_ENABLE_Epetra=OFF \
+  -S /opt/src/Trilinos -B /opt/build/Trilinos
+ninja -j 4
diff --git a/nga-ci/build-gpu-epetraON.sh b/nga-ci/build-gpu-epetraON.sh
new file mode 100644
index 000000000000..e65e0019d68c
--- /dev/null
+++ b/nga-ci/build-gpu-epetraON.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+
+export MPI_ROOT="$(dirname $(which mpicc))"
+export MPICC="${MPI_ROOT}/mpicc"
+export MPICXX="${MPI_ROOT}/mpicxx"
+export MPIF90="${MPI_ROOT}/mpif90"
+export MPIRUN="${MPI_ROOT}/mpirun"
+
+export BLAS_ROOT="$(spack location -i openblas)"
+export LAPACK_ROOT="${BLAS_ROOT}"
+
+export CUDA_ROOT=/usr/local/cuda
+export PATH=${CUDA_ROOT}/bin:$PATH
+export OMPI_CXX=/opt/src/Trilinos/packages/kokkos/bin/nvcc_wrapper
+export LD_LIBRARY_PATH=${CUDA_ROOT}/lib64:$LD_LIBRARY_PATH
+export CUDA_LAUNCH_BLOCKING=1
+ENABLE_CUDA=ON
+
+cmake -G "${CMAKE_GENERATOR:-Ninja}" \
+  -D CMAKE_BUILD_TYPE=DEBUG \
+  -D Trilinos_ENABLE_DEBUG=ON \
+  -D Trilinos_PARALLEL_LINK_JOBS_LIMIT=2 \
+  -D Trilinos_ENABLE_ALL_PACKAGES=ON \
+  -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES=ON \
+  -D Trilinos_ALLOW_NO_PACKAGES=ON \
+  -D Trilinos_DISABLE_ENABLED_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_IGNORE_MISSING_EXTRA_REPOSITORIES=ON \
+  -D Trilinos_ENABLE_TESTS=ON \
+  -D Trilinos_TEST_CATEGORIES=BASIC \
+  -D Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_VERBOSE_CONFIGURE=ON \
+  -D BUILD_SHARED_LIBS=ON \
+  \
+  -D Trilinos_WARNINGS_AS_ERRORS_FLAGS="-Wno-error" \
+  -D Trilinos_ENABLE_SEACAS=OFF \
+  -D Trilinos_ENABLE_Sacado=OFF \
+  \
+  -D TPL_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Tpetra_INST_SERIAL=ON \
+  -D Tpetra_INST_CUDA=ON \
+  -D Trilinos_ENABLE_Kokkos=ON \
+  -D Kokkos_ARCH_AMPERE86=ON \
+  -D Kokkos_ENABLE_OPENMP=OFF \
+  -D Kokkos_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_LAMBDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_UVM=OFF \
+  \
+  -D TPL_ENABLE_CUSOLVER=ON \
+  -D TPL_ENABLE_CUSPARSE=ON \
+  \
+  -D TPL_ENABLE_BLAS=ON \
+  -D TPL_BLAS_LIBRARIES="${BLAS_ROOT}/lib/libopenblas.so" \
+  -D TPL_ENABLE_LAPACK=ON \
+  -D TPL_LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/libopenblas.so" \
+  \
+  -D TPL_ENABLE_Matio=OFF \
+  -D TPL_ENABLE_X11=OFF \
+  -D TPL_ENABLE_Pthread=OFF \
+  -D TPL_ENABLE_Boost=OFF \
+  -D TPL_ENABLE_BoostLib=OFF \
+  -D TPL_ENABLE_ParMETIS=OFF \
+  -D TPL_ENABLE_Zlib=OFF \
+  -D TPL_ENABLE_HDF5=OFF \
+  -D TPL_ENABLE_Netcdf=OFF \
+  -D TPL_ENABLE_SuperLU=OFF \
+  -D TPL_ENABLE_Scotch=OFF \
+  \
+  -D CMAKE_C_COMPILER=${MPICC} \
+  -D CMAKE_CXX_COMPILER=${MPICXX} \
+  -D CMAKE_Fortran_COMPILER=${MPIF90} \
+  -D TPL_ENABLE_MPI=ON \
+  -D MPI_BIN_DIR=${MPIRUN} \
+  -D MPI_EXEC=${MPIRUN} \
+  \
+  -D Trilinos_ENABLE_Rythmos=OFF \
+  -D Trilinos_ENABLE_Pike=OFF \
+  -D Trilinos_ENABLE_Komplex=OFF \
+  -D Trilinos_ENABLE_TriKota=OFF \
+  -D Trilinos_ENABLE_Moertel=OFF \
+  -D Trilinos_ENABLE_Domi=OFF \
+  -D Trilinos_ENABLE_FEI=OFF \
+  \
+  -D Trilinos_ENABLE_PyTrilinos=OFF \
+  \
+  -D Trilinos_ENABLE_Epetra=ON \
+  -S /opt/src/Trilinos -B /opt/build/Trilinos
+ninja -j 4
diff --git a/nga-ci/gpu-epetraOFF.dockerfile b/nga-ci/gpu-epetraOFF.dockerfile
new file mode 100644
index 000000000000..2f54ce79a6d5
--- /dev/null
+++ b/nga-ci/gpu-epetraOFF.dockerfile
@@ -0,0 +1,11 @@
+# Choose a base image
+FROM calebschilly/trilinos-deps:main AS build-stage
+
+COPY . /opt/src/Trilinos
+RUN mkdir -p /opt/build/Trilinos
+
+# Build using the spack environment we created
+RUN bash /opt/src/Trilinos/nga-ci/build-gpu-epetraOFF.sh
+
+# For running later
+RUN chmod +x /opt/src/Trilinos/nga-ci/test-gpu.sh
diff --git a/nga-ci/gpu-epetraON.dockerfile b/nga-ci/gpu-epetraON.dockerfile
new file mode 100644
index 000000000000..a8d7a1969023
--- /dev/null
+++ b/nga-ci/gpu-epetraON.dockerfile
@@ -0,0 +1,11 @@
+# Choose a base image
+FROM calebschilly/trilinos-deps:main AS build-stage
+
+COPY . /opt/src/Trilinos
+RUN mkdir -p /opt/build/Trilinos
+
+# Build using the spack environment we created
+RUN bash /opt/src/Trilinos/nga-ci/build-gpu-epetraON.sh
+
+# For running later
+RUN chmod +x /opt/src/Trilinos/nga-ci/test-gpu.sh
diff --git a/nga-ci/test-gpu.sh b/nga-ci/test-gpu.sh
new file mode 100644
index 000000000000..cadb5e6d1556
--- /dev/null
+++ b/nga-ci/test-gpu.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+ret_code=0
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+# path to the artifacts
+artifacts_dir=/tmp/artifacts
+
+ctest -j 5 --output-junit junit-tests-report.xml --output-on-failure || ret_code=$?
+# We collect the test logs for exporting
+echo "ctest returned: $ret_code"
+mkdir -p ${artifacts_dir}
+cp /opt/build/Trilinos/junit-tests-report.xml ${artifacts_dir}
+cp /opt/build/Trilinos/Testing/Temporary/LastTest.log ${artifacts_dir}
+echo ${ret_code} > ${artifacts_dir}/success_flag.txt
+ls ${artifacts_dir}
diff --git a/nga-ci/test-mpi.sh b/nga-ci/test-mpi.sh
new file mode 100644
index 000000000000..f4bab2adcfec
--- /dev/null
+++ b/nga-ci/test-mpi.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+ret_code=0
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+# path to the artifacts
+artifacts_dir=/tmp/artifacts
+
+ctest -j 14 --output-junit junit-tests-report.xml --output-on-failure || ret_code=$?
+# We collect the test logs for exporting
+echo "ctest returned: $ret_code"
+mkdir -p ${artifacts_dir}
+cp /opt/build/Trilinos/junit-tests-report.xml ${artifacts_dir}
+cp /opt/build/Trilinos/Testing/Temporary/LastTest.log ${artifacts_dir}
+echo ${ret_code} > ${artifacts_dir}/success_flag.txt
+ls ${artifacts_dir}
diff --git a/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp b/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
index 91161f5c29da..7ca0811ce68a 100644
--- a/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
+++ b/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
@@ -60,7 +60,7 @@ void initialize(int& narg, char* arg[]) {
   fprintf(stderr, "Kokkos::initialize()\n");
 
   o_init(narg, arg);
-}
+}
 
 void finalize() {
   void (*o_finalize)(void);
@@ -82,15 +82,15 @@ __host__ __device__ cudaError_t cudaDeviceSynchronize() {
 
   return o_cudaDeviceSynchronize();
 }
 
+#ifndef __CUDA_ARCH__
 //Copies data between host and device. Don't care about __device__ calls, so count only if from host.
 __host__ __device__ cudaError_t cudaMemcpy2DAsync ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream) {
   cudaError_t (*o_cudaMemcpy2DAsync) (void*, size_t, const void*, size_t, size_t, size_t, cudaMemcpyKind, cudaStream_t);
   o_cudaMemcpy2DAsync = (cudaError_t (*)(void*, size_t, const void*, size_t, size_t, size_t, cudaMemcpyKind, cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpy2DAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpy2DAsync");
-#endif
+
   return o_cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
 }
 
@@ -98,11 +98,10 @@ __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p,
 __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p, cudaStream_t stream ) {
   cudaError_t (*o_cudaMemcpy3DAsync) ( const cudaMemcpy3DParms* , cudaStream_t );
   o_cudaMemcpy3DAsync = (cudaError_t (*)(const cudaMemcpy3DParms* , cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpy3DAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpy3DAsync");
-#endif
+
   return o_cudaMemcpy3DAsync(p, stream);
 }
 
 
@@ -110,17 +109,18 @@ __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p,
 __host__ __device__ cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) {
   cudaError_t (*o_cudaMemcpyAsync) ( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t );
   o_cudaMemcpyAsync = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpyAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpyAsync");
-#endif
+
   return o_cudaMemcpyAsync(dst, src, count, kind, stream);
 }
 
+#endif // __CUDA_ARCH__
+
 //Copies data to the given symbol on the device.
 __host__ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
-  cudaError_t (*o_cudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind);
+  cudaError_t (*o_cudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind);
   o_cudaMemcpy = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind))dlsym(RTLD_NEXT, "cudaMemcpy");
   ApiTest *ctr = ApiTest::getInstance();
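
Reproducing the GPU jobs locally: the two workflows do nothing beyond a docker
build followed by a docker run, so the same commands work on any host with
Docker and the NVIDIA container toolkit installed. This is only a sketch; the
image tag "trilinos-gpu-local" is illustrative and not part of this patch.

  # Build the image; the build stage copies the checked-out Trilinos tree and
  # runs nga-ci/build-gpu-epetraOFF.sh inside it.
  docker build -t trilinos-gpu-local -f ./nga-ci/gpu-epetraOFF.dockerfile .

  # Run the CTest suite with GPU access; the JUnit report and CTest logs are
  # written to the mounted /tmp/artifacts directory, as in the CI job.
  mkdir -p /tmp/artifacts
  docker run -v /tmp/artifacts:/tmp/artifacts --gpus all trilinos-gpu-local \
    /opt/src/Trilinos/nga-ci/test-gpu.sh

The Epetra=ON variant is identical apart from using gpu-epetraON.dockerfile.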