From afd6f62d53b6c154701888d64187859b62e5bf6b Mon Sep 17 00:00:00 2001
From: cwschilly <132086024+cwschilly@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:20:49 -0400
Subject: [PATCH] * #128: set up GPU CI pipelines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #128: specify cuda dir
* #128: temporarily run GPU on every push; stop MPI builds
* #128: specify correct dockerfile
* #128: provide correct build/source dirs
* #128: rework MPI in build script
* #128: rework MPI in other gpu build script
* #128: CI build script try another configuration and fix invalid path
* #128: fix missing letter in path for a gpu build
* #128: add newlines to end of files
* #128: add spack find -p to find cuda root
* #128: only run one pipeline; add cuda paths
* #128: add kokkos variables
* #128: add Tpetra_INST_SERIAL:BOOL=ON
* #128: add CUDA root flag
* #128: use correct kokkos architecture
* #128: enable cusolver and cusparse
* #128: emulate local build
* #128: try with different docker image
* #128: update cuda path
* Try adding debug flag for Buildx
* Tpetra: Disable cudaMemcpyAsync for Intercept.cpp
* #128: lower -j
* #128: use default kokkos architecture
* #128: use fewer processes for GPU testing
* 128: re-enable Kokkos_ARCH_AMPERE86
* #128: add cuda sample build to CI to validate CUDA
* #128: run cuda test on NGA host
* #128: update jobs dependency on CI for cuda
* #128: add CUDA sample run
* #128: remove command not existing in CI
* #128: change cuda path
* #128: try to display information about driver
* #128: change bad command in CI
* #128: fix command
* #128: try install nvidia util in Docker container
* #128: remove commands
* #128: fix cuda path
* #128: fix dockerfile
* #128: add different cuda test images
* Run container in separate step
* Remove unneeded code
* Apply changes to Epetra=OFF
* Try to build and run docker within same step
* #128: remove unused old CI files
* #128: check both gpu pipelines
* #128: Tpetra_INST_SERIAL=ON
* #128: fix workflow name
* #128: rework with cuda 11.4 dockerfile
* #168: try to simplify CI shell script
* #128: try simplify shell script
* #128: remove libraries path for blas and lapack to check if resolved
* #128: try remove Lapack and blas lib paths from cmake call
* #128: try again changing path dynamically
* #128: fix another path
* #128: fix blas path
* #128: apply working configuration to other build scripts
* #128: restore triggering workflows on PR
* #128: disable GPU build job for PR having `EpetraMPI T1` label
* #128: enable GPU build only with EpetraMPI T2 and EpetraMPI T3 labels
* #128: upload test log
* #128: fix typo
* #128: fix artifacts
* #128: add junit report for tests
* #128: add junit reporting in CI and set
* #128: fix artifact name
* #128: fix artifacts missing
* #128: fix extra slash char in path
* #128: fix artifacts path
* #128: fix path in github action
* #128: try mounting artifacts folder into the host runner
* #128: use same logic for gpu or non-gpu pipelines
* 128: Finalize pipelines (GPU on push, MPI cancellations)
* 128: remove label requirements
* Revert "Tpetra: Disable cudaMemcpyAsync for Intercept.cpp"
  This reverts commit 5db2d5d5b18f722f79e75736399048c1c3592b31.
* #128: test intercept reversion
* Revert "Revert "Tpetra: Disable cudaMemcpyAsync for Intercept.cpp""
  This reverts commit de87a2248864cb8a2eaf03cba12b836e0bd4db7f.
* #128: fix underscore
* #128: run GPU pipeline on merge to fy23 develop

---------

Co-authored-by: Thomas Dutheillet-Lamonthézie
Co-authored-by: Jacob Domagala
---
 .github/workflows/ci-gpu-epetraOFF.yml        | 55 +++++++++++
 .github/workflows/ci-gpu-epetraON.yml         | 55 +++++++++++
 nga-ci/build-gpu-epetraOFF.sh                 | 95 +++++++++++++++++++
 nga-ci/build-gpu-epetraON.sh                  | 95 +++++++++++++++++++
 nga-ci/gpu-epetraOFF.dockerfile               | 11 +++
 nga-ci/gpu-epetraON.dockerfile                | 11 +++
 nga-ci/test-gpu.sh                            | 25 +++++
 nga-ci/test-mpi.sh                            | 25 +++++
 .../core/test/KokkosIntegration/Intercept.cpp | 22 ++---
 9 files changed, 383 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/ci-gpu-epetraOFF.yml
 create mode 100644 .github/workflows/ci-gpu-epetraON.yml
 create mode 100644 nga-ci/build-gpu-epetraOFF.sh
 create mode 100644 nga-ci/build-gpu-epetraON.sh
 create mode 100644 nga-ci/gpu-epetraOFF.dockerfile
 create mode 100644 nga-ci/gpu-epetraON.dockerfile
 create mode 100644 nga-ci/test-gpu.sh
 create mode 100644 nga-ci/test-mpi.sh

diff --git a/.github/workflows/ci-gpu-epetraOFF.yml b/.github/workflows/ci-gpu-epetraOFF.yml
new file mode 100644
index 000000000000..044be0b4b512
--- /dev/null
+++ b/.github/workflows/ci-gpu-epetraOFF.yml
@@ -0,0 +1,55 @@
+name: GPU-EpetraOFF
+
+# Trigger the workflow on merge to NGA-FY23-develop
+on:
+  push:
+    branches:
+      - NGA-FY23-develop
+  workflow_dispatch:
+
+# Cancel any existing jobs
+concurrency:
+  group: ${{ github.event.repository.name }}-${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  build:
+    runs-on: self-hosted
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v3
+      - name: CI Variables
+        run: echo "DOCKER_TAG=$(echo ${{ github.ref }} | cut -d'/' -f3- | sed 's/[^a-z0-9_-]/__/gi')" >> $GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          buildkitd-flags: --debug
+      - name: Inspect Builder
+        run: |
+          echo "Name: ${{ steps.buildx.outputs.name }}"
+          echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
+          echo "Status: ${{ steps.buildx.outputs.status }}"
+          echo "Flags: ${{ steps.buildx.outputs.flags }}"
+          echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
+          echo "DOCKER_TAG: ${{ env.DOCKER_TAG }}"
+      - name: Build and Run Docker Image
+        run: |
+          docker build -t ${{ env.DOCKER_TAG }} -f ./nga-ci/gpu-epetraOFF.dockerfile .
+          docker run -v /tmp/artifacts:/tmp/artifacts --gpus all ${{ env.DOCKER_TAG }} /opt/src/Trilinos/nga-ci/test-gpu.sh
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: Artifacts
+          path: /tmp/artifacts/*
+          if-no-files-found: ignore
+      - name: Report Test results
+        uses: phoenix-actions/test-reporting@v12
+        if: success() || failure()
+        with:
+          name: Tests report (GPU-EpetraOFF)
+          path: /tmp/artifacts/junit-tests-report.xml
+          reporter: java-junit
+          output-to: step-summary
+          fail-on-error: 'true'
diff --git a/.github/workflows/ci-gpu-epetraON.yml b/.github/workflows/ci-gpu-epetraON.yml
new file mode 100644
index 000000000000..943a0d712f21
--- /dev/null
+++ b/.github/workflows/ci-gpu-epetraON.yml
@@ -0,0 +1,55 @@
+name: GPU-EpetraON
+
+# Trigger the workflow on merge to NGA-FY23-develop
+on:
+  push:
+    branches:
+      - NGA-FY23-develop
+  workflow_dispatch:
+
+# Cancel any existing jobs
+concurrency:
+  group: ${{ github.event.repository.name }}-${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  build:
+    runs-on: self-hosted
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v3
+      - name: CI Variables
+        run: echo "DOCKER_TAG=$(echo ${{ github.ref }} | cut -d'/' -f3- | sed 's/[^a-z0-9_-]/__/gi')" >> $GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          buildkitd-flags: --debug
+      - name: Inspect Builder
+        run: |
+          echo "Name: ${{ steps.buildx.outputs.name }}"
+          echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
+          echo "Status: ${{ steps.buildx.outputs.status }}"
+          echo "Flags: ${{ steps.buildx.outputs.flags }}"
+          echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
+          echo "DOCKER_TAG: ${{ env.DOCKER_TAG }}"
+      - name: Build and Run Docker Image
+        run: |
+          docker build -t ${{ env.DOCKER_TAG }} -f ./nga-ci/gpu-epetraON.dockerfile .
+          docker run -v /tmp/artifacts:/tmp/artifacts --gpus all ${{ env.DOCKER_TAG }} /opt/src/Trilinos/nga-ci/test-gpu.sh
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: Artifacts
+          path: /tmp/artifacts/*
+          if-no-files-found: ignore
+      - name: Report Test results
+        uses: phoenix-actions/test-reporting@v12
+        if: success() || failure()
+        with:
+          name: Tests report (GPU-EpetraON)
+          path: /tmp/artifacts/junit-tests-report.xml
+          reporter: java-junit
+          output-to: step-summary
+          fail-on-error: 'true'
diff --git a/nga-ci/build-gpu-epetraOFF.sh b/nga-ci/build-gpu-epetraOFF.sh
new file mode 100644
index 000000000000..cf26f8b5c5f1
--- /dev/null
+++ b/nga-ci/build-gpu-epetraOFF.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+
+export MPI_ROOT="$(dirname $(which mpicc))"
+export MPICC="${MPI_ROOT}/mpicc"
+export MPICXX="${MPI_ROOT}/mpicxx"
+export MPIF90="${MPI_ROOT}/mpif90"
+export MPIRUN="${MPI_ROOT}/mpirun"
+
+export BLAS_ROOT="$(spack location -i openblas)"
+export LAPACK_ROOT="${BLAS_ROOT}"
+
+export CUDA_ROOT=/usr/local/cuda
+export PATH=${CUDA_ROOT}/bin:$PATH
+export OMPI_CXX=/opt/src/Trilinos/packages/kokkos/bin/nvcc_wrapper
+export LD_LIBRARY_PATH=${CUDA_ROOT}/lib64:$LD_LIBRARY_PATH
+export CUDA_LAUNCH_BLOCKING=1
+ENABLE_CUDA=ON
+
+cmake -G "${CMAKE_GENERATOR:-Ninja}" \
+  -D CMAKE_BUILD_TYPE=DEBUG \
+  -D Trilinos_ENABLE_DEBUG=ON \
+  -D Trilinos_PARALLEL_LINK_JOBS_LIMIT=2 \
+  -D Trilinos_ENABLE_ALL_PACKAGES=ON \
+  -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES=ON \
+  -D Trilinos_ALLOW_NO_PACKAGES=ON \
+  -D Trilinos_DISABLE_ENABLED_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_IGNORE_MISSING_EXTRA_REPOSITORIES=ON \
+  -D Trilinos_ENABLE_TESTS=ON \
+  -D Trilinos_TEST_CATEGORIES=BASIC \
+  -D Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_VERBOSE_CONFIGURE=ON \
+  -D BUILD_SHARED_LIBS=ON \
+  \
+  -D Trilinos_WARNINGS_AS_ERRORS_FLAGS="-Wno-error" \
+  -D Trilinos_ENABLE_SEACAS=OFF \
+  -D Trilinos_ENABLE_Sacado=OFF \
+  \
+  -D TPL_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Tpetra_INST_SERIAL=ON \
+  -D Tpetra_INST_CUDA=ON \
+  -D Trilinos_ENABLE_Kokkos=ON \
+  -D Kokkos_ARCH_AMPERE86=ON \
+  -D Kokkos_ENABLE_OPENMP=OFF \
+  -D Kokkos_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_LAMBDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_UVM=OFF \
+  \
+  -D TPL_ENABLE_CUSOLVER=ON \
+  -D TPL_ENABLE_CUSPARSE=ON \
+  \
+  -D TPL_ENABLE_BLAS=ON \
+  -D TPL_BLAS_LIBRARIES="${BLAS_ROOT}/lib/libopenblas.so" \
+  -D TPL_ENABLE_LAPACK=ON \
+  -D TPL_LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/libopenblas.so" \
+  \
+  -D TPL_ENABLE_Matio=OFF \
+  -D TPL_ENABLE_X11=OFF \
+  -D TPL_ENABLE_Pthread=OFF \
+  -D TPL_ENABLE_Boost=OFF \
+  -D TPL_ENABLE_BoostLib=OFF \
+  -D TPL_ENABLE_ParMETIS=OFF \
+  -D TPL_ENABLE_Zlib=OFF \
+  -D TPL_ENABLE_HDF5=OFF \
+  -D TPL_ENABLE_Netcdf=OFF \
+  -D TPL_ENABLE_SuperLU=OFF \
+  -D TPL_ENABLE_Scotch=OFF \
+  \
+  -D CMAKE_C_COMPILER=${MPICC} \
+  -D CMAKE_CXX_COMPILER=${MPICXX} \
+  -D CMAKE_Fortran_COMPILER=${MPIF90} \
+  -D TPL_ENABLE_MPI=ON \
+  -D MPI_BIN_DIR=${MPIRUN} \
+  -D MPI_EXEC=${MPIRUN} \
+  \
+  -D Trilinos_ENABLE_Rythmos=OFF \
+  -D Trilinos_ENABLE_Pike=OFF \
+  -D Trilinos_ENABLE_Komplex=OFF \
+  -D Trilinos_ENABLE_TriKota=OFF \
+  -D Trilinos_ENABLE_Moertel=OFF \
+  -D Trilinos_ENABLE_Domi=OFF \
+  -D Trilinos_ENABLE_FEI=OFF \
+  \
+  -D Trilinos_ENABLE_PyTrilinos=OFF \
+  \
+  -D Trilinos_ENABLE_Epetra=OFF \
+  -S /opt/src/Trilinos -B /opt/build/Trilinos
+ninja -j 4
diff --git a/nga-ci/build-gpu-epetraON.sh b/nga-ci/build-gpu-epetraON.sh
new file mode 100644
index 000000000000..e65e0019d68c
--- /dev/null
+++ b/nga-ci/build-gpu-epetraON.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+
+export MPI_ROOT="$(dirname $(which mpicc))"
+export MPICC="${MPI_ROOT}/mpicc"
+export MPICXX="${MPI_ROOT}/mpicxx"
+export MPIF90="${MPI_ROOT}/mpif90"
+export MPIRUN="${MPI_ROOT}/mpirun"
+
+export BLAS_ROOT="$(spack location -i openblas)"
+export LAPACK_ROOT="${BLAS_ROOT}"
+
+export CUDA_ROOT=/usr/local/cuda
+export PATH=${CUDA_ROOT}/bin:$PATH
+export OMPI_CXX=/opt/src/Trilinos/packages/kokkos/bin/nvcc_wrapper
+export LD_LIBRARY_PATH=${CUDA_ROOT}/lib64:$LD_LIBRARY_PATH
+export CUDA_LAUNCH_BLOCKING=1
+ENABLE_CUDA=ON
+
+cmake -G "${CMAKE_GENERATOR:-Ninja}" \
+  -D CMAKE_BUILD_TYPE=DEBUG \
+  -D Trilinos_ENABLE_DEBUG=ON \
+  -D Trilinos_PARALLEL_LINK_JOBS_LIMIT=2 \
+  -D Trilinos_ENABLE_ALL_PACKAGES=ON \
+  -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES=ON \
+  -D Trilinos_ALLOW_NO_PACKAGES=ON \
+  -D Trilinos_DISABLE_ENABLED_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_IGNORE_MISSING_EXTRA_REPOSITORIES=ON \
+  -D Trilinos_ENABLE_TESTS=ON \
+  -D Trilinos_TEST_CATEGORIES=BASIC \
+  -D Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=ON \
+  -D Trilinos_VERBOSE_CONFIGURE=ON \
+  -D BUILD_SHARED_LIBS=ON \
+  \
+  -D Trilinos_WARNINGS_AS_ERRORS_FLAGS="-Wno-error" \
+  -D Trilinos_ENABLE_SEACAS=OFF \
+  -D Trilinos_ENABLE_Sacado=OFF \
+  \
+  -D TPL_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Tpetra_INST_SERIAL=ON \
+  -D Tpetra_INST_CUDA=ON \
+  -D Trilinos_ENABLE_Kokkos=ON \
+  -D Kokkos_ARCH_AMPERE86=ON \
+  -D Kokkos_ENABLE_OPENMP=OFF \
+  -D Kokkos_ENABLE_CUDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_LAMBDA="${ENABLE_CUDA}" \
+  -D Kokkos_ENABLE_CUDA_UVM=OFF \
+  \
+  -D TPL_ENABLE_CUSOLVER=ON \
+  -D TPL_ENABLE_CUSPARSE=ON \
+  \
+  -D TPL_ENABLE_BLAS=ON \
+  -D TPL_BLAS_LIBRARIES="${BLAS_ROOT}/lib/libopenblas.so" \
+  -D TPL_ENABLE_LAPACK=ON \
+  -D TPL_LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/libopenblas.so" \
+  \
+  -D TPL_ENABLE_Matio=OFF \
+  -D TPL_ENABLE_X11=OFF \
+  -D TPL_ENABLE_Pthread=OFF \
+  -D TPL_ENABLE_Boost=OFF \
+  -D TPL_ENABLE_BoostLib=OFF \
+  -D TPL_ENABLE_ParMETIS=OFF \
+  -D TPL_ENABLE_Zlib=OFF \
+  -D TPL_ENABLE_HDF5=OFF \
+  -D TPL_ENABLE_Netcdf=OFF \
+  -D TPL_ENABLE_SuperLU=OFF \
+  -D TPL_ENABLE_Scotch=OFF \
+  \
+  -D CMAKE_C_COMPILER=${MPICC} \
+  -D CMAKE_CXX_COMPILER=${MPICXX} \
+  -D CMAKE_Fortran_COMPILER=${MPIF90} \
+  -D TPL_ENABLE_MPI=ON \
+  -D MPI_BIN_DIR=${MPIRUN} \
+  -D MPI_EXEC=${MPIRUN} \
+  \
+  -D Trilinos_ENABLE_Rythmos=OFF \
+  -D Trilinos_ENABLE_Pike=OFF \
+  -D Trilinos_ENABLE_Komplex=OFF \
+  -D Trilinos_ENABLE_TriKota=OFF \
+  -D Trilinos_ENABLE_Moertel=OFF \
+  -D Trilinos_ENABLE_Domi=OFF \
+  -D Trilinos_ENABLE_FEI=OFF \
+  \
+  -D Trilinos_ENABLE_PyTrilinos=OFF \
+  \
+  -D Trilinos_ENABLE_Epetra=ON \
+  -S /opt/src/Trilinos -B /opt/build/Trilinos
+ninja -j 4
diff --git a/nga-ci/gpu-epetraOFF.dockerfile b/nga-ci/gpu-epetraOFF.dockerfile
new file mode 100644
index 000000000000..2f54ce79a6d5
--- /dev/null
+++ b/nga-ci/gpu-epetraOFF.dockerfile
@@ -0,0 +1,11 @@
+# Choose a base image
+FROM calebschilly/trilinos-deps:main AS build-stage
+
+COPY . /opt/src/Trilinos
+RUN mkdir -p /opt/build/Trilinos
+
+# Build using the spack environment we created
+RUN bash /opt/src/Trilinos/nga-ci/build-gpu-epetraOFF.sh
+
+# For running later
+RUN chmod +x /opt/src/Trilinos/nga-ci/test-gpu.sh
diff --git a/nga-ci/gpu-epetraON.dockerfile b/nga-ci/gpu-epetraON.dockerfile
new file mode 100644
index 000000000000..a8d7a1969023
--- /dev/null
+++ b/nga-ci/gpu-epetraON.dockerfile
@@ -0,0 +1,11 @@
+# Choose a base image
+FROM calebschilly/trilinos-deps:main AS build-stage
+
+COPY . /opt/src/Trilinos
+RUN mkdir -p /opt/build/Trilinos
+
+# Build using the spack environment we created
+RUN bash /opt/src/Trilinos/nga-ci/build-gpu-epetraON.sh
+
+# For running later
+RUN chmod +x /opt/src/Trilinos/nga-ci/test-gpu.sh
diff --git a/nga-ci/test-gpu.sh b/nga-ci/test-gpu.sh
new file mode 100644
index 000000000000..cadb5e6d1556
--- /dev/null
+++ b/nga-ci/test-gpu.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+ret_code=0
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+# path to the artifacts
+artifacts_dir=/tmp/artifacts
+
+ctest -j 5 --output-junit junit-tests-report.xml --output-on-failure || ret_code=$?
+# We collect the test logs for exporting
+echo "ctest returned: $ret_code"
+mkdir -p ${artifacts_dir}
+cp /opt/build/Trilinos/junit-tests-report.xml ${artifacts_dir}
+cp /opt/build/Trilinos/Testing/Temporary/LastTest.log ${artifacts_dir}
+echo ${ret_code} > ${artifacts_dir}/success_flag.txt
+ls ${artifacts_dir}
diff --git a/nga-ci/test-mpi.sh b/nga-ci/test-mpi.sh
new file mode 100644
index 000000000000..f4bab2adcfec
--- /dev/null
+++ b/nga-ci/test-mpi.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+. /opt/spack/share/spack/setup-env.sh
+spack env activate trilinos
+
+cd /opt/build/Trilinos
+ret_code=0
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+# path to the artifacts
+artifacts_dir=/tmp/artifacts
+
+ctest -j 14 --output-junit junit-tests-report.xml --output-on-failure || ret_code=$?
+# We collect the test logs for exporting
+echo "ctest returned: $ret_code"
+mkdir -p ${artifacts_dir}
+cp /opt/build/Trilinos/junit-tests-report.xml ${artifacts_dir}
+cp /opt/build/Trilinos/Testing/Temporary/LastTest.log ${artifacts_dir}
+echo ${ret_code} > ${artifacts_dir}/success_flag.txt
+ls ${artifacts_dir}
diff --git a/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp b/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
index 91161f5c29da..7ca0811ce68a 100644
--- a/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
+++ b/packages/tpetra/core/test/KokkosIntegration/Intercept.cpp
@@ -60,7 +60,7 @@ void initialize(int& narg, char* arg[]) {
   fprintf(stderr, "Kokkos::initialize()\n");
 
   o_init(narg, arg);
-}
+}
 
 void finalize() {
   void (*o_finalize)(void);
@@ -82,15 +82,15 @@ __host__ __device__ cudaError_t cudaDeviceSynchronize() {
 
   return o_cudaDeviceSynchronize();
 }
 
+#ifndef __CUDA_ARCH__
 //Copies data between host and device. Don't care about __device__ calls, so count only if from host.
 __host__ __device__ cudaError_t cudaMemcpy2DAsync ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream) {
   cudaError_t (*o_cudaMemcpy2DAsync) (void*, size_t, const void*, size_t, size_t, size_t, cudaMemcpyKind, cudaStream_t);
   o_cudaMemcpy2DAsync = (cudaError_t (*)(void*, size_t, const void*, size_t, size_t, size_t, cudaMemcpyKind, cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpy2DAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpy2DAsync");
-#endif
+
   return o_cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
 }
 
@@ -98,11 +98,10 @@ __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p,
 __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p, cudaStream_t stream ) {
   cudaError_t (*o_cudaMemcpy3DAsync) ( const cudaMemcpy3DParms* , cudaStream_t );
   o_cudaMemcpy3DAsync = (cudaError_t (*)(const cudaMemcpy3DParms* , cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpy3DAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpy3DAsync");
-#endif
+
   return o_cudaMemcpy3DAsync(p, stream);
 }
 
 
@@ -110,17 +109,18 @@ __host__ __device__ cudaError_t cudaMemcpy3DAsync ( const cudaMemcpy3DParms* p,
 __host__ __device__ cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) {
   cudaError_t (*o_cudaMemcpyAsync) ( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t );
   o_cudaMemcpyAsync = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpyAsync");
-#ifndef __CUDA_ARCH__
-  ApiTest *ctr = ApiTest::getInstance();
+  ApiTest *ctr = ApiTest::getInstance();
   ctr->incr("cudaMemcpyAsync");
-#endif
+
   return o_cudaMemcpyAsync(dst, src, count, kind, stream);
 }
 
+#endif // __CUDA_ARCH__
+
 //Copies data to the given symbol on the device.
 __host__ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
-  cudaError_t (*o_cudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind);
+  cudaError_t (*o_cudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind);
   o_cudaMemcpy = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind))dlsym(RTLD_NEXT, "cudaMemcpy");
   ApiTest *ctr = ApiTest::getInstance();
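
Reproducing the GPU jobs locally: the two workflows do nothing beyond a docker
build followed by a docker run, so the same commands work on any host with
Docker and the NVIDIA container toolkit installed. This is only a sketch; the
image tag "trilinos-gpu-local" is illustrative and not part of this patch.

  # Build the image; the build stage copies the checked-out Trilinos tree and
  # runs nga-ci/build-gpu-epetraOFF.sh inside it.
  docker build -t trilinos-gpu-local -f ./nga-ci/gpu-epetraOFF.dockerfile .

  # Run the CTest suite with GPU access; the JUnit report and CTest logs are
  # written to the mounted /tmp/artifacts directory, as in the CI job.
  mkdir -p /tmp/artifacts
  docker run -v /tmp/artifacts:/tmp/artifacts --gpus all trilinos-gpu-local \
    /opt/src/Trilinos/nga-ci/test-gpu.sh

The Epetra=ON variant is identical apart from using gpu-epetraON.dockerfile.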