diff --git a/src/force/dftd3.cu b/src/force/dftd3.cu
index a25033a7c..ac0148fd2 100644
--- a/src/force/dftd3.cu
+++ b/src/force/dftd3.cu
@@ -32,6 +32,7 @@ J. Comput. Chem., 32, 1456 (2011).
 #include "model/box.cuh"
 #include "neighbor.cuh"
 #include "utilities/common.cuh"
+#include "utilities/gpu_macro.cuh"
 #include
 #include
 #include
@@ -947,7 +948,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 3,
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5);
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_coordination_number_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -959,7 +960,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   add_dftd3_force_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -979,7 +980,7 @@ void DFTD3::compute_small_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   add_dftd3_force_extra_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -996,7 +997,7 @@ void DFTD3::compute_small_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }

 void DFTD3::compute_large_box(
@@ -1058,7 +1059,7 @@ void DFTD3::compute_large_box(
     position_per_atom.data() + N,
     position_per_atom.data() + N * 2,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_force_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1084,7 +1085,7 @@ void DFTD3::compute_large_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_force_extra_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1107,7 +1108,7 @@ void DFTD3::compute_large_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }

 void DFTD3::compute(
diff --git a/src/force/dftd3para.cuh b/src/force/dftd3para.cuh
index 6d00112d7..6407cb30d 100644
--- a/src/force/dftd3para.cuh
+++ b/src/force/dftd3para.cuh
@@ -15,6 +15,8 @@

 #pragma once

+#include "utilities/gpu_macro.cuh"
+
 namespace
 {
 #define Bohr 0.5291772575069165f
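Every translation unit in this patch now pulls in utilities/gpu_macro.cuh, which the patch itself does not show. For orientation, here is a minimal sketch of what such a vendor-neutral macro header provides; the gpu* names are taken from their call sites in this diff, while the guard name, header paths, and exact macro set are assumptions rather than the real file:

// Sketch of a utilities/gpu_macro.cuh-style header (illustrative only):
// one vendor-neutral gpu* spelling, resolved to CUDA or HIP at compile time.
#pragma once

#ifdef USE_HIP
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
typedef hipError_t gpuError_t;
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuGetLastError hipGetLastError
#define gpuSetDevice hipSetDevice
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuStreamCreate hipStreamCreate
#define gpuStreamDestroy hipStreamDestroy
#define gpuMemcpy hipMemcpy
#define gpuMemset hipMemset
#define gpuMemcpyToSymbol hipMemcpyToSymbol
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#else
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
typedef cudaError_t gpuError_t;
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuGetLastError cudaGetLastError
#define gpuSetDevice cudaSetDevice
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuStreamCreate cudaStreamCreate
#define gpuStreamDestroy cudaStreamDestroy
#define gpuMemcpy cudaMemcpy
#define gpuMemset cudaMemset
#define gpuMemcpyToSymbol cudaMemcpyToSymbol
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#endif

With a header of this shape in place, call sites such as CHECK(gpuMemcpy(...)) compile unchanged against either runtime, which is why the rest of this patch is an almost purely mechanical rename.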
diff --git a/src/force/eam.cu b/src/force/eam.cu
index 7a910d654..b5debf445 100644
--- a/src/force/eam.cu
+++ b/src/force/eam.cu
@@ -22,6 +22,7 @@ The EAM potential.
 Currently two analytical versions:
 #include "eam.cuh"
 #include "neighbor.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include

 #define BLOCK_SIZE_FORCE 64

@@ -514,7 +515,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL

     find_force_eam_step2<0><<>>(
       eam2004zhou,
@@ -535,7 +536,7 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }

   if (potential_model == 1) {
@@ -554,7 +555,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL

     find_force_eam_step2<1><<>>(
       eam2004zhou,
@@ -575,6 +576,6 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 }
diff --git a/src/force/fcp.cu b/src/force/fcp.cu
index 4c161183f..50168ed6b 100644
--- a/src/force/fcp.cu
+++ b/src/force/fcp.cu
@@ -19,6 +19,7 @@ The force constant potential (FCP)

 #include "fcp.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include
 #include

@@ -1038,7 +1039,7 @@ void FCP::compute(
     position_per_atom.data() + number_of_atoms * 2,
     fcp_data.r0.data(),
     fcp_data.u.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   fcp_data.pfv.fill(0.0f);

@@ -1125,5 +1126,5 @@ void FCP::compute(
     force_per_atom.data() + 2 * number_of_atoms,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }
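The CHECK(...) wrapper that appears around the gpu* runtime calls, and GPU_CHECK_KERNEL, the portable successor of CUDA_CHECK_KERNEL, are also not defined in this patch (CHECK comes from the existing utilities/error.cuh). A plausible minimal form, assuming the gpuError_t/gpuSuccess/gpuGetErrorString/gpuGetLastError aliases sketched above plus <cstdio> and <cstdlib>, is:

// Illustrative definitions only; the real ones live in utilities/error.cuh
// and gpu_macro.cuh. CHECK aborts with file/line info on a failed call.
#define CHECK(call)                                                     \
  do {                                                                  \
    const gpuError_t error_code = (call);                               \
    if (error_code != gpuSuccess) {                                     \
      fprintf(stderr, "GPU Error: %s:%d: %s\n", __FILE__, __LINE__,     \
              gpuGetErrorString(error_code));                           \
      exit(1);                                                          \
    }                                                                   \
  } while (0)

// A kernel launch itself returns no status; the launch error must be
// fetched afterwards, which is why GPU_CHECK_KERNEL follows every launch.
#define GPU_CHECK_KERNEL CHECK(gpuGetLastError())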
#include "ilp_tmd_sw.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -106,7 +107,7 @@ void Force::parse_potential( strcmp(potential_name, "nep4_temperature") == 0 || strcmp(potential_name, "nep4_zbl_temperature") == 0) { int num_gpus; - CHECK(cudaGetDeviceCount(&num_gpus)); + CHECK(gpuGetDeviceCount(&num_gpus)); #ifdef ZHEYONG num_gpus = 3; #endif @@ -226,7 +227,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* s_f[tid] = f; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; @@ -466,7 +467,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (multiple_potentials_mode_.compare("observe") == 0) { // If observing, calculate using main potential only @@ -516,7 +517,7 @@ void Force::compute( force_per_atom.data(), virial_per_atom.data(), (double)potentials.size()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n"); } @@ -552,7 +553,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -561,7 +562,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // always correct the force when using the FCP potential @@ -574,7 +575,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -583,7 +584,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } @@ -647,7 +648,7 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten s_t[tid] = t; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_t[tid] += s_t[tid + offset]; @@ -754,7 +755,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL temperature += delta_T; if (multiple_potentials_mode_.compare("observe") == 0) { @@ -805,7 +806,7 @@ void Force::compute( force_per_atom.data(), virial_per_atom.data(), (double)potentials.size()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n"); } @@ -841,7 +842,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -850,7 +851,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (compute_hnemdec_ == 0) { // the tensor: // xx xy xz 0 3 4 @@ -876,10 +877,10 @@ void Force::compute( virial_per_atom.data() + 8 * number_of_atoms, virial_per_atom.data() + 2 
* number_of_atoms, tensor_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sum_tensor<<<9, 1024>>>(number_of_atoms, tensor_per_atom.data(), tensor_tot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -901,7 +902,7 @@ void Force::compute( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (compute_hnemdec_ != -1) { gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( @@ -926,7 +927,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -935,7 +936,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/force/force_constant.cu b/src/force/force_constant.cu index 164306d24..90608601f 100644 --- a/src/force/force_constant.cu +++ b/src/force/force_constant.cu @@ -23,6 +23,7 @@ Use finite difference to calculate the seconod order force constants: #include "model/box.cuh" #include "model/group.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include static __global__ void gpu_shift_atom(const double dx, double* x) { x[0] += dx; } @@ -34,13 +35,13 @@ static void shift_atom( if (beta == 0) { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (beta == 1) { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms * 2 + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -67,10 +68,10 @@ static void get_f( box, position_per_atom, type, group, potential_per_atom, force_per_atom, virial_per_atom); size_t M = sizeof(double); - CHECK(cudaMemcpy(f + 0, force_per_atom.data() + n1, M, cudaMemcpyDeviceToHost)); - CHECK(cudaMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(f + 0, force_per_atom.data() + n1, M, gpuMemcpyDeviceToHost)); + CHECK(gpuMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, gpuMemcpyDeviceToHost)); CHECK( - cudaMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, cudaMemcpyDeviceToHost)); + gpuMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, gpuMemcpyDeviceToHost)); shift_atom(-dx, n2, beta, position_per_atom); } diff --git a/src/force/ilp_tmd_sw.cu b/src/force/ilp_tmd_sw.cu index 931f47348..079029e0d 100644 --- a/src/force/ilp_tmd_sw.cu +++ b/src/force/ilp_tmd_sw.cu @@ -22,6 +22,7 @@ TODO: #include "neighbor.cuh" #include "utilities/error.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 128 @@ -126,8 +127,7 @@ ILP_TMD_SW::ILP_TMD_SW(FILE* fid_ilp, FILE* fid_sw, int num_types, int num_atoms // init constant cutoff coeff float h_tap_coeff[8] = \ {1.0f, 0.0f, 0.0f, 0.0f, -35.0f, 84.0f, -70.0f, 20.0f}; - cudaMemcpyToSymbol(Tap_coeff_tmd, h_tap_coeff, 8 * sizeof(float)); - CUDA_CHECK_KERNEL + CHECK(gpuMemcpyToSymbol(Tap_coeff_tmd, h_tap_coeff, 8 * sizeof(float))); // set ilp_flag to 1 ilp_flag = 1; @@ -1437,7 +1437,7 @@ void ILP_TMD_SW::compute_ilp( number_of_atoms, N1, N2, box, big_ilp_NN, 
big_ilp_NL, \ type.data(), ilp_para, x, y, z, ilp_NN, \ ilp_NL, group[1].label.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // initialize force of ilp neighbor temporary vector ilp_data.f12x_ilp_neigh.fill(0); @@ -1485,7 +1485,7 @@ void ILP_TMD_SW::compute_ilp( g_f12x_ilp_neigh, g_f12y_ilp_neigh, g_f12z_ilp_neigh); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL reduce_force_many_body<<>>( number_of_atoms, @@ -1510,7 +1510,7 @@ void ILP_TMD_SW::compute_ilp( g_f12x_ilp_neigh, g_f12y_ilp_neigh, g_f12z_ilp_neigh); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // step 1: calculate the partial forces gpu_find_force_sw3_partial<<>>( @@ -1518,7 +1518,7 @@ void ILP_TMD_SW::compute_ilp( type.data(), position_per_atom.data(), position_per_atom.data() + number_of_atoms, position_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), sw2_data.f12x.data(), sw2_data.f12y.data(), sw2_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // step 2: calculate force and related quantities find_properties_many_body( diff --git a/src/force/lj.cu b/src/force/lj.cu index 4fafeeb61..88c4b553f 100644 --- a/src/force/lj.cu +++ b/src/force/lj.cu @@ -20,6 +20,7 @@ The class dealing with the Lennard-Jones (LJ) pairwise potentials. #include "lj.cuh" #include "neighbor.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" // best block size here: 128 #define BLOCK_SIZE_FORCE 128 @@ -233,5 +234,5 @@ void LJ::compute( force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/force/neighbor.cu b/src/force/neighbor.cu index 8b234415e..1a9580a98 100644 --- a/src/force/neighbor.cu +++ b/src/force/neighbor.cu @@ -19,6 +19,7 @@ neighbor list. #include "neighbor.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -183,18 +184,18 @@ void find_cell_list( cell_count_sum.resize(N_cells); } - CHECK(cudaMemset(cell_count.data(), 0, sizeof(int) * N_cells)); - CHECK(cudaMemset(cell_count_sum.data(), 0, sizeof(int) * N_cells)); - CHECK(cudaMemset(cell_contents.data(), 0, sizeof(int) * N)); + CHECK(gpuMemset(cell_count.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_count_sum.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_contents.data(), 0, sizeof(int) * N)); find_cell_counts<<>>( box, N, cell_count.data(), x, y, z, num_bins[0], num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( thrust::device, cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); - CHECK(cudaMemset(cell_count.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_count.data(), 0, sizeof(int) * N_cells)); find_cell_contents<<>>( box, @@ -209,7 +210,7 @@ void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static void __global__ set_to_zero(int size, int* data) @@ -221,7 +222,7 @@ static void __global__ set_to_zero(int size, int* data) } void find_cell_list( - cudaStream_t& stream, + gpuStream_t& stream, const double rc, const int* num_bins, Box& box, @@ -248,29 +249,33 @@ void find_cell_list( set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_count_sum.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count_sum.size(), cell_count_sum.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_contents.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_contents.size(), 
cell_contents.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_counts<<>>( box, N, cell_count.data(), x, y, z, num_bins[0], num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( +#ifdef USE_HIP + thrust::hip::par.on(stream), +#else thrust::cuda::par.on(stream), +#endif cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_contents<<>>( box, @@ -285,7 +290,7 @@ void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void find_neighbor( @@ -335,11 +340,11 @@ void find_neighbor( num_bins[2], rc_inv_cell_list, rc * rc); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // For ILP, the neighbor could not contain atoms in the same layer @@ -488,11 +493,11 @@ void find_neighbor_ilp( rc_inv_cell_list, rc * rc, big_ilp_cutoff_square); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list_ilp<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_neighbor_ON1_SW( @@ -624,9 +629,9 @@ void find_neighbor_SW( num_bins[2], rc_inv_cell_list, rc * rc); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } \ No newline at end of file diff --git a/src/force/nep3.cu b/src/force/nep3.cu index 1f7729ccc..8fcec4206 100644 --- a/src/force/nep3.cu +++ b/src/force/nep3.cu @@ -25,6 +25,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
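One spot the gpu* macros cannot paper over is Thrust: the stream-bound execution policy lives in the thrust::cuda namespace under CUDA but in thrust::hip under rocThrust, hence the #ifdef around thrust::exclusive_scan above. The pattern in isolation, assuming the gpuStream_t alias sketched earlier and the usual Thrust/rocThrust header paths:

#include <thrust/scan.h>
#ifdef USE_HIP
#include <thrust/system/hip/execution_policy.h>
#else
#include <thrust/system/cuda/execution_policy.h>
#endif

// Exclusive prefix sum over per-cell counts, enqueued on 'stream' rather
// than the default stream so it can overlap work on other devices.
static void exclusive_scan_on_stream(const int* in, int* out, int n, gpuStream_t stream)
{
  thrust::exclusive_scan(
#ifdef USE_HIP
    thrust::hip::par.on(stream), // rocThrust execution policy
#else
    thrust::cuda::par.on(stream), // CUDA Thrust execution policy
#endif
    in, in + n, out);
}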
#include "nep3_small_box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -1176,7 +1177,7 @@ void NEP3::compute_large_box( nep_data.NL_radial.data(), nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL static int num_calls = 0; if (num_calls++ % 1000 == 0) { @@ -1202,11 +1203,11 @@ void NEP3::compute_large_box( gpu_sort_neighbor_list<<>>( N, nep_data.NN_radial.data(), nep_data.NL_radial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sort_neighbor_list<<>>( N, nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_polarizability = paramb.model_type == 2; find_descriptor<<>>( @@ -1233,7 +1234,7 @@ void NEP3::compute_large_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<>>( @@ -1258,7 +1259,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<>>( paramb, @@ -1282,7 +1283,7 @@ void NEP3::compute_large_box( nep_data.f12x.data(), nep_data.f12y.data(), nep_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_properties_many_body( box, @@ -1295,7 +1296,7 @@ void NEP3::compute_large_box( position_per_atom, force_per_atom, virial_per_atom); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ -1316,7 +1317,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1362,7 +1363,7 @@ void NEP3::compute_small_box( r12.data() + size_x12 * 3, r12.data() + size_x12 * 4, r12.data() + size_x12 * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const bool is_polarizability = paramb.model_type == 2; find_descriptor_small_box<<>>( @@ -1391,7 +1392,7 @@ void NEP3::compute_small_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial_small_box<<>>( @@ -1415,7 +1416,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular_small_box<<>>( paramb, @@ -1440,7 +1441,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL_small_box<<>>( @@ -1460,7 +1461,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1758,7 +1759,7 @@ void NEP3::compute_large_box( nep_data.NL_radial.data(), nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL static int num_calls = 0; if (num_calls++ % 1000 == 0) { @@ -1784,11 +1785,11 @@ void NEP3::compute_large_box( gpu_sort_neighbor_list<<>>( N, nep_data.NN_radial.data(), nep_data.NL_radial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sort_neighbor_list<<>>( N, nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor<<>>( temperature, @@ -1814,7 +1815,7 @@ void NEP3::compute_large_box( nep_data.Fp.data(), 
virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<>>( @@ -1839,7 +1840,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<>>( paramb, @@ -1863,7 +1864,7 @@ void NEP3::compute_large_box( nep_data.f12x.data(), nep_data.f12y.data(), nep_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_properties_many_body( box, @@ -1876,7 +1877,7 @@ void NEP3::compute_large_box( position_per_atom, force_per_atom, virial_per_atom); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ -1897,7 +1898,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1944,7 +1945,7 @@ void NEP3::compute_small_box( r12.data() + size_x12 * 3, r12.data() + size_x12 * 4, r12.data() + size_x12 * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor_small_box<<>>( temperature, @@ -1972,7 +1973,7 @@ void NEP3::compute_small_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial_small_box<<>>( @@ -1996,7 +1997,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular_small_box<<>>( paramb, @@ -2021,7 +2022,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL_small_box<<>>( @@ -2041,7 +2042,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/force/nep3_multigpu.cu b/src/force/nep3_multigpu.cu index 5cf5f4425..9f9342379 100644 --- a/src/force/nep3_multigpu.cu +++ b/src/force/nep3_multigpu.cu @@ -26,6 +26,7 @@ when there is NVlink, but is also not very bad when there is only PCI-E. 
#include "nep3_multigpu.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -347,7 +348,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( annmb[gpu].dim = annmb[0].dim; annmb[gpu].num_neurons1 = annmb[0].num_neurons1; #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif nep_data[gpu].parameters.resize(annmb[gpu].num_para); @@ -359,7 +360,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( nep_data[gpu].cell_count_sum.resize(num_atoms); nep_data[gpu].cell_contents.resize(num_atoms); - CHECK(cudaStreamCreate(&nep_data[gpu].stream)); + CHECK(gpuStreamCreate(&nep_data[gpu].stream)); #ifdef USE_TABLE nep_data[gpu].gn_radial.resize(table_length * paramb.num_types_sq * (paramb.n_max_radial + 1)); @@ -400,7 +401,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( #endif } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); nep_temp_data.cell_count_sum_cpu.resize(num_atoms); nep_temp_data.cell_count.resize(num_atoms); @@ -417,7 +418,7 @@ void NEP3_MULTIGPU::allocate_memory() for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif nep_data[gpu].f12x.resize(nep_temp_data.num_atoms_per_gpu * paramb.MN_angular); @@ -438,7 +439,7 @@ void NEP3_MULTIGPU::allocate_memory() nep_data[gpu].virial.resize(nep_temp_data.num_atoms_per_gpu * 9); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); nep_temp_data.type.resize(nep_temp_data.num_atoms_per_gpu); nep_temp_data.position.resize(nep_temp_data.num_atoms_per_gpu * 3); @@ -450,7 +451,7 @@ void NEP3_MULTIGPU::allocate_memory() NEP3_MULTIGPU::~NEP3_MULTIGPU(void) { for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaStreamDestroy(nep_data[gpu].stream)); + CHECK(gpuStreamDestroy(nep_data[gpu].stream)); } } @@ -625,7 +626,7 @@ static void __global__ set_to_zero(int size, int* data) } static void find_cell_list( - cudaStream_t& stream, + gpuStream_t& stream, const int partition_direction, const double rc, const int* num_bins, @@ -653,15 +654,15 @@ static void find_cell_list( set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_count_sum.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count_sum.size(), cell_count_sum.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_contents.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_contents.size(), cell_contents.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_counts<<>>( partition_direction, @@ -675,17 +676,21 @@ static void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( +#ifdef USE_HIP + thrust::hip::par.on(stream), +#else thrust::cuda::par.on(stream), +#endif cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_contents<<>>( partition_direction, @@ -701,7 +706,7 @@ static void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void find_neighbor_list_large_box( @@ -1719,19 +1724,19 @@ void NEP3_MULTIGPU::compute( position.data(), nep_temp_data.type.data(), nep_temp_data.position.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].type.data(), nep_temp_data.type.data(), 
sizeof(int) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].position.data() + nep_temp_data.num_atoms_per_gpu * d, nep_temp_data.position.data() + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } } @@ -1739,7 +1744,7 @@ void NEP3_MULTIGPU::compute( for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif find_cell_list( @@ -1779,7 +1784,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].NL_radial.data(), nep_data[gpu].NN_angular.data(), nep_data[gpu].NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_polarizability = paramb.model_type == 2; find_descriptor<<< @@ -1810,7 +1815,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].Fp.data(), nep_data[gpu].virial.data(), nep_data[gpu].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<< @@ -1839,7 +1844,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -1867,7 +1872,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].f12x.data(), nep_data[gpu].f12y.data(), nep_data[gpu].f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_force_many_body<<< (nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, @@ -1890,7 +1895,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<< @@ -1915,34 +1920,34 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data(), nep_data[gpu].potential.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); // serial for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.potential.data() + nep_data[gpu].N1, nep_data[gpu].potential.data() + nep_data[gpu].N1, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } for (int d = 0; d < 9; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } collect_properties<<<(nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, 64>>>( @@ -1958,7 +1963,7 @@ void NEP3_MULTIGPU::compute( force.data(), potential.data(), virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -2269,19 +2274,19 @@ void NEP3_MULTIGPU::compute( position.data(), 
nep_temp_data.type.data(), nep_temp_data.position.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].type.data(), nep_temp_data.type.data(), sizeof(int) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].position.data() + nep_temp_data.num_atoms_per_gpu * d, nep_temp_data.position.data() + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } } @@ -2289,7 +2294,7 @@ void NEP3_MULTIGPU::compute( for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif find_cell_list( @@ -2329,7 +2334,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].NL_radial.data(), nep_data[gpu].NN_angular.data(), nep_data[gpu].NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -2359,7 +2364,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].Fp.data(), nep_data[gpu].virial.data(), nep_data[gpu].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<< @@ -2388,7 +2393,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -2416,7 +2421,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].f12x.data(), nep_data[gpu].f12y.data(), nep_data[gpu].f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_force_many_body<<< (nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, @@ -2439,7 +2444,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<< @@ -2464,39 +2469,39 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data(), nep_data[gpu].potential.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaSetDevice(gpu)); - CHECK(cudaDeviceSynchronize()); + CHECK(gpuSetDevice(gpu)); + CHECK(gpuDeviceSynchronize()); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); // serial for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.potential.data() + nep_data[gpu].N1, nep_data[gpu].potential.data() + nep_data[gpu].N1, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } for (int d = 0; d < 9; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - 
nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } collect_properties<<<(nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, 64>>>( @@ -2512,6 +2517,6 @@ void NEP3_MULTIGPU::compute( force.data(), potential.data(), virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/force/nep3_multigpu.cuh b/src/force/nep3_multigpu.cuh index 79280b70e..ecb70d954 100644 --- a/src/force/nep3_multigpu.cuh +++ b/src/force/nep3_multigpu.cuh @@ -57,7 +57,7 @@ struct NEP3_MULTIGPU_Data { int N1, N2, N3, N4, N5; // for local system int M0, M1, M2; // for global system - cudaStream_t stream; + gpuStream_t stream; }; struct NEP3_TEMP_Data { diff --git a/src/force/nep3_small_box.cuh b/src/force/nep3_small_box.cuh index f6d012ce4..cdbf5a7b8 100644 --- a/src/force/nep3_small_box.cuh +++ b/src/force/nep3_small_box.cuh @@ -17,6 +17,7 @@ #include "nep3.cuh" #include "utilities/common.cuh" #include "utilities/nep_utilities.cuh" +#include "utilities/gpu_macro.cuh" #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) static __device__ __inline__ double atomicAdd(double* address, double val) diff --git a/src/force/potential.cu b/src/force/potential.cu index c7eb2dc8e..b3d894141 100644 --- a/src/force/potential.cu +++ b/src/force/potential.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the potential classes. #include "potential.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 #include #include @@ -162,7 +163,7 @@ void Potential::find_properties_many_body( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_force_many_body( @@ -312,5 +313,5 @@ void Potential::find_properties_many_body( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/force/tersoff1988.cu b/src/force/tersoff1988.cu index 33405a166..823f18f15 100644 --- a/src/force/tersoff1988.cu +++ b/src/force/tersoff1988.cu @@ -23,6 +23,7 @@ The version of the Tersoff potential as described in #include "tersoff1988.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #define LDG(a, n) __ldg(a + n) @@ -538,7 +539,7 @@ void Tersoff1988::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_data.b.data(), tersoff_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_tersoff_step2<<>>( @@ -560,7 +561,7 @@ void Tersoff1988::compute( tersoff_data.f12x.data(), tersoff_data.f12y.data(), tersoff_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/force/tersoff1989.cu b/src/force/tersoff1989.cu index 0b8035c77..858f08684 100644 --- a/src/force/tersoff1989.cu +++ b/src/force/tersoff1989.cu @@ -23,6 +23,7 @@ The double-element version of the Tersoff potential as described in #include "tersoff1989.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 // 128 is also good @@ -556,7 +557,7 @@ void Tersoff1989::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_data.b.data(), tersoff_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_tersoff_step2<<>>( @@ -579,7 +580,7 @@ void 
Tersoff1989::compute( tersoff_data.f12x.data(), tersoff_data.f12y.data(), tersoff_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/force/tersoff_mini.cu b/src/force/tersoff_mini.cu index bad984625..65c30848c 100644 --- a/src/force/tersoff_mini.cu +++ b/src/force/tersoff_mini.cu @@ -26,6 +26,7 @@ J. Phys.: Condens. Matter 32, 135901 (2020). #include "tersoff_mini.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 @@ -371,7 +372,7 @@ void Tersoff_mini::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_mini_data.b.data(), tersoff_mini_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_step2<<>>( @@ -393,7 +394,7 @@ void Tersoff_mini::compute( tersoff_mini_data.f12x.data(), tersoff_mini_data.f12y.data(), tersoff_mini_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/integrate/ensemble.cu b/src/integrate/ensemble.cu index cb77bbe3a..51c3d3f2d 100644 --- a/src/integrate/ensemble.cu +++ b/src/integrate/ensemble.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the ensemble classes. #include "ensemble.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define DIM 3 Ensemble::Ensemble(void) @@ -314,7 +315,7 @@ void Ensemble::velocity_verlet( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Find some thermodynamic properties: @@ -757,7 +758,7 @@ void Ensemble::find_thermo( thermo.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Scale the velocity of every particle in the systems by a factor @@ -782,7 +783,7 @@ void Ensemble::scale_velocity_global(const double factor, GPU_Vector& ve velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_vc_and_ke( @@ -837,7 +838,7 @@ static __global__ void gpu_find_vc_and_ke( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_mc[tid] += s_mc[tid + offset]; @@ -887,7 +888,7 @@ void Ensemble::find_vc_and_ke( vcy, vcz, ke); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_scale_velocity( @@ -963,5 +964,5 @@ void Ensemble::scale_velocity_local( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/integrate/ensemble_bao.cu b/src/integrate/ensemble_bao.cu index c30ee36ea..b1d0032cc 100644 --- a/src/integrate/ensemble_bao.cu +++ b/src/integrate/ensemble_bao.cu @@ -23,6 +23,7 @@ The Langevin thermostat with the BAOAB splitting: #include "ensemble_bao.cuh" #include "langevin_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include Ensemble_BAO::Ensemble_BAO(int t, int N, double T, double Tc) @@ -35,7 +36,7 @@ Ensemble_BAO::Ensemble_BAO(int t, int N, double T, double Tc) curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_BAO::Ensemble_BAO( @@ -69,9 +70,9 @@ Ensemble_BAO::Ensemble_BAO( int grid_size_sink 
= (N_sink - 1) / 128 + 1; initialize_curand_states<<>>( curand_states_source.data(), N_source, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_curand_states<<>>(curand_states_sink.data(), N_sink, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL energy_transferred[0] = 0.0; energy_transferred[1] = 0.0; } @@ -96,7 +97,7 @@ void Ensemble_BAO::integrate_nvt_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum<<<4, 1024>>>( number_of_atoms, @@ -104,14 +105,14 @@ void Ensemble_BAO::integrate_nvt_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // wrapper of the local Langevin thermostatting kernels @@ -136,7 +137,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] += ek2[source] * 0.5; @@ -153,7 +154,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_langevin<<<(N_sink - 1) / 128 + 1, 128>>>( curand_states_sink.data(), @@ -166,7 +167,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_ke<<>>( group[0].size.data(), @@ -177,7 +178,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] -= ek2[source] * 0.5; @@ -271,7 +272,7 @@ void Ensemble_BAO::operator_A( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_operator_A<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -288,7 +289,7 @@ void Ensemble_BAO::operator_A( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -411,7 +412,7 @@ void Ensemble_BAO::operator_B( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_BAO::compute1( diff --git a/src/integrate/ensemble_bao.cuh b/src/integrate/ensemble_bao.cuh index 47a28561c..e462f5dbe 100644 --- a/src/integrate/ensemble_bao.cuh +++ b/src/integrate/ensemble_bao.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Ensemble_BAO : public Ensemble { @@ -41,9 +46,9 @@ public: protected: int N_source, N_sink, offset_source, offset_sink; double c1, c2, c2_source, c2_sink; - GPU_Vector curand_states; - GPU_Vector curand_states_source; - GPU_Vector curand_states_sink; + GPU_Vector curand_states; + GPU_Vector curand_states_source; + 
GPU_Vector curand_states_sink; void operator_A( const double time_step, diff --git a/src/integrate/ensemble_bdp.cu b/src/integrate/ensemble_bdp.cu index 78e27b675..148f89477 100644 --- a/src/integrate/ensemble_bdp.cu +++ b/src/integrate/ensemble_bdp.cu @@ -21,6 +21,7 @@ The Bussi-Donadio-Parrinello thermostat: #include "ensemble_bdp.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #define DIM 3 diff --git a/src/integrate/ensemble_ber.cu b/src/integrate/ensemble_ber.cu index 70f9ff56c..6523c412e 100644 --- a/src/integrate/ensemble_ber.cu +++ b/src/integrate/ensemble_ber.cu @@ -20,6 +20,7 @@ The Berendsen thermostat and barostat: #include "ensemble_ber.cuh" #include "npt_utilities.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_BER::Ensemble_BER(int t, int mg, double* mv, double T, double Tc) { @@ -95,7 +96,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (deform_x) { scale_factor[0] = box.cpu_h[0]; @@ -141,7 +142,7 @@ static void cpu_pressure_isotropic( Box& box, double* p0, double* p_coupling, double* thermo, double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); scale_factor = 1.0 - p_coupling[0] * (p0[0] - (p[0] + p[1] + p[2]) * 0.3333333333333333); box.cpu_h[0] *= scale_factor; box.cpu_h[1] *= scale_factor; @@ -156,7 +157,7 @@ cpu_pressure_triclinic(Box& box, double* p0, double* p_coupling, double* thermo, { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -233,7 +234,7 @@ void Ensemble_BER::compute2( atom.velocity_per_atom.data(), atom.velocity_per_atom.data() + number_of_atoms, atom.velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } if (type == 11) { @@ -266,7 +267,7 @@ void Ensemble_BER::compute2( atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { double mu[9]; cpu_pressure_triclinic(box, target_pressure, pressure_coupling, thermo.data(), mu); diff --git a/src/integrate/ensemble_lan.cu b/src/integrate/ensemble_lan.cu index 327b69277..6dae340b0 100644 --- a/src/integrate/ensemble_lan.cu +++ b/src/integrate/ensemble_lan.cu @@ -21,6 +21,7 @@ The Bussi-Parrinello integrator of the Langevin thermostat: #include "ensemble_lan.cuh" #include "langevin_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include Ensemble_LAN::Ensemble_LAN() {} @@ -35,7 +36,7 @@ Ensemble_LAN::Ensemble_LAN(int t, int N, double T, double Tc) curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_LAN::Ensemble_LAN( @@ -69,9 +70,9 @@ Ensemble_LAN::Ensemble_LAN( int grid_size_sink = (N_sink - 1) / 128 + 1; 
initialize_curand_states<<>>( curand_states_source.data(), N_source, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_curand_states<<>>(curand_states_sink.data(), N_sink, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL energy_transferred[0] = 0.0; energy_transferred[1] = 0.0; } @@ -98,7 +99,7 @@ void Ensemble_LAN::integrate_nvt_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum<<<4, 1024>>>( number_of_atoms, @@ -106,14 +107,14 @@ void Ensemble_LAN::integrate_nvt_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // wrapper of the local Langevin thermostatting kernels @@ -138,7 +139,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] += ek2[source] * 0.5; @@ -155,7 +156,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_langevin<<<(N_sink - 1) / 128 + 1, 128>>>( curand_states_sink.data(), @@ -168,7 +169,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_ke<<>>( group[0].size.data(), @@ -179,7 +180,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] -= ek2[source] * 0.5; diff --git a/src/integrate/ensemble_lan.cuh b/src/integrate/ensemble_lan.cuh index 5968d8283..f769a6390 100644 --- a/src/integrate/ensemble_lan.cuh +++ b/src/integrate/ensemble_lan.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Ensemble_LAN : public Ensemble { @@ -42,9 +47,9 @@ public: protected: int N_source, N_sink, offset_source, offset_sink; double c1, c2, c2_source, c2_sink; - GPU_Vector curand_states; - GPU_Vector curand_states_source; - GPU_Vector curand_states_sink; + GPU_Vector curand_states; + GPU_Vector curand_states_source; + GPU_Vector curand_states_sink; void integrate_nvt_lan_half(const GPU_Vector& mass, GPU_Vector& velocity_per_atom); diff --git a/src/integrate/ensemble_msst.cu b/src/integrate/ensemble_msst.cu index 068f48825..040cb84d8 100644 --- a/src/integrate/ensemble_msst.cu +++ b/src/integrate/ensemble_msst.cu @@ -18,6 +18,7 @@ The NVE ensemble integrator. 
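Ensemble_BAO and Ensemble_LAN above keep one RNG state per atom in GPU_Vector<gpurandState>. The gpurandState type appears in this diff; a sketch of how the RNG portion of the macro scheme can be laid out is below, where the device-function aliases are assumptions based on what Langevin thermostats typically call rather than anything shown in the patch:

// Hypothetical RNG section of utilities/gpu_macro.cuh (illustrative only).
#ifdef USE_HIP
#include <hiprand/hiprand_kernel.h>
typedef hiprandState_t gpurandState;
#define gpurand_init hiprand_init
#define gpurand_normal_double hiprand_normal_double
#else
#include <curand_kernel.h>
typedef curandState gpurandState;
#define gpurand_init curand_init
#define gpurand_normal_double curand_normal_double
#endif

Only the state type and the handful of device functions need aliases; member and kernel names such as curand_states and initialize_curand_states can keep their historical spelling, which is exactly what this diff does.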
------------------------------------------------------------------------------*/ #include "ensemble_msst.cuh" +#include "utilities/gpu_macro.cuh" #include namespace @@ -200,7 +201,7 @@ void Ensemble_MSST::remap(double dilation) dilation, atom->position_per_atom.data() + shock_direction * N, atom->velocity_per_atom.data() + shock_direction * N); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_MSST::get_conserved() @@ -271,22 +272,22 @@ void Ensemble_MSST::compute1( get_omega(); get_vsum(); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( gpu_v_backup.data(), atom.velocity_per_atom.data(), sizeof(double) * gpu_v_backup.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); // propagate velocity sum 1/2 step by temporarily propagating the velocities msst_v(); get_vsum(); // reset the velocities - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( atom.velocity_per_atom.data(), gpu_v_backup.data(), sizeof(double) * gpu_v_backup.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); // propagate velocities 1/2 step using the new velocity sum msst_v(); diff --git a/src/integrate/ensemble_mttk.cu b/src/integrate/ensemble_mttk.cu index 04a3df07a..b59f6c7f2 100644 --- a/src/integrate/ensemble_mttk.cu +++ b/src/integrate/ensemble_mttk.cu @@ -21,6 +21,7 @@ P and T are both set -> NPT ensemable ------------------------------------------------------------------------------*/ #include "ensemble_mttk.cuh" +#include "utilities/gpu_macro.cuh" #include namespace diff --git a/src/integrate/ensemble_nhc.cu b/src/integrate/ensemble_nhc.cu index 7bd643304..8fc17e1f5 100644 --- a/src/integrate/ensemble_nhc.cu +++ b/src/integrate/ensemble_nhc.cu @@ -21,6 +21,7 @@ Oxford University Press, 2010. #include "ensemble_nhc.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define DIM 3 Ensemble_NHC::Ensemble_NHC(int t, int mg, double* mv, int N, double T, double Tc, double dt) diff --git a/src/integrate/ensemble_nphug.cu b/src/integrate/ensemble_nphug.cu index 3d46399ac..e9c92dbf7 100644 --- a/src/integrate/ensemble_nphug.cu +++ b/src/integrate/ensemble_nphug.cu @@ -14,6 +14,7 @@ */ #include "ensemble_nphug.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_npt_scr.cu b/src/integrate/ensemble_npt_scr.cu index edcaaadd7..c5b104b2b 100644 --- a/src/integrate/ensemble_npt_scr.cu +++ b/src/integrate/ensemble_npt_scr.cu @@ -24,6 +24,7 @@ J. Chem. Phys. 153, 114107 (2020). 
#include "npt_utilities.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include void Ensemble_NPT_SCR::initialize_rng() @@ -85,7 +86,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (deform_x) { scale_factor[0] = box.cpu_h[0]; @@ -146,7 +147,7 @@ static void cpu_pressure_isotropic( double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); const double pressure_instant = (p[0] + p[1] + p[2]) * 0.3333333333333333; const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (target_pressure[0] - pressure_instant); @@ -174,7 +175,7 @@ static void cpu_pressure_triclinic( { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -295,7 +296,7 @@ void Ensemble_NPT_SCR::compute2( atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { double mu[9]; cpu_pressure_triclinic( diff --git a/src/integrate/ensemble_nve.cu b/src/integrate/ensemble_nve.cu index 05ad72c9a..9b0b51e6e 100644 --- a/src/integrate/ensemble_nve.cu +++ b/src/integrate/ensemble_nve.cu @@ -18,6 +18,7 @@ The NVE ensemble integrator. 
------------------------------------------------------------------------------*/ #include "ensemble_nve.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_NVE::Ensemble_NVE(int t) { type = t; } diff --git a/src/integrate/ensemble_pimd.cu b/src/integrate/ensemble_pimd.cu index 7e67a18bc..604357f65 100644 --- a/src/integrate/ensemble_pimd.cu +++ b/src/integrate/ensemble_pimd.cu @@ -27,6 +27,7 @@ References for implementation: #include "langevin_utilities.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -173,7 +174,7 @@ void Ensemble_PIMD::initialize(Atom& atom) curand_states.resize(number_of_atoms); int grid_size = (number_of_atoms - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), number_of_atoms, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_PIMD::~Ensemble_PIMD(void) @@ -288,7 +289,7 @@ static __global__ void gpu_langevin( const bool thermostat_centroid, const int number_of_atoms, const int number_of_beads, - curandState* g_state, + gpurandState* g_state, const double temperature, const double temperature_coupling, const double omega_n, @@ -315,7 +316,7 @@ static __global__ void gpu_langevin( } } - curandState state = g_state[n]; + gpurandState state = g_state[n]; for (int k = 0; k < number_of_beads; ++k) { if (k == 0 && !thermostat_centroid) { continue; @@ -592,7 +593,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (box.pbc_x == 1) { const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (p0[0] - p[0]); @@ -638,7 +639,7 @@ static void cpu_pressure_isotropic( double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); const double pressure_instant = (p[0] + p[1] + p[2]) * 0.3333333333333333; const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (target_pressure[0] - pressure_instant); @@ -666,7 +667,7 @@ static void cpu_pressure_triclinic( { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -792,15 +793,15 @@ void Ensemble_PIMD::langevin(const double time_step, Atom& atom) transformation_matrix.data(), atom.mass.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum_beads<<>>( number_of_atoms, atom.mass.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum_beads<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, number_of_beads, velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -817,7 +818,7 @@ void Ensemble_PIMD::compute1( gpu_apply_pbc<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, number_of_atoms, number_of_beads, position_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_nve_1<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, @@ -829,7 +830,7 @@ void Ensemble_PIMD::compute1( force_beads.data(), position_beads.data(), velocity_beads.data()); - 
CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_PIMD::compute2( @@ -848,13 +849,13 @@ atom.mass.data(), force_beads.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL langevin(time_step, atom); gpu_apply_pbc<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, number_of_atoms, number_of_beads, position_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_average<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, @@ -869,7 +870,7 @@ atom.potential_per_atom.data(), atom.force_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_kinetic_energy_virial_part<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, @@ -880,7 +881,7 @@ atom.position_per_atom.data(), kinetic_energy_virial_part.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_sum_1024<<<1024, 128>>>( number_of_atoms, @@ -888,11 +889,11 @@ atom.potential_per_atom.data(), atom.virial_per_atom.data(), sum_1024.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_thermo<<<8, 1024>>>( box.get_volume(), number_of_atoms * K_B * temperature, sum_1024.data(), thermo.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (num_target_pressure_components == 1) { double scale_factor; @@ -916,7 +917,7 @@ scale_factor[2], position_beads.data(), atom.position_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (num_target_pressure_components == 6) { double mu[9]; cpu_pressure_triclinic( diff --git a/src/integrate/ensemble_pimd.cuh b/src/integrate/ensemble_pimd.cuh index 498c46b69..3c7e6a806 100644 --- a/src/integrate/ensemble_pimd.cuh +++ b/src/integrate/ensemble_pimd.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif #include #include @@ -59,7 +64,7 @@ protected: bool thermostat_internal = false; bool thermostat_centroid = false; double omega_n; - GPU_Vector<curandState> curand_states; + GPU_Vector<gpurandState> curand_states; GPU_Vector<double> position_beads; GPU_Vector<double> velocity_beads; GPU_Vector<double> potential_beads; diff --git a/src/integrate/ensemble_ti.cu b/src/integrate/ensemble_ti.cu index f64478d1a..4a9f9849d 100644 --- a/src/integrate/ensemble_ti.cu +++ b/src/integrate/ensemble_ti.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -125,7 +126,7 @@ void Ensemble_TI::init() curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thermo_cpu.resize(thermo->size()); gpu_k.resize(N); @@ -139,11 +140,11 @@ gpu_k.copy_from_host(cpu_k.data()); gpu_espring.resize(N); position_0.resize(3 * N); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( position_0.data(), atom->position_per_atom.data(), sizeof(double) * position_0.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } void Ensemble_TI::find_thermo() diff --git a/src/integrate/ensemble_ti_as.cu b/src/integrate/ensemble_ti_as.cu index 352e94054..0cb06aea3 100644 --- a/src/integrate/ensemble_ti_as.cu +++ b/src/integrate/ensemble_ti_as.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_as.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_TI_AS::Ensemble_TI_AS(const char** params, int num_params) { diff --git a/src/integrate/ensemble_ti_rs.cu b/src/integrate/ensemble_ti_rs.cu index
58440c697..918477fc4 100644 --- a/src/integrate/ensemble_ti_rs.cu +++ b/src/integrate/ensemble_ti_rs.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_rs.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_ti_spring.cu b/src/integrate/ensemble_ti_spring.cu index 0da21a50d..f633e0797 100644 --- a/src/integrate/ensemble_ti_spring.cu +++ b/src/integrate/ensemble_ti_spring.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_spring.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -164,18 +165,18 @@ void Ensemble_TI_Spring::init() curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thermo_cpu.resize(thermo->size()); gpu_k.resize(N, 0); cpu_k.resize(N, 0); gpu_espring.resize(N); position_0.resize(3 * N); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( position_0.data(), atom->position_per_atom.data(), sizeof(double) * position_0.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); if (!auto_k) { for (int i = 0; i < N; i++) { diff --git a/src/integrate/ensemble_wall_harmonic.cu b/src/integrate/ensemble_wall_harmonic.cu index adcc84df5..d7d073ba5 100644 --- a/src/integrate/ensemble_wall_harmonic.cu +++ b/src/integrate/ensemble_wall_harmonic.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_harmonic.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_wall_mirror.cu b/src/integrate/ensemble_wall_mirror.cu index 14f7685d0..8e615976b 100644 --- a/src/integrate/ensemble_wall_mirror.cu +++ b/src/integrate/ensemble_wall_mirror.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_mirror.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_wall_piston.cu b/src/integrate/ensemble_wall_piston.cu index 7ace8a5f0..4e4982afb 100644 --- a/src/integrate/ensemble_wall_piston.cu +++ b/src/integrate/ensemble_wall_piston.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_piston.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/integrate.cu b/src/integrate/integrate.cu index c9e815d0b..f92cc9a21 100644 --- a/src/integrate/integrate.cu +++ b/src/integrate/integrate.cu @@ -38,6 +38,7 @@ The driver class for the various integrators. #include "integrate.cuh" #include "model/atom.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" void Integrate::initialize( @@ -273,7 +274,7 @@ void Integrate::compute1( atom.position_temp.data(), atom.position_temp.data() + num_atoms, atom.position_temp.data() + num_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ensemble->compute1(time_step, group, box, atom, thermo); @@ -288,7 +289,7 @@ atom.unwrapped_position.data(), atom.unwrapped_position.data() + num_atoms, atom.unwrapped_position.data() + num_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Integrate::compute2( diff --git a/src/integrate/langevin_utilities.cuh b/src/integrate/langevin_utilities.cuh index dd186d87d..311dc242c 100644 --- a/src/integrate/langevin_utilities.cuh +++ b/src/integrate/langevin_utilities.cuh @@ -18,20 +18,22 @@ Some CUDA kernels for Langevin thermostats.
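The curand-to-gpurand renames above and below follow the same aliasing scheme. The include lines lost in the extraction of ensemble_pimd.cuh above presumably select the hipRAND device header under USE_HIP and <curand_kernel.h> otherwise; a hedged sketch of the RNG aliases (exact hipRAND header path assumed):

    // Hypothetical sketch of the RNG aliasing in utilities/gpu_macro.cuh (assumed).
    #ifdef USE_HIP
      #include <hiprand/hiprand_kernel.h>
      #define gpurandState          hiprandState
      #define gpurand_init          hiprand_init
      #define gpurand_normal        hiprand_normal
      #define gpurand_normal_double hiprand_normal_double
    #else
      #include <curand_kernel.h>
      #define gpurandState          curandState
      #define gpurand_init          curand_init
      #define gpurand_normal        curand_normal
      #define gpurand_normal_double curand_normal_double
    #endif

Both libraries use the same init signature (seed, subsequence, offset, state pointer), which is why the kernel bodies in this diff need nothing beyond the rename.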
------------------------------------------------------------------------------*/ #pragma once -#define CURAND_NORMAL(a) curand_normal_double(a) +#include "utilities/gpu_macro.cuh" + +#define CURAND_NORMAL(a) gpurand_normal_double(a) // initialize curand states -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } // global Langevin thermostatting static __global__ void gpu_langevin( - curandState* g_state, + gpurandState* g_state, const int N, const double c1, const double c2, @@ -42,7 +44,7 @@ { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curandState state = g_state[n]; + gpurandState state = g_state[n]; double c2m = c2 * sqrt(1.0 / g_mass[n]); g_vx[n] = c1 * g_vx[n] + c2m * CURAND_NORMAL(&state); g_vy[n] = c1 * g_vy[n] + c2m * CURAND_NORMAL(&state); @@ -123,7 +125,7 @@ static __global__ void gpu_correct_momentum(const int N, double* g_vx, double* g // local Langevin thermostatting static __global__ void gpu_langevin( - curandState* g_state, + gpurandState* g_state, const int N, const int offset, const int* g_group_contents, @@ -136,7 +138,7 @@ { int m = blockIdx.x * blockDim.x + threadIdx.x; if (m < N) { - curandState state = g_state[m]; + gpurandState state = g_state[m]; int n = g_group_contents[offset + m]; double c2m = c2 * sqrt(1.0 / g_mass[n]); g_vx[n] = c1 * g_vx[n] + c2m * CURAND_NORMAL(&state); @@ -177,7 +179,7 @@ static __global__ void find_ke( } } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_ke[tid] += s_ke[tid + offset]; diff --git a/src/integrate/npt_utilities.cuh b/src/integrate/npt_utilities.cuh index 2fc01db0c..c7d7c0c24 100644 --- a/src/integrate/npt_utilities.cuh +++ b/src/integrate/npt_utilities.cuh @@ -17,6 +17,9 @@ Some CUDA kernels for position scaling in NPT ------------------------------------------------------------------------------*/ +#pragma once +#include "utilities/gpu_macro.cuh" + static __global__ void gpu_pressure_orthogonal( const int number_of_particles, const double scale_factor_x, diff --git a/src/integrate/svr_utilities.cuh b/src/integrate/svr_utilities.cuh index 6aee7985d..f7d8b7b78 100644 --- a/src/integrate/svr_utilities.cuh +++ b/src/integrate/svr_utilities.cuh @@ -22,6 +22,9 @@ [1] G. Bussi et al. J. Chem. Phys. 126, 014101 (2007). ------------------------------------------------------------------------------*/ +#pragma once +#include "utilities/gpu_macro.cuh" + static double gasdev(std::mt19937& rng) { std::uniform_real_distribution<double> rand1(0, 1); diff --git a/src/main_gpumd/add_efield.cu b/src/main_gpumd/add_efield.cu index 01bb595f5..e7a7256c4 100644 --- a/src/main_gpumd/add_efield.cu +++ b/src/main_gpumd/add_efield.cu @@ -20,6 +20,7 @@ Add electric field to a group of atoms.
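find_ke above is the first of several kernels in this diff that drop a #pragma unroll in front of the same shared-memory tree reduction (the gpu_sum_* and find_group_sum_* kernels below do the same). The loop's trip count depends on blockDim.x, which is only known at run time, so the unroll hint could never take effect; presumably it was removed because hipcc complains about it. The shared pattern, for reference:

    // Block-level tree reduction used by find_ke and the gpu_sum_* kernels.
    // The bound depends on blockDim.x at run time, so "#pragma unroll"
    // (removed throughout this diff) cannot apply here.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
      if (tid < offset) {
        s_ke[tid] += s_ke[tid + offset];
      }
      __syncthreads();
    }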
#include "add_efield.cuh" #include "model/atom.cuh" #include "model/group.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -67,7 +68,7 @@ void Add_Efield::compute(const int step, const std::vector& groups, Atom& atom.force_per_atom.data(), atom.force_per_atom.data() + num_atoms_total, atom.force_per_atom.data() + num_atoms_total * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/main_gpumd/add_force.cu b/src/main_gpumd/add_force.cu index 92bb952da..4b9f68a4f 100644 --- a/src/main_gpumd/add_force.cu +++ b/src/main_gpumd/add_force.cu @@ -20,6 +20,7 @@ Add force to a group of atoms. #include "add_force.cuh" #include "model/atom.cuh" #include "model/group.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -64,7 +65,7 @@ void Add_Force::compute(const int step, const std::vector& groups, Atom& atom.force_per_atom.data(), atom.force_per_atom.data() + num_atoms_total, atom.force_per_atom.data() + num_atoms_total * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/main_gpumd/add_random_force.cu b/src/main_gpumd/add_random_force.cu index d3672a2e3..7e76f19a9 100644 --- a/src/main_gpumd/add_random_force.cu +++ b/src/main_gpumd/add_random_force.cu @@ -19,33 +19,34 @@ Add random forces with zero mean and specified variance. #include "add_random_force.cuh" #include "model/atom.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include #include -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } static __global__ void add_random_force( const int N, const double force_variance, - curandState* g_state, + gpurandState* g_state, double* g_fx, double* g_fy, double* g_fz) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curandState state = g_state[n]; - g_fx[n] += force_variance * curand_normal_double(&state); - g_fy[n] += force_variance * curand_normal_double(&state); - g_fz[n] += force_variance * curand_normal_double(&state); + gpurandState state = g_state[n]; + g_fx[n] += force_variance * gpurand_normal_double(&state); + g_fy[n] += force_variance * gpurand_normal_double(&state); + g_fz[n] += force_variance * gpurand_normal_double(&state); g_state[n] = state; } } @@ -88,7 +89,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* s_f[tid] = f; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; @@ -123,14 +124,14 @@ void Add_Random_Force::compute(const int step, Atom& atom) atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + atom.number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sum_force<<<3, 1024>>>( atom.number_of_atoms, atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + 2 * atom.number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, @@ -138,7 +139,7 @@ void Add_Random_Force::compute(const int step, Atom& atom) atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + 2 * 
atom.number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -168,7 +169,7 @@ void Add_Random_Force::parse(const char** param, int num_param, int number_of_at curand_states_.resize(number_of_atoms); int grid_size = (number_of_atoms - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states_.data(), number_of_atoms, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Add_Random_Force::finalize() { num_calls_ = 0; } diff --git a/src/main_gpumd/add_random_force.cuh b/src/main_gpumd/add_random_force.cuh index 30fc6aaef..3bbc7fd55 100644 --- a/src/main_gpumd/add_random_force.cuh +++ b/src/main_gpumd/add_random_force.cuh @@ -15,7 +15,12 @@ #pragma once #include "utilities/gpu_vector.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Atom; @@ -27,7 +32,7 @@ public: void finalize(); private: - GPU_Vector<curandState> curand_states_; + GPU_Vector<gpurandState> curand_states_; int num_calls_ = 0; double force_variance_ = 0.0; }; diff --git a/src/main_gpumd/cohesive.cu b/src/main_gpumd/cohesive.cu index 96fb6ebcb..4c188db1e 100644 --- a/src/main_gpumd/cohesive.cu +++ b/src/main_gpumd/cohesive.cu @@ -25,6 +25,7 @@ Compute the cohesive energy curve with different deformations. #include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" static void __global__ deform_position( diff --git a/src/main_gpumd/electron_stop.cu b/src/main_gpumd/electron_stop.cu index 7e12e5000..12eb55ef5 100644 --- a/src/main_gpumd/electron_stop.cu +++ b/src/main_gpumd/electron_stop.cu @@ -21,6 +21,7 @@ Apply electron stopping. #include "model/atom.cuh" #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -171,21 +172,21 @@ void Electron_Stop::compute(double time_step, Atom& atom) stopping_force.data(), stopping_loss.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_average<<<3, 1024>>>(atom.number_of_atoms, stopping_force.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL apply_electron_stopping<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, stopping_force.data(), atom.force_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_power_loss<<<1, 1024>>>(atom.number_of_atoms, stopping_loss.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL double power_loss_host; - CHECK(cudaMemcpyFromSymbol( - &power_loss_host, device_power_loss, sizeof(double), 0, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpyFromSymbol( - &power_loss_host, device_power_loss, sizeof(double), 0, gpuMemcpyDeviceToHost)); stopping_power_loss += power_loss_host; } diff --git a/src/main_gpumd/main.cu b/src/main_gpumd/main.cu index 57fc332f8..e275cae1c 100644 --- a/src/main_gpumd/main.cu +++ b/src/main_gpumd/main.cu @@ -15,6 +15,7 @@ #include "run.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/main_common.cuh" #include #include @@ -33,12 +34,12 @@ int main(int argc, char* argv[]) printf("Started running GPUMD.\n"); print_line_2(); - CHECK(cudaDeviceSynchronize()); + CHECK(gpuDeviceSynchronize()); clock_t time_begin = clock(); Run run; - CHECK(cudaDeviceSynchronize()); + CHECK(gpuDeviceSynchronize()); clock_t time_finish = clock(); double time_used = (time_finish - time_begin) / double(CLOCKS_PER_SEC); diff --git a/src/main_gpumd/replicate.cu b/src/main_gpumd/replicate.cu index 5070220e1..aaa2ca009 100644 --- a/src/main_gpumd/replicate.cu +++ b/src/main_gpumd/replicate.cu @@ -14,6
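electron_stop.cu above also converts cudaMemcpyFromSymbol, and run.cu below does the same for cudaGetSymbolAddress. These two are the one place where the aliasing cannot be a bare rename: HIP expects __device__ symbols to be wrapped in HIP_SYMBOL(). A hedged sketch of how gpu_macro.cuh presumably bridges the difference:

    // Hypothetical sketch (assumed): symbol accesses differ between the two runtimes.
    #ifdef USE_HIP
      #define gpuMemcpyFromSymbol(dst, symbol, count, offset, kind) \
        hipMemcpyFromSymbol(dst, HIP_SYMBOL(symbol), count, offset, kind)
      #define gpuGetSymbolAddress(devPtr, symbol) \
        hipGetSymbolAddress(devPtr, HIP_SYMBOL(symbol))
    #else
      #define gpuMemcpyFromSymbol cudaMemcpyFromSymbol
      #define gpuGetSymbolAddress cudaGetSymbolAddress
    #endif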
+14,7 @@ */ #include "replicate.cuh" +#include "utilities/gpu_macro.cuh" void Replicate(const char** param, int num_param, Box& box, Atom& atoms, std::vector& groups) { diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu index 17db0bcbb..8c63d55d2 100644 --- a/src/main_gpumd/run.cu +++ b/src/main_gpumd/run.cu @@ -33,6 +33,7 @@ Run simulation according to the inputs in the run.in file. #include "replicate.cuh" #include "run.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include "velocity.cuh" @@ -83,7 +84,7 @@ static void calculate_time_step( } const int N = velocity_per_atom.size() / 3; double* gpu_v2_max; - CHECK(cudaGetSymbolAddress((void**)&gpu_v2_max, device_v2_max)); + CHECK(gpuGetSymbolAddress((void**)&gpu_v2_max, device_v2_max)); gpu_find_largest_v2<<<1, 1024>>>( N, (N - 1) / 1024 + 1, @@ -91,9 +92,9 @@ static void calculate_time_step( velocity_per_atom.data() + N, velocity_per_atom.data() + N * 2, gpu_v2_max); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL double cpu_v2_max[1] = {0.0}; - CHECK(cudaMemcpy(cpu_v2_max, gpu_v2_max, sizeof(double), cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_v2_max, gpu_v2_max, sizeof(double), gpuMemcpyDeviceToHost)); double cpu_v_max = sqrt(cpu_v2_max[0]); double time_step_min = max_distance_per_step / cpu_v_max; @@ -492,23 +493,26 @@ void Run::parse_one_keyword(std::vector& tokens) void Run::parse_velocity(const char** param, int num_param) { - int seed; + int seed = 0; bool use_seed = false; if (!(num_param == 2 || num_param == 4)) { PRINT_INPUT_ERROR("velocity should have 1 or 2 parameters.\n"); + } else if (num_param == 4) { + // See https://github.com/brucefan1983/GPUMD/pull/768 + // for the reason for putting this branch here. + use_seed = true; + if (!is_valid_int(param[3], &seed)) { + PRINT_INPUT_ERROR("seed should be a positive integer.\n"); + } } + if (!is_valid_real(param[1], &initial_temperature)) { PRINT_INPUT_ERROR("initial temperature should be a real number.\n"); } if (initial_temperature <= 0.0) { PRINT_INPUT_ERROR("initial temperature should be a positive number.\n"); } - if (num_param == 4) { - use_seed = true; - if (!is_valid_int(param[3], &seed)) { - PRINT_INPUT_ERROR("seed should be a positive integer.\n"); - } - } + velocity.initialize( has_velocity_in_xyz, initial_temperature, @@ -766,7 +770,7 @@ void Run::parse_change_box(const char** param, int num_param) atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (box.triclinic == 0) { printf(" Changed box lengths are\n"); diff --git a/src/main_gpumd/velocity.cu b/src/main_gpumd/velocity.cu index 058ca38d2..b69931c13 100644 --- a/src/main_gpumd/velocity.cu +++ b/src/main_gpumd/velocity.cu @@ -24,6 +24,7 @@ If DEBUG is off, the velocities are different in different runs. 
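The reordered parse_velocity above accepts the same run.in syntax as before; the change only moves the seed parsing ahead of the temperature checks (see the linked PR) and zero-initializes seed. For reference, the two accepted forms look like this (values illustrative; the four-token form inferred from param[3] being read as the seed):

    velocity 300
    velocity 300 seed 12345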
#include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "velocity.cuh" #include diff --git a/src/main_nep/dataset.cu b/src/main_nep/dataset.cu index 06ded508e..ab44a31de 100644 --- a/src/main_nep/dataset.cu +++ b/src/main_nep/dataset.cu @@ -18,6 +18,7 @@ #include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" void Dataset::copy_structures(std::vector& structures_input, int n1, int n2) @@ -290,7 +291,7 @@ void Dataset::find_neighbor(Parameters& para) r.data() + N * 2, NN_radial_gpu.data(), NN_angular_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL NN_radial_gpu.copy_to_host(NN_radial_cpu.data()); NN_angular_gpu.copy_to_host(NN_angular_cpu.data()); @@ -327,7 +328,7 @@ void Dataset::find_neighbor(Parameters& para) void Dataset::construct( Parameters& para, std::vector& structures_input, int n1, int n2, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); copy_structures(structures_input, n1, n2); find_has_type(para); error_cpu.resize(Nc); @@ -394,7 +395,7 @@ static __global__ void gpu_sum_force_error( std::vector Dataset::get_rmse_force(Parameters& para, const bool use_weight, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); const int block_size = 256; gpu_sum_force_error<<>>( use_weight, @@ -411,7 +412,7 @@ std::vector Dataset::get_rmse_force(Parameters& para, const bool use_weig force_ref_gpu.data() + N * 2, error_gpu.data()); int mem = sizeof(float) * Nc; - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -498,7 +499,7 @@ std::vector Dataset::get_rmse_energy( const bool do_shift, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); energy_shift_per_structure = 0.0f; const int block_size = 256; @@ -507,7 +508,7 @@ std::vector Dataset::get_rmse_energy( if (do_shift) { gpu_get_energy_shift<<>>( Na.data(), Na_sum.data(), energy.data(), energy_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); for (int n = 0; n < Nc; ++n) { energy_shift_per_structure += error_cpu[n]; } @@ -521,7 +522,7 @@ std::vector Dataset::get_rmse_energy( energy.data(), energy_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -589,7 +590,7 @@ static __global__ void gpu_sum_virial_error( std::vector Dataset::get_rmse_virial(Parameters& para, const bool use_weight, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -607,7 +608,7 @@ std::vector Dataset::get_rmse_virial(Parameters& para, const bool use_wei virial.data(), virial_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), 
error_gpu.data(), mem, gpuMemcpyDeviceToHost)); for (int n = 0; n < Nc; ++n) { if (structures[n].has_virial) { float rmse_temp = use_weight ? weight_cpu[n] * weight_cpu[n] * error_cpu[n] : error_cpu[n]; diff --git a/src/main_nep/fitness.cu b/src/main_nep/fitness.cu index 2870d242e..c6b2c37cd 100644 --- a/src/main_nep/fitness.cu +++ b/src/main_nep/fitness.cu @@ -22,6 +22,7 @@ Get the fitness #include "parameters.cuh" #include "structure.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include #include @@ -34,7 +35,7 @@ Get the fitness Fitness::Fitness(Parameters& para) { int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); std::vector structures_train; read_structures(true, para, structures_train); @@ -63,7 +64,7 @@ Fitness::Fitness(Parameters& para) for (int device_id = 0; device_id < deviceCount; ++device_id) { print_line_1(); printf("Constructing train_set in device %d.\n", device_id); - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); train_set[batch_id][device_id].construct( para, structures_train, count - batch_size, count, device_id); print_line_2(); @@ -77,7 +78,7 @@ Fitness::Fitness(Parameters& para) for (int device_id = 0; device_id < deviceCount; ++device_id) { print_line_1(); printf("Constructing test_set in device %d.\n", device_id); - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); test_set[device_id].construct(para, structures_test, 0, structures_test.size(), device_id); print_line_2(); } @@ -133,7 +134,7 @@ void Fitness::compute( const int generation, Parameters& para, const float* population, float* fitness) { int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); int population_iter = (para.population_size - 1) / deviceCount + 1; if (generation == 0) { @@ -344,7 +345,7 @@ void Fitness::write_nep_txt(FILE* fid_nep, Parameters& para, float* elite) for (int m = 0; m < para.number_of_variables; ++m) { fprintf(fid_nep, "%15.7e\n", elite[m]); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); para.q_scaler_gpu[0].copy_to_host(para.q_scaler_cpu.data()); for (int d = 0; d < para.q_scaler_cpu.size(); ++d) { fprintf(fid_nep, "%15.7e\n", para.q_scaler_cpu[d]); diff --git a/src/main_nep/main.cu b/src/main_nep/main.cu index 3cfc7a616..3157432a6 100644 --- a/src/main_nep/main.cu +++ b/src/main_nep/main.cu @@ -17,6 +17,7 @@ #include "parameters.cuh" #include "snes.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/main_common.cuh" #include #include diff --git a/src/main_nep/nep3.cu b/src/main_nep/nep3.cu index 0f59dbb7a..ece3d8d69 100644 --- a/src/main_nep/nep3.cu +++ b/src/main_nep/nep3.cu @@ -26,6 +26,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
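dataset.cu and fitness.cu above repeat one dispatch idiom for multi-GPU training: query the device count once, then select each device explicitly before touching its copy of the data. Distilled from the hunks above:

    // Per-device dispatch pattern used throughout main_nep.
    int deviceCount;
    CHECK(gpuGetDeviceCount(&deviceCount));
    for (int device_id = 0; device_id < deviceCount; ++device_id) {
      CHECK(gpuSetDevice(device_id));
      // ... launch kernels on this device's dataset[device_id] ...
    }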
#include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/nep_utilities.cuh" @@ -292,7 +293,7 @@ NEP3::NEP3( } for (int device_id = 0; device_id < deviceCount; device_id++) { - cudaSetDevice(device_id); + gpuSetDevice(device_id); annmb[device_id].dim = para.dim; annmb[device_id].num_neurons1 = para.num_neurons1; annmb[device_id].num_para = para.number_of_variables; @@ -857,14 +858,14 @@ void NEP3::find_force( { for (int device_id = 0; device_id < device_in_this_iter; ++device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); nep_data[device_id].parameters.copy_from_host( parameters + device_id * para.number_of_variables); update_potential(para, nep_data[device_id].parameters.data(), annmb[device_id]); } for (int device_id = 0; device_id < device_in_this_iter; ++device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); const int block_size = 32; const int grid_size = (dataset[device_id].N - 1) / block_size + 1; @@ -894,7 +895,7 @@ void NEP3::find_force( nep_data[device_id].x12_angular.data(), nep_data[device_id].y12_angular.data(), nep_data[device_id].z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } find_descriptors_radial<<>>( @@ -908,7 +909,7 @@ void NEP3::find_force( nep_data[device_id].y12_radial.data(), nep_data[device_id].z12_radial.data(), nep_data[device_id].descriptors.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptors_angular<<>>( dataset[device_id].N, @@ -922,14 +923,14 @@ void NEP3::find_force( nep_data[device_id].z12_angular.data(), nep_data[device_id].descriptors.data(), nep_data[device_id].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (calculate_q_scaler) { find_max_min<<>>( dataset[device_id].N, nep_data[device_id].descriptors.data(), para.q_scaler_gpu[device_id].data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } zero_force<<>>( @@ -940,7 +941,7 @@ void NEP3::find_force( dataset[device_id].virial.data(), dataset[device_id].virial.data() + dataset[device_id].N, dataset[device_id].virial.data() + dataset[device_id].N * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (para.train_mode == 2) { apply_ann_pol<<>>( @@ -952,7 +953,7 @@ void NEP3::find_force( para.q_scaler_gpu[device_id].data(), dataset[device_id].virial.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (para.train_mode == 3) { apply_ann_temperature<<>>( dataset[device_id].N, @@ -964,7 +965,7 @@ void NEP3::find_force( dataset[device_id].temperature_ref_gpu.data(), dataset[device_id].energy.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { apply_ann<<>>( dataset[device_id].N, @@ -975,7 +976,7 @@ void NEP3::find_force( para.q_scaler_gpu[device_id].data(), dataset[device_id].energy.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } bool is_dipole = para.train_mode == 1; @@ -995,7 +996,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N, dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular<<>>( is_dipole, @@ -1014,7 +1015,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N, dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ 
-1032,7 +1033,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data(), dataset[device_id].energy.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/main_nep/parameters.cu b/src/main_nep/parameters.cu index c0e76a47a..b2e8ac22c 100644 --- a/src/main_nep/parameters.cu +++ b/src/main_nep/parameters.cu @@ -16,6 +16,7 @@ #include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -221,9 +222,9 @@ void Parameters::calculate_parameters() } int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); for (int device_id = 0; device_id < deviceCount; device_id++) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); q_scaler_gpu[device_id].resize(dim); q_scaler_gpu[device_id].copy_from_host(q_scaler_cpu.data()); } @@ -932,7 +933,7 @@ void Parameters::parse_population(const char** param, int num_param) } int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); int fully_used_device = population_size % deviceCount; int population_should_increase; if (fully_used_device != 0) { diff --git a/src/main_nep/snes.cu b/src/main_nep/snes.cu index 75a04df93..2915560a5 100644 --- a/src/main_nep/snes.cu +++ b/src/main_nep/snes.cu @@ -27,15 +27,16 @@ https://doi.org/10.1145/2001576.2001692 #include "parameters.cuh" #include "snes.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } @@ -61,7 +62,7 @@ SNES::SNES(Parameters& para, Fitness* fitness_function) type_of_variable.resize(number_of_variables, para.num_types); initialize_rng(); - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_type_of_variable.resize(number_of_variables); gpu_index.resize(population_size * (para.num_types + 1)); gpu_utility.resize(number_of_variables); @@ -73,7 +74,7 @@ SNES::SNES(Parameters& para, Fitness* fitness_function) gpu_population.resize(N); curand_states.resize(N); initialize_curand_states<<<(N - 1) / 128 + 1, 128>>>(curand_states.data(), N, 1234567); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_mu_and_sigma(para); calculate_utility(); @@ -106,7 +107,7 @@ void SNES::initialize_mu_and_sigma(Parameters& para) } fclose(fid_restart); } - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_mu.copy_from_host(mu.data()); gpu_sigma.copy_from_host(sigma.data()); } @@ -275,15 +276,15 @@ static __global__ void gpu_create_population( const int number_of_variables, const float* g_mu, const float* g_sigma, - curandState* g_state, + gpurandState* g_state, float* g_s, float* g_population) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { int v = n % number_of_variables; - curandState state = g_state[n]; - float s = curand_normal(&state); + gpurandState state = g_state[n]; + float s = gpurand_normal(&state); g_s[n] = s; g_population[n] = g_sigma[v] * s + g_mu[v]; g_state[n] = state; @@ -292,7 +293,7 @@ static __global__ void gpu_create_population( void SNES::create_population(Parameters& para) { - 
cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 const int N = population_size * number_of_variables; gpu_create_population<<<(N - 1) / 128 + 1, 128>>>( N, @@ -302,7 +303,7 @@ void SNES::create_population(Parameters& para) curand_states.data(), gpu_s.data(), gpu_population.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_population.copy_to_host(population.data()); } @@ -346,7 +347,7 @@ static __global__ void gpu_find_L1_L2_NEP4( void SNES::regularize_NEP4(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 for (int t = 0; t <= para.num_types; ++t) { float num_variables = float(para.number_of_variables) / para.num_types; @@ -362,7 +363,7 @@ gpu_population.data(), gpu_cost_L1reg.data(), gpu_cost_L2reg.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_cost_L1reg.copy_to_host(cost_L1reg.data()); gpu_cost_L2reg.copy_to_host(cost_L2reg.data()); @@ -414,10 +415,10 @@ static __global__ void gpu_find_L1_L2( void SNES::regularize(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_find_L1_L2<<>>( number_of_variables, gpu_population.data(), gpu_cost_L1reg.data(), gpu_cost_L2reg.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_cost_L1reg.copy_to_host(cost_L1reg.data()); gpu_cost_L2reg.copy_to_host(cost_L2reg.data()); @@ -495,7 +496,7 @@ static __global__ void gpu_update_mu_and_sigma( void SNES::update_mu_and_sigma(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_type_of_variable.copy_from_host(type_of_variable.data()); gpu_index.copy_from_host(index.data()); gpu_utility.copy_from_host(utility.data()); @@ -510,12 +511,12 @@ gpu_s.data(), gpu_mu.data(), gpu_sigma.data()); - CUDA_CHECK_KERNEL; + GPU_CHECK_KERNEL; } void SNES::output_mu_and_sigma(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_mu.copy_to_host(mu.data()); gpu_sigma.copy_to_host(sigma.data()); FILE* fid_restart = my_fopen("nep.restart", "w"); diff --git a/src/main_nep/snes.cuh b/src/main_nep/snes.cuh index 4f9352a98..4b4e7dd84 100644 --- a/src/main_nep/snes.cuh +++ b/src/main_nep/snes.cuh @@ -15,7 +15,12 @@ #pragma once #include "utilities/gpu_vector.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif #include #include class Fitness; @@ -43,7 +48,7 @@ protected: std::vector<float> cost_L2reg; std::vector<int> type_of_variable; - GPU_Vector<curandState> curand_states; + GPU_Vector<gpurandState> curand_states; GPU_Vector<int> gpu_type_of_variable; GPU_Vector<int> gpu_index; GPU_Vector<float> gpu_utility; diff --git a/src/main_nep/structure.cu b/src/main_nep/structure.cu index 47379cad6..423065ac6 100644 --- a/src/main_nep/structure.cu +++ b/src/main_nep/structure.cu @@ -16,6 +16,7 @@ #include "parameters.cuh" #include "structure.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/makefile.hip b/src/makefile.hip new file mode 100644 index 000000000..7fcfc6d37 --- /dev/null +++ b/src/makefile.hip @@ -0,0 +1,110 @@ +########################################################### +# Note: +# 1) You can modify gfx90a according to your GPU +# architecture. +# 2) Do not remove -DUSE_HIP. +# 3) Add -DUSE_PLUMED to CFLAGS when using the PLUMED plugin +# and remove it otherwise.
+# 4) Add -DUSE_TABLE to speed up MD simulations with NEP +# using pre-computed radial functions in the descriptors +########################################################### + + +########################################################### +# some flags +########################################################### +CC = hipcc +CFLAGS = -std=c++14 -O3 --offload-arch=gfx90a -DUSE_HIP +INC = -I./ +LDFLAGS = +LIBS = -lhipblas -lhipsolver + + +########################################################### +# source files +########################################################### +SOURCES_GPUMD = \ + $(wildcard main_gpumd/*.cu) \ + $(wildcard minimize/*.cu) \ + $(wildcard phonon/*.cu) \ + $(wildcard integrate/*.cu) \ + $(wildcard mc/*.cu) \ + $(wildcard force/*.cu) \ + $(wildcard measure/*.cu) \ + $(wildcard model/*.cu) \ + $(wildcard utilities/*.cu) +SOURCES_NEP = \ + $(wildcard main_nep/*.cu) \ + $(wildcard utilities/*.cu) + + +########################################################### +# object files +########################################################### +OBJ_GPUMD = $(SOURCES_GPUMD:.cu=.o) +OBJ_NEP = $(SOURCES_NEP:.cu=.o) + + +########################################################### +# headers +########################################################### +HEADERS = \ + $(wildcard utilities/*.cuh) \ + $(wildcard main_gpumd/*.cuh) \ + $(wildcard integrate/*.cuh) \ + $(wildcard mc/*.cuh) \ + $(wildcard minimize/*.cuh) \ + $(wildcard force/*.cuh) \ + $(wildcard measure/*.cuh) \ + $(wildcard model/*.cuh) \ + $(wildcard phonon/*.cuh) \ + $(wildcard main_nep/*.cuh) + + +########################################################### +# executables +########################################################### +all: gpumd nep +gpumd: $(OBJ_GPUMD) + $(CC) $(LDFLAGS) $^ -o $@ $(LIBS) + @echo ================================================= + @echo The gpumd executable is successfully compiled! + @echo ================================================= +nep: $(OBJ_NEP) + $(CC) $(LDFLAGS) $^ -o $@ $(LIBS) + @echo ================================================= + @echo The nep executable is successfully compiled! + @echo ================================================= + + +########################################################### +# rules for building object files +########################################################### +integrate/%.o: integrate/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +mc/%.o: mc/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +minimize/%.o: minimize/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +force/%.o: force/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +measure/%.o: measure/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +main_gpumd/%.o: main_gpumd/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +utilities/%.o: utilities/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +model/%.o: model/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +phonon/%.o: phonon/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +main_nep/%.o: main_nep/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ + + +########################################################### +# clean up +########################################################### +clean: + rm -f */*.o gpumd nep + diff --git a/src/mc/mc.cu b/src/mc/mc.cu index f078c2aef..8cf2746a6 100644 --- a/src/mc/mc.cu +++ b/src/mc/mc.cu @@ -22,6 +22,7 @@ The driver class for the various MC ensembles. 
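As a usage note (assumed, not part of the diff): from src/, running something like

    make -f makefile.hip -j

should build both the gpumd and nep executables against the HIP toolchain; adjust --offload-arch=gfx90a in CFLAGS to the target architecture (for instance gfx908 or gfx942), per note 1 in the header above.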
#include "mc_ensemble_sgc.cuh" #include "model/atom.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu index 7fc22271c..05def352f 100644 --- a/src/mc/mc_ensemble.cu +++ b/src/mc/mc_ensemble.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the MC_Ensemble classes. #include "mc_ensemble.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu index 6488c4d20..5a08b8c53 100644 --- a/src/mc/mc_ensemble_canonical.cu +++ b/src/mc/mc_ensemble_canonical.cu @@ -18,6 +18,7 @@ The canonical ensemble for MCMD. ------------------------------------------------------------------------------*/ #include "mc_ensemble_canonical.cuh" +#include "utilities/gpu_macro.cuh" MC_Ensemble_Canonical::MC_Ensemble_Canonical( const char** param, int num_param, int num_steps_mc_input) @@ -243,7 +244,7 @@ void MC_Ensemble_Canonical::compute( type_j = atom.cpu_type[j]; } - CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int))); + CHECK(gpuMemset(NN_ij.data(), 0, sizeof(int))); get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, box, @@ -255,7 +256,7 @@ void MC_Ensemble_Canonical::compute( atom.position_per_atom.data() + atom.number_of_atoms * 2, NN_ij.data(), NL_ij.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL int NN_ij_cpu; NN_ij.copy_to_host(&NN_ij_cpu); @@ -269,7 +270,7 @@ void MC_Ensemble_Canonical::compute( atom.type.data(), type_before.data(), type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_local_types<<<(NN_ij_cpu - 1) / 64 + 1, 64>>>( NN_ij_cpu, @@ -278,10 +279,10 @@ void MC_Ensemble_Canonical::compute( type_after.data(), local_type_before.data(), local_type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); - CHECK(cudaMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); + CHECK(gpuMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); + CHECK(gpuMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, NN_ij_cpu, @@ -306,7 +307,7 @@ void MC_Ensemble_Canonical::compute( x12_angular.data(), y12_angular.data(), z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL nep_energy.find_energy( NN_ij_cpu, diff --git a/src/mc/mc_ensemble_sgc.cu b/src/mc/mc_ensemble_sgc.cu index fa3e5f62a..d68edea39 100644 --- a/src/mc/mc_ensemble_sgc.cu +++ b/src/mc/mc_ensemble_sgc.cu @@ -27,6 +27,7 @@ integration across phase boundaries, Phys. Rev. B 86, 134204 (2012). 
------------------------------------------------------------------------------*/ #include "mc_ensemble_sgc.cuh" +#include "utilities/gpu_macro.cuh" #include const std::map MASS_TABLE{ @@ -359,7 +360,7 @@ void MC_Ensemble_SGC::compute( type_j = types[index_new_species]; } - CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int))); + CHECK(gpuMemset(NN_ij.data(), 0, sizeof(int))); get_neighbors_of_i<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, box, @@ -370,14 +371,14 @@ void MC_Ensemble_SGC::compute( atom.position_per_atom.data() + atom.number_of_atoms * 2, NN_ij.data(), NL_ij.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL int NN_ij_cpu; NN_ij.copy_to_host(&NN_ij_cpu); get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, i, type_j, atom.type.data(), type_before.data(), type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_local_types<<<(NN_ij_cpu - 1) / 64 + 1, 64>>>( NN_ij_cpu, @@ -386,10 +387,10 @@ void MC_Ensemble_SGC::compute( type_after.data(), local_type_before.data(), local_type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); - CHECK(cudaMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); + CHECK(gpuMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); + CHECK(gpuMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, NN_ij_cpu, @@ -414,7 +415,7 @@ void MC_Ensemble_SGC::compute( x12_angular.data(), y12_angular.data(), z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL nep_energy.find_energy( NN_ij_cpu, diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu index da0c9575c..34c579bf7 100644 --- a/src/mc/nep_energy.cu +++ b/src/mc/nep_energy.cu @@ -23,6 +23,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
#include "nep_energy.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -470,7 +471,7 @@ void NEP_Energy::find_energy( g_y12_angular, g_z12_angular, g_pe); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>( @@ -484,6 +485,6 @@ void NEP_Energy::find_energy( g_y12_angular, g_z12_angular, g_pe); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/measure/active.cu b/src/measure/active.cu index b50f57a08..90f0cf318 100644 --- a/src/measure/active.cu +++ b/src/measure/active.cu @@ -23,6 +23,7 @@ Run active learning on-the-fly during MD #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -193,7 +194,7 @@ void Active::process( // Reset mean vectors to zero initialize_mean_vectors<<<(3 * number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, mean_force_.data(), mean_force_sq_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Loop backwards over files to evaluate the main potential last, keeping it's properties intact for (int potential_index = number_of_potentials - 1; potential_index >= 0; potential_index--) { @@ -205,7 +206,7 @@ void Active::process( atom.force_per_atom.data() + number_of_atoms * 2, atom.potential_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute new potential properties force.potentials[potential_index]->compute( box, @@ -223,12 +224,12 @@ void Active::process( atom.force_per_atom.data(), atom.force_per_atom.data() + number_of_atoms, atom.force_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Sum mean and mean_sq on GPU, move sum to CPU compute_uncertainty<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, mean_force_.data(), mean_force_sq_.data(), gpu_uncertainty_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_uncertainty_.copy_to_host(cpu_uncertainty_.data()); double uncertainty = -1.0; for (int i = 0; i < number_of_atoms; i++) { diff --git a/src/measure/compute.cu b/src/measure/compute.cu index 999ab0e51..31745ad87 100644 --- a/src/measure/compute.cu +++ b/src/measure/compute.cu @@ -20,6 +20,7 @@ Compute block (space) averages of various per-atom quantities. 
#include "compute.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -186,7 +187,7 @@ static __global__ void find_group_sum_1( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -232,7 +233,7 @@ static __global__ void find_group_sum_3( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_fx[tid] += s_fx[tid + offset]; @@ -278,14 +279,14 @@ void Compute::process( velocity_per_atom.data() + N, velocity_per_atom.data() + 2 * N, gpu_per_atom_x.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_1<<>>( group[grouping_method].size.data(), group[grouping_method].size_sum.data(), group[grouping_method].contents.data(), gpu_per_atom_x.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng; } if (compute_potential) { @@ -295,7 +296,7 @@ void Compute::process( group[grouping_method].contents.data(), potential_per_atom.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng; } if (compute_force) { @@ -307,7 +308,7 @@ void Compute::process( force_per_atom.data() + N, force_per_atom.data() + 2 * N, gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_virial) { @@ -319,7 +320,7 @@ void Compute::process( virial_per_atom.data() + N, virial_per_atom.data() + N * 2, gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_jp) { @@ -344,7 +345,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -354,7 +355,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_jk) { @@ -368,7 +369,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -378,7 +379,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_momentum) { @@ -391,7 +392,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -401,7 +402,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } diff --git a/src/measure/compute_heat.cu b/src/measure/compute_heat.cu index f7103a40d..50af54d4a 100644 --- a/src/measure/compute_heat.cu +++ b/src/measure/compute_heat.cu @@ -26,6 +26,7 @@ https://doi.org/10.1103/PhysRevB.92.094301 #include "compute_heat.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -90,7 +91,7 @@ void compute_heat( heat_per_atom.data() + N * 2, heat_per_atom.data() + N * 3, heat_per_atom.data() + N * 4); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } namespace @@ -160,5 +161,5 @@ void compute_heat( heat_per_atom.data(), heat_per_atom.data() + N, heat_per_atom.data() + N * 2); - 
CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/measure/dos.cu b/src/measure/dos.cu index 910daee9b..f8e53cc0f 100644 --- a/src/measure/dos.cu +++ b/src/measure/dos.cu @@ -29,6 +29,7 @@ Reference for DOS: #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -225,7 +226,7 @@ void DOS::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU normalize_vac(); output_vac(); @@ -301,7 +302,7 @@ void DOS::copy_mass(const GPU_Vector& mass) const int offset = (group_id_ < 0) ? 0 : group_->cpu_size_sum[group_id_]; gpu_copy_mass<<<(num_atoms_ - 1) / 128 + 1, 128>>>( num_atoms_, group_->contents.data() + offset, mass.data(), mass_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -338,7 +339,7 @@ void DOS::copy_velocity(const int correlation_step, const GPU_Vector& ve } } } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void DOS::find_vac(const int correlation_step) @@ -378,7 +379,7 @@ void DOS::find_vac(const int correlation_step) vacy_.data(), vacz_.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void DOS::normalize_vac() diff --git a/src/measure/dump_beads.cu b/src/measure/dump_beads.cu index 35f020b0f..25d59a195 100644 --- a/src/measure/dump_beads.cu +++ b/src/measure/dump_beads.cu @@ -23,6 +23,7 @@ Dump bead data in PIMD-related run #include "utilities/common.cuh" #include "utilities/error.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" void Dump_Beads::parse(const char** param, int num_param) diff --git a/src/measure/dump_dipole.cu b/src/measure/dump_dipole.cu index a9f29f126..ee0d6282e 100644 --- a/src/measure/dump_dipole.cu +++ b/src/measure/dump_dipole.cu @@ -23,6 +23,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. 
#include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -51,7 +52,7 @@ static __global__ void sum_dipole( __syncthreads(); // aggregate the patches in parallel -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_d[tid] += s_d[tid + offset]; @@ -167,7 +168,7 @@ void Dump_Dipole::process( atom_copy.potential_per_atom.data(), atom_copy.virial_per_atom.data(), gpu_dipole_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute the dipole // Use the positions and types from the existing atoms object, @@ -188,7 +189,7 @@ void Dump_Dipole::process( number_of_atoms_per_thread, atom_copy.virial_per_atom.data(), gpu_dipole_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Transfer gpu_sum to the CPU gpu_dipole_.copy_to_host(cpu_dipole_.data()); diff --git a/src/measure/dump_exyz.cu b/src/measure/dump_exyz.cu index 1620866de..079120a2b 100644 --- a/src/measure/dump_exyz.cu +++ b/src/measure/dump_exyz.cu @@ -22,6 +22,7 @@ Dump some data to dump.xyz in the extended XYZ format #include "model/box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" diff --git a/src/measure/dump_force.cu b/src/measure/dump_force.cu index e238cd4a6..136c1a22d 100644 --- a/src/measure/dump_force.cu +++ b/src/measure/dump_force.cu @@ -22,6 +22,7 @@ Dump force data to a file at a given interval. #include "parse_utilities.cuh" #include "utilities/error.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -124,7 +125,7 @@ void Dump_Force::process( for (int d = 0; d < 3; ++d) { double* cpu_f = cpu_force_per_atom.data() + group_size * d; double* gpu_f = gpu_force_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_f, gpu_f, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_f, gpu_f, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } for (int n = 0; n < group_size; n++) { fprintf( diff --git a/src/measure/dump_netcdf.cu b/src/measure/dump_netcdf.cu index faddc4ab6..8f74d8674 100644 --- a/src/measure/dump_netcdf.cu +++ b/src/measure/dump_netcdf.cu @@ -36,6 +36,7 @@ http://ambermd.org/netcdf/nctraj.xhtml #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/dump_observer.cu b/src/measure/dump_observer.cu index cf5968f9e..49d49e85e 100644 --- a/src/measure/dump_observer.cu +++ b/src/measure/dump_observer.cu @@ -21,6 +21,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. 
#include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -182,7 +183,7 @@ void Dump_Observer::process( atom.force_per_atom.data() + number_of_atoms * 2, atom.potential_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute new potential properties force.potentials[potential_index]->compute( box, diff --git a/src/measure/dump_polarizability.cu b/src/measure/dump_polarizability.cu index a601b180b..89b8b46c0 100644 --- a/src/measure/dump_polarizability.cu +++ b/src/measure/dump_polarizability.cu @@ -22,6 +22,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -55,7 +56,7 @@ static __global__ void sum_polarizability( __syncthreads(); // aggregate the patches in parallel -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_p[tid] += s_p[tid + offset]; @@ -168,7 +169,7 @@ void Dump_Polarizability::process( atom_copy.potential_per_atom.data(), atom_copy.virial_per_atom.data(), gpu_pol_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute the dipole // Use the positions and types from the existing atoms object, @@ -187,7 +188,7 @@ void Dump_Polarizability::process( const int number_of_atoms_per_thread = (number_of_atoms - 1) / number_of_threads + 1; sum_polarizability<<<6, number_of_threads>>>( number_of_atoms, number_of_atoms_per_thread, atom_copy.virial_per_atom.data(), gpu_pol_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Transfer gpu_sum to the CPU gpu_pol_.copy_to_host(cpu_pol_.data()); diff --git a/src/measure/dump_position.cu b/src/measure/dump_position.cu index 177be6f22..40446e51a 100644 --- a/src/measure/dump_position.cu +++ b/src/measure/dump_position.cu @@ -22,6 +22,7 @@ Dump position data to movie.xyz. 
#include "model/group.cuh" #include "parse_utilities.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" @@ -169,7 +170,7 @@ void Dump_Position::process( for (int d = 0; d < 3; ++d) { double* cpu_data = cpu_position_per_atom.data() + num_atoms_total * d; double* gpu_data = gpu_position_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_data, gpu_data, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_data, gpu_data, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } fprintf(fid_, "%d\n", group_size); output_line2(box, cpu_atom_symbol); diff --git a/src/measure/dump_restart.cu b/src/measure/dump_restart.cu index 990edb0b3..e91d2531f 100644 --- a/src/measure/dump_restart.cu +++ b/src/measure/dump_restart.cu @@ -22,6 +22,7 @@ Dump a restart file #include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/dump_shock_nemd.cu b/src/measure/dump_shock_nemd.cu index d94110054..cdef34860 100644 --- a/src/measure/dump_shock_nemd.cu +++ b/src/measure/dump_shock_nemd.cu @@ -14,6 +14,7 @@ */ #include "dump_shock_nemd.cuh" +#include "utilities/gpu_macro.cuh" #include namespace diff --git a/src/measure/dump_shock_nemd.cuh b/src/measure/dump_shock_nemd.cuh index fee72bb19..5d235e089 100644 --- a/src/measure/dump_shock_nemd.cuh +++ b/src/measure/dump_shock_nemd.cuh @@ -20,7 +20,6 @@ #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" -#include #include #include diff --git a/src/measure/dump_thermo.cu b/src/measure/dump_thermo.cu index f7877dd56..00ead6de6 100644 --- a/src/measure/dump_thermo.cu +++ b/src/measure/dump_thermo.cu @@ -21,6 +21,7 @@ Dump thermo data to a file at a given interval. #include "model/box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" diff --git a/src/measure/dump_velocity.cu b/src/measure/dump_velocity.cu index dec3d69a4..faf1421bd 100644 --- a/src/measure/dump_velocity.cu +++ b/src/measure/dump_velocity.cu @@ -22,6 +22,7 @@ Dump velocity data to a file at a given interval. #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -129,7 +130,7 @@ void Dump_Velocity::process( for (int d = 0; d < 3; ++d) { double* cpu_v = cpu_velocity_per_atom.data() + num_atoms_total * d; double* gpu_v = gpu_velocity_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_v, gpu_v, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_v, gpu_v, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } for (int n = 0; n < group_size; n++) { fprintf( diff --git a/src/measure/hac.cu b/src/measure/hac.cu index b80634ed2..61bf3785e 100644 --- a/src/measure/hac.cu +++ b/src/measure/hac.cu @@ -20,6 +20,7 @@ Calculate the heat current autocorrelation (HAC) function. 
#include "compute_heat.cuh" #include "hac.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -56,7 +57,7 @@ gpu_sum_heat(const int N, const int Nd, const int nd, const double* g_heat, doub } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -88,7 +89,7 @@ void HAC::process( int nd = (step + 1) / sample_interval - 1; int Nd = number_of_steps / sample_interval; gpu_sum_heat<<>>(N, Nd, nd, heat_per_atom.data(), heat_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Calculate the Heat current Auto-Correlation function (HAC) @@ -129,7 +130,7 @@ __global__ void gpu_find_hac(const int Nc, const int Nd, const double* g_heat, d } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_hac_xi[tid] += s_hac_xi[tid + offset]; @@ -182,7 +183,7 @@ void HAC::postprocess( // Here, the block size is fixed to 128, which is a good choice gpu_find_hac<<>>(Nc, Nd, heat_all.data(), hac_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL hac_gpu.copy_to_host(hac_cpu.data()); diff --git a/src/measure/hnemd_kappa.cu b/src/measure/hnemd_kappa.cu index 5dffa5539..1cc09ede8 100644 --- a/src/measure/hnemd_kappa.cu +++ b/src/measure/hnemd_kappa.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). #include "hnemd_kappa.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -55,7 +56,7 @@ gpu_sum_heat(const int N, const int step, const double* g_heat, double* g_heat_s } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -86,7 +87,7 @@ void HNEMD::process( compute_heat(virial_per_atom, velocity_per_atom, heat_per_atom); gpu_sum_heat<<>>(N, step, heat_per_atom.data(), heat_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (output_flag) { const int num = NUM_OF_HEAT_COMPONENTS * output_interval; diff --git a/src/measure/hnemdec_kappa.cu b/src/measure/hnemdec_kappa.cu index a19f2618a..fd4d99497 100644 --- a/src/measure/hnemdec_kappa.cu +++ b/src/measure/hnemdec_kappa.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). 
#include "hnemdec_kappa.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -98,7 +99,7 @@ static __global__ void gpu_sum_heat_and_diffusion( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -122,7 +123,7 @@ static __global__ void gpu_sum_heat_and_diffusion( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -166,7 +167,7 @@ void HNEMDEC::process( heat_per_atom.data(), heat_all.data(), diffusion_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (output_flag) { const int heat_num = NUM_OF_HEAT_COMPONENTS * output_interval; diff --git a/src/measure/lsqt.cu b/src/measure/lsqt.cu index 17df92f6e..19ac73cd6 100644 --- a/src/measure/lsqt.cu +++ b/src/measure/lsqt.cu @@ -18,6 +18,7 @@ #include "model/atom.cuh" #include "model/box.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -322,37 +323,37 @@ void find_moments_chebyshev( double Em_inv = 1.0 / Em; double *s0r, *s1r, *s2r, *s0i, *s1i, *s2i, *moments_tmp; - cudaMalloc((void**)&s0r, sizeof(double) * N); - cudaMalloc((void**)&s1r, sizeof(double) * N); - cudaMalloc((void**)&s2r, sizeof(double) * N); - cudaMalloc((void**)&s0i, sizeof(double) * N); - cudaMalloc((void**)&s1i, sizeof(double) * N); - cudaMalloc((void**)&s2i, sizeof(double) * N); - cudaMalloc((void**)&moments_tmp, memory_moments_tmp); + gpuMalloc((void**)&s0r, sizeof(double) * N); + gpuMalloc((void**)&s1r, sizeof(double) * N); + gpuMalloc((void**)&s2r, sizeof(double) * N); + gpuMalloc((void**)&s0i, sizeof(double) * N); + gpuMalloc((void**)&s1i, sizeof(double) * N); + gpuMalloc((void**)&s2i, sizeof(double) * N); + gpuMalloc((void**)&moments_tmp, memory_moments_tmp); // T_0(H) gpu_copy_state<<>>(N, srr, sri, s0r, s0i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s0r, s0i, slr, sli, moments_tmp, 0 * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_1(H) gpu_apply_hamiltonian<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s1r, s1i, slr, sli, moments_tmp, 1 * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_m(H) (m >= 2) for (int m = 2; m < Nm; ++m) { gpu_kernel_polynomial<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i, s2r, s2i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s2r, s2i, slr, sli, moments_tmp, m * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // permute the pointers; do not need to copy the data double* temp_real; double* temp_imag; @@ -368,15 +369,15 @@ void find_moments_chebyshev( gpu_find_inner_product_2<<>>( number_of_blocks, number_of_patches, moments_tmp, moments); - CUDA_CHECK_KERNEL - - cudaFree(s0r); - cudaFree(s0i); - cudaFree(s1r); - cudaFree(s1i); - cudaFree(s2r); - cudaFree(s2i); - cudaFree(moments_tmp); + GPU_CHECK_KERNEL + + gpuFree(s0r); + gpuFree(s0i); + gpuFree(s1r); + gpuFree(s1i); + gpuFree(s2r); + gpuFree(s2i); + gpuFree(moments_tmp); } // Jackson damping @@ -435,28 +436,28 @@ void evolve( double* s0i; double* s1i; double* s2i; - cudaMalloc((void**)&s0r, sizeof(double) * N); - cudaMalloc((void**)&s0i, sizeof(double) * N); - cudaMalloc((void**)&s1r, sizeof(double) * N); - cudaMalloc((void**)&s1i, 
sizeof(double) * N); - cudaMalloc((void**)&s2r, sizeof(double) * N); - cudaMalloc((void**)&s2i, sizeof(double) * N); + gpuMalloc((void**)&s0r, sizeof(double) * N); + gpuMalloc((void**)&s0i, sizeof(double) * N); + gpuMalloc((void**)&s1r, sizeof(double) * N); + gpuMalloc((void**)&s1i, sizeof(double) * N); + gpuMalloc((void**)&s2r, sizeof(double) * N); + gpuMalloc((void**)&s2i, sizeof(double) * N); // T_0(H) |psi> = |psi> gpu_copy_state<<>>(N, sr, si, s0r, s0i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_1(H) |psi> = H |psi> gpu_apply_hamiltonian<<>>( N, Em_inv, NN, NL, U, Hr, Hi, sr, si, s1r, s1i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // |final_state> = c_0 * T_0(H) |psi> + c_1 * T_1(H) |psi> double bessel_0 = j0(time_step_scaled); double bessel_1 = 2.0 * j1(time_step_scaled); gpu_chebyshev_01<<>>( N, s0r, s0i, s1r, s1i, sr, si, bessel_0, bessel_1, direction); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL for (int m = 2; m < 1000000; ++m) { double bessel_m = jn(m, time_step_scaled); @@ -477,7 +478,7 @@ void evolve( } gpu_chebyshev_2<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i, s2r, s2i, sr, si, bessel_m, label); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // permute the pointers; do not need to copy the data double *temp_real, *temp_imag; @@ -490,12 +491,12 @@ void evolve( s2r = temp_real; s2i = temp_imag; } - cudaFree(s0r); - cudaFree(s0i); - cudaFree(s1r); - cudaFree(s1i); - cudaFree(s2r); - cudaFree(s2i); + gpuFree(s0r); + gpuFree(s0i); + gpuFree(s1r); + gpuFree(s1i); + gpuFree(s2r); + gpuFree(s2i); } #ifdef USE_GRAPHENE_TB @@ -756,7 +757,7 @@ void LSQT::process(Atom& atom, Box& box, const int step) Hr.data(), Hi.data(), xx.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_dos_and_velocity(atom, box); find_sigma(atom, box, step); @@ -854,7 +855,7 @@ void LSQT::find_sigma(Atom& atom, Box& box, const int step) sli.data(), srr.data(), sri.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { evolve( number_of_orbitals, @@ -894,7 +895,7 @@ void LSQT::find_sigma(Atom& atom, Box& box, const int step) sli.data(), scr.data(), sci.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL std::vector vac(number_of_energy_points); diff --git a/src/measure/measure.cu b/src/measure/measure.cu index f1057e35e..9441c5c92 100644 --- a/src/measure/measure.cu +++ b/src/measure/measure.cu @@ -20,6 +20,7 @@ The driver class dealing with measurement. 
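The "permute the pointers" blocks in the lsqt.cu hunks above are the usual three-buffer Chebyshev recursion: each iteration builds T_m from T_{m-1} and T_{m-2}, and instead of copying state vectors, the scratch buffers rotate roles. Spelled out with the same s0/s1/s2 names as the code:

// After computing T_m into (s2r, s2i) from T_{m-1} in (s1r, s1i) and
// T_{m-2} in (s0r, s0i), rotate the pointers so the next iteration
// overwrites the oldest buffer. No device memory moves.
double* temp_real = s0r;
double* temp_imag = s0i;
s0r = s1r; s0i = s1i; // T_{m-1} becomes the new T_{m-2}
s1r = s2r; s1i = s2i; // T_m becomes the new T_{m-1}
s2r = temp_real;      // the old T_{m-2} storage will receive T_{m+1}
s2i = temp_imag;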
#include "measure.cuh" #include "model/atom.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #define NUM_OF_HEAT_COMPONENTS 5 diff --git a/src/measure/modal_analysis.cu b/src/measure/modal_analysis.cu index 7c0b51b57..ba79f3490 100644 --- a/src/measure/modal_analysis.cu +++ b/src/measure/modal_analysis.cu @@ -31,6 +31,7 @@ GPUMD Contributing author: Alexander Gabourie (Stanford University) #include "modal_analysis.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #define NUM_OF_HEAT_COMPONENTS 5 @@ -89,7 +90,7 @@ static __device__ void gpu_bin_reduce( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data_xin[tid] += s_data_xin[tid + offset]; @@ -252,7 +253,7 @@ void MODAL_ANALYSIS::compute_heat( mvx.data(), mvy.data(), mvz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Scale stress tensor by inv(sqrt(mass)) prepare_sm<<>>( @@ -271,16 +272,16 @@ void MODAL_ANALYSIS::compute_heat( smx.data(), smy.data(), smz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const float alpha = 1.0; const float beta = 0.0; int stride = 1; // Calculate modal velocities - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -291,9 +292,9 @@ void MODAL_ANALYSIS::compute_heat( &beta, xdotx.data(), stride); - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -304,9 +305,9 @@ void MODAL_ANALYSIS::compute_heat( &beta, xdoty.data(), stride); - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -320,10 +321,10 @@ void MODAL_ANALYSIS::compute_heat( // Calculate intermediate value // (i.e. 
heat current without modal velocities) - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -335,10 +336,10 @@ void MODAL_ANALYSIS::compute_heat( &beta, jmx.data(), num_modes); - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -350,10 +351,10 @@ void MODAL_ANALYSIS::compute_heat( &beta, jmy.data(), num_modes); - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -367,9 +368,9 @@ void MODAL_ANALYSIS::compute_heat( num_modes); // calculate modal heat current - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmx.data(), @@ -378,9 +379,9 @@ void MODAL_ANALYSIS::compute_heat( stride, jmx.data(), num_modes); - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmy.data(), @@ -389,9 +390,9 @@ void MODAL_ANALYSIS::compute_heat( stride, jmy.data(), num_modes); - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmz.data(), @@ -410,7 +411,7 @@ void MODAL_ANALYSIS::compute_heat( gpu_update_jm <<>>(num_modes, jmx.data(), jmy.data(), jmz.data(), jm.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void MODAL_ANALYSIS::setN(const std::vector& cpu_type_size) @@ -543,9 +544,9 @@ void MODAL_ANALYSIS::preprocess( rsqrtmass.resize(num_participating, Memory_Type::managed); gpu_set_mass_terms<<<(num_participating - 1) / BLOCK_SIZE + 1, BLOCK_SIZE>>>( num_participating, N1, mass.data(), sqrtmass.data(), rsqrtmass.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - cublasCreate(&ma_handle); + gpublasCreate(&ma_handle); } void MODAL_ANALYSIS::process( @@ -568,18 +569,18 @@ void MODAL_ANALYSIS::process( gpu_bin_modes<<>>( num_modes, bin_count.data(), bin_sum.data(), num_bins, jm.data(), bin_out.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (method == HNEMA_METHOD) { float factor = KAPPA_UNIT_CONVERSION / (volume * temperature * fe * (float)samples_per_output); int num_bins_stored = num_bins * NUM_OF_HEAT_COMPONENTS; gpu_scale_jm<<<(num_bins_stored - 1) / BLOCK_SIZE + 1, BLOCK_SIZE>>>( num_bins_stored, factor, bin_out.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Compute thermal conductivity and output - cudaDeviceSynchronize(); // ensure GPU ready to move data to CPU + gpuDeviceSynchronize(); // ensure GPU ready to move data to CPU FILE* fid = fopen(output_file_position, "a"); for (int i = 0; i < num_bins; i++) { fprintf( @@ -597,7 +598,7 @@ void MODAL_ANALYSIS::process( if (method == HNEMA_METHOD) { int grid_size = (num_heat_stored - 1) / BLOCK_SIZE + 1; gpu_reset_data<<>>(num_heat_stored, jm.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -605,5 +606,5 @@ void MODAL_ANALYSIS::postprocess() { if (!compute) return; - cublasDestroy(ma_handle); + gpublasDestroy(ma_handle); } diff --git a/src/measure/modal_analysis.cuh b/src/measure/modal_analysis.cuh index db52f8ebd..d05dc4ba6 100644 --- a/src/measure/modal_analysis.cuh +++ b/src/measure/modal_analysis.cuh @@ -21,7 +21,11 @@ GPUMD Contributing author: Alexander Gabourie (Stanford University) #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" -#include +#ifdef USE_HIP + #include +#else + #include +#endif #include #include #include @@ -97,7 +101,7 @@ private: int num_participating; // Number of particles participating int num_heat_stored; 
// Number of stored heat current elements - cublasHandle_t ma_handle; + gpublasHandle_t ma_handle; // stress by by square root mass (intermediate term) GPU_Vector smx; diff --git a/src/measure/msd.cu b/src/measure/msd.cu index b62e0e018..0eb4d7755 100644 --- a/src/measure/msd.cu +++ b/src/measure/msd.cu @@ -24,6 +24,7 @@ Calculate: #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -184,7 +185,7 @@ void MSD::process(const int step, const std::vector& groups, const GPU_Ve y_.data() + step_offset, z_.data() + step_offset); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // start to calculate the MSD when we have enough frames if (sample_step >= num_correlation_steps_ - 1) { @@ -202,7 +203,7 @@ void MSD::process(const int step, const std::vector& groups, const GPU_Ve msdx_.data(), msdy_.data(), msdz_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -211,7 +212,7 @@ void MSD::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU // normalize by the number of atoms and number of time origins const double msd_scaler = 1.0 / ((double)num_atoms_ * (double)num_time_origins_); diff --git a/src/measure/parse_utilities.cu b/src/measure/parse_utilities.cu index 8bd34956d..2af131525 100644 --- a/src/measure/parse_utilities.cu +++ b/src/measure/parse_utilities.cu @@ -19,6 +19,7 @@ A function parsing the "group" option in some keywords #include "model/group.cuh" #include "parse_utilities.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/plumed.cu b/src/measure/plumed.cu index d5e150f2b..e195338ce 100644 --- a/src/measure/plumed.cu +++ b/src/measure/plumed.cu @@ -22,6 +22,7 @@ Interface to the PLUMED plugin: https://www.plumed.org #include "plumed.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" @@ -183,7 +184,7 @@ void PLUMED::process( } gpu_sum<<<6, 1024>>>(n_atom, virial.data(), gpu_v_vector.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_v_vector.copy_to_host(tmp.data()); fill(cpu_v_vector.begin(), cpu_v_vector.end(), 0.0); @@ -223,7 +224,7 @@ void PLUMED::process( virial.data() + n_atom * 3, virial.data() + n_atom * 4, virial.data() + n_atom * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void PLUMED::postprocess(void) diff --git a/src/measure/rdf.cu b/src/measure/rdf.cu index d0392e707..295453ecd 100644 --- a/src/measure/rdf.cu +++ b/src/measure/rdf.cu @@ -26,6 +26,7 @@ Calculate: #include "rdf.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -315,7 +316,7 @@ void RDF::find_rdf( rdf_g_ind, rdf_bins_, r_step_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_find_rdf_ON1<<>>( @@ -342,7 +343,7 @@ void RDF::find_rdf( rdf_g_ind, rdf_bins_, r_step_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -499,12 +500,12 @@ void RDF::postprocess(const bool is_pimd, const int number_of_beads) if (is_pimd) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( rdf_.data(), rdf_g_.data(), sizeof(double) * number_of_beads * num_atoms_ * rdf_bins_, - cudaMemcpyDeviceToHost)); - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + gpuMemcpyDeviceToHost)); + CHECK(gpuDeviceSynchronize()); // needed for 
pre-Pascal GPU std::vector rdf_average(number_of_beads * rdf_atom_count * rdf_bins_, 0.0); for (int k = 0; k < number_of_beads; k++) { @@ -559,9 +560,9 @@ void RDF::postprocess(const bool is_pimd, const int number_of_beads) } else { - CHECK(cudaMemcpy( - rdf_.data(), rdf_g_.data(), sizeof(double) * num_atoms_ * rdf_bins_, cudaMemcpyDeviceToHost)); - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuMemcpy( + rdf_.data(), rdf_g_.data(), sizeof(double) * num_atoms_ * rdf_bins_, gpuMemcpyDeviceToHost)); + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU std::vector rdf_average(rdf_atom_count * rdf_bins_, 0.0); for (int a = 0; a < rdf_atom_count; a++) { diff --git a/src/measure/sdc.cu b/src/measure/sdc.cu index 66e191945..fe047a59e 100644 --- a/src/measure/sdc.cu +++ b/src/measure/sdc.cu @@ -24,6 +24,7 @@ Calculate: #include "sdc.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -182,7 +183,7 @@ void SDC::process( vy_.data() + step_offset, vz_.data() + step_offset); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // start to calculate the VAC when we have enough frames if (sample_step >= num_correlation_steps_ - 1) { @@ -200,7 +201,7 @@ void SDC::process( vacx_.data(), vacy_.data(), vacz_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -209,7 +210,7 @@ void SDC::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU // normalize by the number of atoms and number of time origins const double vac_scaler = 1.0 / ((double)num_atoms_ * (double)num_time_origins_); diff --git a/src/measure/shc.cu b/src/measure/shc.cu index 52d0d6e87..1710121f9 100644 --- a/src/measure/shc.cu +++ b/src/measure/shc.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). 
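Stepping back to the modal_analysis.cu hunks above before the shc.cu changes continue: every cuBLAS entry point now goes through a gpublas* alias, so the call sites change only in spelling and resolve to hipBLAS under USE_HIP. The handle lifecycle the class follows, with an illustrative single-precision matrix-vector product (d_A, d_x, d_y, m, n are placeholders, not names from the diff):

gpublasHandle_t handle;
gpublasCreate(&handle); // done once, in preprocess()

const float alpha = 1.0f;
const float beta = 0.0f;
// y = alpha * A * x + beta * y, with A column-major, m rows and n columns
gpublasSgemv(handle, GPUBLAS_OP_N, m, n, &alpha, d_A, m, d_x, 1, &beta, d_y, 1);

gpublasDestroy(handle); // done once, in postprocess()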
#include "shc.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -178,12 +179,12 @@ void SHC::process( const double* vz_tmp = velocity_per_atom.data() + N * 2; if (-1 == group_method) { - CHECK(cudaMemcpy(sx.data() + offset, sx_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(sy.data() + offset, sy_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(sz.data() + offset, sz_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vx.data() + offset, vx_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vy.data() + offset, vy_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vz.data() + offset, vz_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sx.data() + offset, sx_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sy.data() + offset, sy_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sz.data() + offset, sz_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vx.data() + offset, vx_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vy.data() + offset, vy_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vz.data() + offset, vz_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); } else { if (group_id == -1) { for (int n = 1; n < group_num; ++n) { @@ -207,7 +208,7 @@ void SHC::process( vx_tmp, vy_tmp, vz_tmp); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } else { gpu_copy_data<<<(group_size - 1) / BLOCK_SIZE_SHC + 1, BLOCK_SIZE_SHC>>>( @@ -227,7 +228,7 @@ void SHC::process( vy_tmp, vz_tmp); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } if (sample_step >= Nc - 1) { @@ -248,7 +249,7 @@ void SHC::process( vz.data() + offset_s, ki_negative.data() + Nc * n, ko_negative.data() + Nc * n); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_k<<>>( group[group_method].cpu_size[n], @@ -261,7 +262,7 @@ void SHC::process( sz.data() + offset_s, ki_positive.data() + Nc * n, ko_positive.data() + Nc * n); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } else { gpu_find_k<<>>( @@ -275,7 +276,7 @@ void SHC::process( vz.data(), ki_negative.data(), ko_negative.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_k<<>>( group_size, @@ -288,7 +289,7 @@ void SHC::process( sz.data(), ki_positive.data(), ko_positive.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/measure/viscosity.cu b/src/measure/viscosity.cu index ce13758c6..a64a8283d 100644 --- a/src/measure/viscosity.cu +++ b/src/measure/viscosity.cu @@ -18,6 +18,7 @@ Calculate the stress autocorrelation function and viscosity. 
------------------------------------------------------------------------------*/ #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include "viscosity.cuh" #include @@ -92,7 +93,7 @@ void Viscosity::process( int Nd = number_of_steps / sample_interval; gpu_sum_stress<<>>( N, Nd, nd, mass.data(), velocity.data(), virial.data(), stress_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_correct_stress(const int Nd, double* g_stress_all) @@ -196,9 +197,9 @@ void Viscosity::postprocess( std::vector correlation_cpu(Nc * NUM_OF_COMPONENTS); gpu_correct_stress<<>>(Nd, stress_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_correlation<<>>(Nc, Nd, stress_all.data(), correlation_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL correlation_gpu.copy_to_host(correlation_cpu.data()); diff --git a/src/minimize/minimize.cu b/src/minimize/minimize.cu index c41db2a4a..4d2bc3205 100644 --- a/src/minimize/minimize.cu +++ b/src/minimize/minimize.cu @@ -22,6 +22,7 @@ The driver class for minimizers. #include "minimizer_fire.cuh" #include "minimizer_sd.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include diff --git a/src/minimize/minimizer.cu b/src/minimize/minimizer.cu index 1e48e589e..304404c9d 100644 --- a/src/minimize/minimizer.cu +++ b/src/minimize/minimizer.cu @@ -18,6 +18,7 @@ The abstract base class (ABC) for the minimizer classes. ------------------------------------------------------------------------------*/ #include "minimizer.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/minimize/minimizer_fire.cu b/src/minimize/minimizer_fire.cu index d695c5a29..d53087d3f 100644 --- a/src/minimize/minimizer_fire.cu +++ b/src/minimize/minimizer_fire.cu @@ -20,6 +20,7 @@ Reference: PhysRevLett 97, 170201 (2006) ------------------------------------------------------------------------------*/ #include "minimizer_fire.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/minimize/minimizer_sd.cu b/src/minimize/minimizer_sd.cu index 6d61a3bb7..968a243d7 100644 --- a/src/minimize/minimizer_sd.cu +++ b/src/minimize/minimizer_sd.cu @@ -19,6 +19,7 @@ The SD (steepest decent) minimizer. #include "force/force.cuh" #include "minimizer_sd.cuh" +#include "utilities/gpu_macro.cuh" const double decreasing_factor = 0.2; const double increasing_factor = 1.2; diff --git a/src/model/atom.cu b/src/model/atom.cu index 450b9aff2..783aa68fb 100644 --- a/src/model/atom.cu +++ b/src/model/atom.cu @@ -19,6 +19,7 @@ The class defining the simulation box. #include "atom.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include int Atom::number_of_type(std::string& symbol) diff --git a/src/model/box.cu b/src/model/box.cu index cf10997db..4cbecd36e 100644 --- a/src/model/box.cu +++ b/src/model/box.cu @@ -19,6 +19,7 @@ The class defining the simulation box. 
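The cusolver_wrapper.cu changes a little further below are the densest part of the renaming, so the shared shape of those routines is worth stating up front: each one follows the same query-then-solve pattern, now spelled with gpusolver* aliases that resolve to hipSOLVER under USE_HIP. A sketch along the lines of the Hermitian QR path in that file (error checking omitted there as well; N, A, W as in the wrapper):

gpusolverDnHandle_t handle = NULL;
gpusolverDnCreate(&handle);
gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; // eigenvalues only
gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER;

int lwork = 0; // step 1: query the workspace size
gpusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork);
GPU_Vector<gpuDoubleComplex> work(lwork);

GPU_Vector<int> info(1); // step 2: solve; a nonzero info signals failure
gpusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data());

gpusolverDnDestroy(handle);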
#include "box.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include static float get_area_one_direction(const double* a, const double* b) diff --git a/src/model/group.cu b/src/model/group.cu index d05ac1731..78f4746eb 100644 --- a/src/model/group.cu +++ b/src/model/group.cu @@ -18,6 +18,7 @@ The class defining the grouping methods ------------------------------------------------------------------------------*/ #include "group.cuh" +#include "utilities/gpu_macro.cuh" #include void Group::find_size(const int N, const int k) diff --git a/src/model/read_xyz.cu b/src/model/read_xyz.cu index 98f59d390..d53d2cea7 100644 --- a/src/model/read_xyz.cu +++ b/src/model/read_xyz.cu @@ -23,6 +23,7 @@ The class defining the simulation model. #include "read_xyz.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/phonon/hessian.cu b/src/phonon/hessian.cu index d16c175f3..20f085bc2 100644 --- a/src/phonon/hessian.cu +++ b/src/phonon/hessian.cu @@ -25,6 +25,7 @@ Then calculate the dynamical matrices with different k points. #include "utilities/common.cuh" #include "utilities/cusolver_wrapper.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/utilities/cusolver_wrapper.cu b/src/utilities/cusolver_wrapper.cu index 23358fec4..df8ac56cc 100644 --- a/src/utilities/cusolver_wrapper.cu +++ b/src/utilities/cusolver_wrapper.cu @@ -19,16 +19,21 @@ Some wrappers for the cuSOLVER library #include "cusolver_wrapper.cuh" #include "error.cuh" +#include "gpu_macro.cuh" #include "gpu_vector.cuh" -#include +#ifdef USE_HIP + #include +#else + #include +#endif #include void eig_hermitian_QR(size_t N, double* AR, double* AI, double* W_cpu) { // get A size_t N2 = N * N; - GPU_Vector A(N2); - std::vector A_cpu(N2); + GPU_Vector A(N2); + std::vector A_cpu(N2); for (size_t n = 0; n < N2; ++n) { A_cpu[n].x = AR[n]; @@ -40,31 +45,31 @@ void eig_hermitian_QR(size_t N, double* AR, double* AI, double* W_cpu) GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // get work int lwork = 0; - cusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork); - GPU_Vector work(lwork); + gpusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork); + GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data()); + gpusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data()); W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); + gpusolverDnDestroy(handle); } void eig_hermitian_Jacobi(size_t N, double* AR, double* AI, double* W_cpu) { // get A size_t N2 = N * N; - GPU_Vector A(N2); - std::vector A_cpu(N2); + GPU_Vector A(N2); + std::vector A_cpu(N2); for (size_t n = 0; n < N2; ++n) { A_cpu[n].x = AR[n]; A_cpu[n].y = AI[n]; @@ -75,29 +80,29 @@ void eig_hermitian_Jacobi(size_t N, double* AR, double* AI, double* W_cpu) GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = 
CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnZheevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); - GPU_Vector work(lwork); + gpusolverDnZheevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); + GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnZheevj( + gpusolverDnZheevj( handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data(), para); W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } void eigenvectors_symmetric_Jacobi(size_t N, double* A_cpu, double* W_cpu, double* eigenvectors_cpu) @@ -111,38 +116,38 @@ void eigenvectors_symmetric_Jacobi(size_t N, double* A_cpu, double* W_cpu, doubl GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_VECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnDsyevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); + gpusolverDnDsyevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnDsyevj( + gpusolverDnDsyevj( handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data(), para); W.copy_to_host(W_cpu); A.copy_to_host(eigenvectors_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* AI, double* W_cpu) { // get A size_t M = N * N * batch_size; - GPU_Vector A(M); - std::vector A_cpu(M); + GPU_Vector A(M); + std::vector A_cpu(M); for (size_t n = 0; n < M; ++n) { A_cpu[n].x = AR[n]; A_cpu[n].y = AI[n]; @@ -153,24 +158,24 @@ void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* GPU_Vector W(N * batch_size); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnZheevjBatched_bufferSize( + gpusolverDnZheevjBatched_bufferSize( handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para, batch_size); - GPU_Vector work(lwork); + GPU_Vector work(lwork); // get W 
GPU_Vector info(batch_size); - cusolverDnZheevjBatched( + gpusolverDnZheevjBatched( handle, jobz, uplo, @@ -186,6 +191,6 @@ void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } diff --git a/src/utilities/error.cuh b/src/utilities/error.cuh index f82f4297e..ea2fd396e 100644 --- a/src/utilities/error.cuh +++ b/src/utilities/error.cuh @@ -14,6 +14,7 @@ */ #pragma once +#include "gpu_macro.cuh" #include #include #include @@ -21,13 +22,13 @@ #define CHECK(call) \ do { \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) { \ + const gpuError_t error_code = call; \ + if (error_code != gpuSuccess) { \ fprintf(stderr, "CUDA Error:\n"); \ fprintf(stderr, " File: %s\n", __FILE__); \ fprintf(stderr, " Line: %d\n", __LINE__); \ fprintf(stderr, " Error code: %d\n", error_code); \ - fprintf(stderr, " Error text: %s\n", cudaGetErrorString(error_code)); \ + fprintf(stderr, " Error text: %s\n", gpuGetErrorString(error_code)); \ exit(1); \ } \ } while (0) @@ -62,15 +63,15 @@ } while (0) #ifdef STRONG_DEBUG -#define CUDA_CHECK_KERNEL \ +#define GPU_CHECK_KERNEL \ { \ - CHECK(cudaGetLastError()); \ - CHECK(cudaDeviceSynchronize()); \ + CHECK(gpuGetLastError()); \ + CHECK(gpuDeviceSynchronize()); \ } #else -#define CUDA_CHECK_KERNEL \ +#define GPU_CHECK_KERNEL \ { \ - CHECK(cudaGetLastError()); \ + CHECK(gpuGetLastError()); \ } #endif diff --git a/src/utilities/gpu_macro.cuh b/src/utilities/gpu_macro.cuh new file mode 100644 index 000000000..75a5cbd3a --- /dev/null +++ b/src/utilities/gpu_macro.cuh @@ -0,0 +1,168 @@ +/* + Copyright 2017 Zheyong Fan and GPUMD development team + This file is part of GPUMD. + GPUMD is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + GPUMD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with GPUMD. If not, see . 
+*/ + +#pragma once + +#ifdef USE_HIP // HIP for AMD card + +#include + +// memory manipulation +#define gpuMalloc hipMalloc +#define gpuMallocManaged hipMallocManaged +#define gpuFree hipFree +#define gpuMemcpy hipMemcpy +#define gpuMemcpyFromSymbol hipMemcpyFromSymbol +#define gpuMemcpyToSymbol hipMemcpyToSymbol +#define gpuGetSymbolAddress hipGetSymbolAddress +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToHost hipMemcpyHostToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemset hipMemset + +// error handling +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuGetErrorString hipGetErrorString +#define gpuGetLastError hipGetLastError + +// device manipulation +#define gpuSetDevice hipSetDevice +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuDeviceProp hipDeviceProp_t +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuDeviceCanAccessPeer hipDeviceCanAccessPeer +#define gpuDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define gpuDeviceSynchronize hipDeviceSynchronize + +// stream +#define gpuStream_t hipStream_t +#define gpuStreamCreate hipStreamCreate +#define gpuStreamDestroy hipStreamDestroy + +// random numbers +#define gpurandState hiprandState +#define gpurand_normal_double hiprand_normal_double +#define gpurand_normal hiprand_normal +#define gpurand_init hiprand_init + +// blas +#define gpublasHandle_t hipblasHandle_t +#define gpublasSgemv hipblasSgemv +#define gpublasSgemm hipblasSgemm +#define gpublasSdgmm hipblasSdgmm +#define gpublasDestroy hipblasDestroy +#define gpublasCreate hipblasCreate +#define GPUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define GPUBLAS_OP_N HIPBLAS_OP_N + +// lapack +#define gpuDoubleComplex hipDoubleComplex +#define gpusolverDnHandle_t hipsolverDnHandle_t +#define gpusolverDnCreate hipsolverDnCreate +#define gpusolverDnDestroy hipsolverDnDestroy +#define gpusolverEigMode_t hipsolverEigMode_t +#define gpusolverFillMode_t hipsolverFillMode_t +#define GPUSOLVER_EIG_MODE_NOVECTOR HIPSOLVER_EIG_MODE_NOVECTOR +#define GPUSOLVER_EIG_MODE_VECTOR HIPSOLVER_EIG_MODE_VECTOR +#define GPUSOLVER_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#define gpusolverSyevjInfo_t hipsolverSyevjInfo_t +#define gpusolverDnCreateSyevjInfo hipsolverDnCreateSyevjInfo +#define gpusolverDnDestroySyevjInfo hipsolverDnDestroySyevjInfo +#define gpusolverDnZheevj_bufferSize hipsolverDnZheevj_bufferSize +#define gpusolverDnZheevj hipsolverDnZheevj +#define gpusolverDnZheevd_bufferSize hipsolverDnZheevd_bufferSize +#define gpusolverDnZheevd hipsolverDnZheevd +#define gpusolverDnDsyevj_bufferSize hipsolverDnDsyevj_bufferSize +#define gpusolverDnDsyevj hipsolverDnDsyevj +#define gpusolverDnZheevjBatched_bufferSize hipsolverDnZheevjBatched_bufferSize +#define gpusolverDnZheevjBatched hipsolverDnZheevjBatched + +#else // CUDA for Nvidia card + +// memory manipulation +#define gpuMalloc cudaMalloc +#define gpuMallocManaged cudaMallocManaged +#define gpuFree cudaFree +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyFromSymbol cudaMemcpyFromSymbol +#define gpuMemcpyToSymbol cudaMemcpyToSymbol +#define gpuGetSymbolAddress cudaGetSymbolAddress +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToHost cudaMemcpyHostToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemset cudaMemset + +// error handling +#define gpuError_t cudaError_t +#define 
gpuSuccess cudaSuccess +#define gpuGetErrorString cudaGetErrorString +#define gpuGetLastError cudaGetLastError + +// device manipulation +#define gpuSetDevice cudaSetDevice +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuDeviceProp cudaDeviceProp +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuDeviceCanAccessPeer cudaDeviceCanAccessPeer +#define gpuDeviceEnablePeerAccess cudaDeviceEnablePeerAccess +#define gpuDeviceSynchronize cudaDeviceSynchronize + +// stream +#define gpuStream_t cudaStream_t +#define gpuStreamCreate cudaStreamCreate +#define gpuStreamDestroy cudaStreamDestroy + +// random numbers +#define gpurandState curandState +#define gpurand_normal_double curand_normal_double +#define gpurand_normal curand_normal +#define gpurand_init curand_init + +// blas +#define gpublasHandle_t cublasHandle_t +#define gpublasSgemv cublasSgemv +#define gpublasSgemm cublasSgemm +#define gpublasSdgmm cublasSdgmm +#define gpublasDestroy cublasDestroy +#define gpublasCreate cublasCreate +#define GPUBLAS_SIDE_LEFT CUBLAS_SIDE_LEFT +#define GPUBLAS_OP_N CUBLAS_OP_N + +// lapack +#define gpuDoubleComplex cuDoubleComplex +#define gpusolverDnHandle_t cusolverDnHandle_t +#define gpusolverDnCreate cusolverDnCreate +#define gpusolverDnDestroy cusolverDnDestroy +#define gpusolverEigMode_t cusolverEigMode_t +#define gpusolverFillMode_t cublasFillMode_t // cuSOLVER's dense API takes its fill-mode enum from cuBLAS +#define GPUSOLVER_EIG_MODE_NOVECTOR CUSOLVER_EIG_MODE_NOVECTOR +#define GPUSOLVER_EIG_MODE_VECTOR CUSOLVER_EIG_MODE_VECTOR +#define GPUSOLVER_FILL_MODE_LOWER CUBLAS_FILL_MODE_LOWER // fill modes likewise come from cuBLAS +#define gpusolverSyevjInfo_t syevjInfo_t // CUDA exposes this type under the unprefixed name syevjInfo_t +#define gpusolverDnCreateSyevjInfo cusolverDnCreateSyevjInfo +#define gpusolverDnDestroySyevjInfo cusolverDnDestroySyevjInfo +#define gpusolverDnZheevj_bufferSize cusolverDnZheevj_bufferSize +#define gpusolverDnZheevj cusolverDnZheevj +#define gpusolverDnZheevd_bufferSize cusolverDnZheevd_bufferSize +#define gpusolverDnZheevd cusolverDnZheevd +#define gpusolverDnDsyevj_bufferSize cusolverDnDsyevj_bufferSize +#define gpusolverDnDsyevj cusolverDnDsyevj +#define gpusolverDnZheevjBatched_bufferSize cusolverDnZheevjBatched_bufferSize +#define gpusolverDnZheevjBatched cusolverDnZheevjBatched + +#endif diff --git a/src/utilities/gpu_vector.cuh b/src/utilities/gpu_vector.cuh index 6aef232d8..839bc86a8 100644 --- a/src/utilities/gpu_vector.cuh +++ b/src/utilities/gpu_vector.cuh @@ -16,6 +16,7 @@ #pragma once #include "error.cuh" +#include "gpu_macro.cuh" namespace { @@ -64,7 +65,7 @@ public: ~GPU_Vector() { if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } } @@ -76,14 +77,14 @@ public: memory_ = size_ * sizeof(T); memory_type_ = memory_type; if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } if (memory_type_ == Memory_Type::global) { - CHECK(cudaMalloc((void**)&data_, memory_)); + CHECK(gpuMalloc((void**)&data_, memory_)); allocated_ = true; } else { - CHECK(cudaMallocManaged((void**)&data_, memory_)); + CHECK(gpuMallocManaged((void**)&data_, memory_)); allocated_ = true; } } @@ -95,14 +96,14 @@ public: memory_ = size_ * sizeof(T); memory_type_ = memory_type; if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } if (memory_type == Memory_Type::global) { - CHECK(cudaMalloc((void**)&data_, memory_)); + CHECK(gpuMalloc((void**)&data_, memory_)); allocated_ = true; } else { - CHECK(cudaMallocManaged((void**)&data_, memory_)); +
CHECK(gpuMallocManaged((void**)&data_, memory_)); allocated_ = true; } fill(value); @@ -111,53 +112,53 @@ public: // copy data from host with the default size void copy_from_host(const T* h_data) { - CHECK(cudaMemcpy(data_, h_data, memory_, cudaMemcpyHostToDevice)); + CHECK(gpuMemcpy(data_, h_data, memory_, gpuMemcpyHostToDevice)); } // copy data from host with a given size void copy_from_host(const T* h_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(data_, h_data, memory, cudaMemcpyHostToDevice)); + CHECK(gpuMemcpy(data_, h_data, memory, gpuMemcpyHostToDevice)); } // copy data from device with the default size void copy_from_device(const T* d_data) { - CHECK(cudaMemcpy(data_, d_data, memory_, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(data_, d_data, memory_, gpuMemcpyDeviceToDevice)); } // copy data from device with a given size void copy_from_device(const T* d_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(data_, d_data, memory, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(data_, d_data, memory, gpuMemcpyDeviceToDevice)); } // copy data to host with the default size void copy_to_host(T* h_data) { - CHECK(cudaMemcpy(h_data, data_, memory_, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(h_data, data_, memory_, gpuMemcpyDeviceToHost)); } // copy data to host with a given size void copy_to_host(T* h_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(h_data, data_, memory, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(h_data, data_, memory, gpuMemcpyDeviceToHost)); } // copy data to device with the default size void copy_to_device(T* d_data) { - CHECK(cudaMemcpy(d_data, data_, memory_, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(d_data, data_, memory_, gpuMemcpyDeviceToDevice)); } // copy data to device with a given size void copy_to_device(T* d_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(d_data, data_, memory, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(d_data, data_, memory, gpuMemcpyDeviceToDevice)); } // give "value" to each element @@ -167,7 +168,7 @@ public: const int block_size = 128; const int grid_size = (size_ + block_size - 1) / block_size; gpu_fill<<>>(size_, value, data_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else // managed (or unified) memory { for (int i = 0; i < size_; ++i) diff --git a/src/utilities/main_common.cu b/src/utilities/main_common.cu index 368df0533..4152f0204 100644 --- a/src/utilities/main_common.cu +++ b/src/utilities/main_common.cu @@ -14,6 +14,7 @@ */ #include "error.cuh" +#include "gpu_macro.cuh" #include "main_common.cuh" #include #include @@ -40,12 +41,12 @@ void print_gpu_information(void) print_line_2(); int num_gpus; - CHECK(cudaGetDeviceCount(&num_gpus)); + CHECK(gpuGetDeviceCount(&num_gpus)); printf("number of GPUs = %d\n", num_gpus); for (int device_id = 0; device_id < num_gpus; ++device_id) { - cudaDeviceProp prop; - CHECK(cudaGetDeviceProperties(&prop, device_id)); + gpuDeviceProp prop; + CHECK(gpuGetDeviceProperties(&prop, device_id)); printf("Device id: %d\n", device_id); printf(" Device name: %s\n", prop.name); @@ -55,13 +56,13 @@ void print_gpu_information(void) } for (int i = 0; i < num_gpus; i++) { - cudaSetDevice(i); + gpuSetDevice(i); for (int j = 0; j < num_gpus; j++) { int can_access; if (i != j) { - CHECK(cudaDeviceCanAccessPeer(&can_access, i, j)); + CHECK(gpuDeviceCanAccessPeer(&can_access, i, j)); if (can_access) { - CHECK(cudaDeviceEnablePeerAccess(j, 0)); + 
CHECK(gpuDeviceEnablePeerAccess(j, 0)); printf("GPU-%d can access GPU-%d.\n", i, j); } else { printf("GPU-%d cannot access GPU-%d.\n", i, j); @@ -70,5 +71,5 @@ void print_gpu_information(void) } } - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 } diff --git a/src/utilities/read_file.cu b/src/utilities/read_file.cu index 3acd6b96f..7bfd6cae2 100644 --- a/src/utilities/read_file.cu +++ b/src/utilities/read_file.cu @@ -24,7 +24,9 @@ Some functions for dealing with text files. Written by Mikko Ervasti. int is_valid_int(const char* s, int* result) { - if (s == NULL || *s == '\0') { + if (s == NULL) { + return 0; + } else if (*s == '\0') { return 0; } char* p; @@ -39,7 +41,9 @@ int is_valid_int(const char* s, int* result) int is_valid_real(const char* s, double* result) { - if (s == NULL || *s == '\0') { + if (s == NULL) { + return 0; + } else if (*s == '\0') { return 0; } char* p;
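Taken together, the new src/utilities/gpu_macro.cuh and the reworked CHECK / GPU_CHECK_KERNEL macros let a translation unit written against the gpu* names compile for either backend, selected by whether USE_HIP is defined. A minimal self-contained illustration; this is not a file from the diff, and the build commands are an assumption based on the USE_HIP guard (for example nvcc check_gpus.cu versus hipcc -DUSE_HIP check_gpus.cu):

// check_gpus.cu
#include "utilities/error.cuh"     // CHECK and GPU_CHECK_KERNEL
#include "utilities/gpu_macro.cuh" // gpu* aliases (also pulled in by error.cuh)
#include <cstdio>

int main()
{
  int num_gpus = 0;
  CHECK(gpuGetDeviceCount(&num_gpus)); // cudaGetDeviceCount or hipGetDeviceCount
  printf("number of GPUs = %d\n", num_gpus);
  for (int id = 0; id < num_gpus; ++id) {
    gpuDeviceProp prop;
    CHECK(gpuGetDeviceProperties(&prop, id));
    printf("Device %d: %s\n", id, prop.name);
  }
  return 0;
}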