diff --git a/src/force/dftd3.cu b/src/force/dftd3.cu
index a25033a7c..ac0148fd2 100644
--- a/src/force/dftd3.cu
+++ b/src/force/dftd3.cu
@@ -32,6 +32,7 @@ J. Comput. Chem., 32, 1456 (2011).
 #include "model/box.cuh"
 #include "neighbor.cuh"
 #include "utilities/common.cuh"
+#include "utilities/gpu_macro.cuh"
 #include
 #include
 #include
@@ -947,7 +948,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 3,
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5);
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_coordination_number_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -959,7 +960,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   add_dftd3_force_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -979,7 +980,7 @@ void DFTD3::compute_small_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   add_dftd3_force_extra_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -996,7 +997,7 @@ void DFTD3::compute_small_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }

 void DFTD3::compute_large_box(
@@ -1058,7 +1059,7 @@ void DFTD3::compute_large_box(
     position_per_atom.data() + N,
     position_per_atom.data() + N * 2,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_force_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1084,7 +1085,7 @@ void DFTD3::compute_large_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   find_dftd3_force_extra_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1107,7 +1108,7 @@ void DFTD3::compute_large_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }

 void DFTD3::compute(
diff --git a/src/force/dftd3para.cuh b/src/force/dftd3para.cuh
index 6d00112d7..6407cb30d 100644
--- a/src/force/dftd3para.cuh
+++ b/src/force/dftd3para.cuh
@@ -15,6 +15,8 @@

 #pragma once

+#include "utilities/gpu_macro.cuh"
+
 namespace
 {
 #define Bohr 0.5291772575069165f
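Every translation unit in this patch now pulls in utilities/gpu_macro.cuh, which the patch itself does not show. For orientation, here is a minimal sketch of what such a vendor-neutral macro header provides; the gpu* names are taken from their call sites in this diff, while the guard name, header paths, and exact macro set are assumptions rather than the real file:

// Sketch of a utilities/gpu_macro.cuh-style header (illustrative only):
// one vendor-neutral gpu* spelling, resolved to CUDA or HIP at compile time.
#pragma once

#ifdef USE_HIP
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
typedef hipError_t gpuError_t;
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuGetLastError hipGetLastError
#define gpuSetDevice hipSetDevice
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuStreamCreate hipStreamCreate
#define gpuStreamDestroy hipStreamDestroy
#define gpuMemcpy hipMemcpy
#define gpuMemset hipMemset
#define gpuMemcpyToSymbol hipMemcpyToSymbol
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#else
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
typedef cudaError_t gpuError_t;
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuGetLastError cudaGetLastError
#define gpuSetDevice cudaSetDevice
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuStreamCreate cudaStreamCreate
#define gpuStreamDestroy cudaStreamDestroy
#define gpuMemcpy cudaMemcpy
#define gpuMemset cudaMemset
#define gpuMemcpyToSymbol cudaMemcpyToSymbol
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#endif

With a header of this shape in place, call sites such as CHECK(gpuMemcpy(...)) compile unchanged against either runtime, which is why the rest of this patch is an almost purely mechanical rename.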
diff --git a/src/force/eam.cu b/src/force/eam.cu
index 7a910d654..b5debf445 100644
--- a/src/force/eam.cu
+++ b/src/force/eam.cu
@@ -22,6 +22,7 @@ The EAM potential.
 Currently two analytical versions:
 #include "eam.cuh"
 #include "neighbor.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include

 #define BLOCK_SIZE_FORCE 64

@@ -514,7 +515,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL

     find_force_eam_step2<0><<>>(
       eam2004zhou,
@@ -535,7 +536,7 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }

   if (potential_model == 1) {
@@ -554,7 +555,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL

     find_force_eam_step2<1><<>>(
       eam2004zhou,
@@ -575,6 +576,6 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 }
diff --git a/src/force/fcp.cu b/src/force/fcp.cu
index 4c161183f..50168ed6b 100644
--- a/src/force/fcp.cu
+++ b/src/force/fcp.cu
@@ -19,6 +19,7 @@ The force constant potential (FCP)

 #include "fcp.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include
 #include

@@ -1038,7 +1039,7 @@ void FCP::compute(
     position_per_atom.data() + number_of_atoms * 2,
     fcp_data.r0.data(),
     fcp_data.u.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL

   fcp_data.pfv.fill(0.0f);

@@ -1125,5 +1126,5 @@ void FCP::compute(
     force_per_atom.data() + 2 * number_of_atoms,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }
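The CHECK(...) wrapper that appears around the gpu* runtime calls, and GPU_CHECK_KERNEL, the portable successor of CUDA_CHECK_KERNEL, are also not defined in this patch (CHECK comes from the existing utilities/error.cuh). A plausible minimal form, assuming the gpuError_t/gpuSuccess/gpuGetErrorString/gpuGetLastError aliases sketched above plus <cstdio> and <cstdlib>, is:

// Illustrative definitions only; the real ones live in utilities/error.cuh
// and gpu_macro.cuh. CHECK aborts with file/line info on a failed call.
#define CHECK(call)                                                     \
  do {                                                                  \
    const gpuError_t error_code = (call);                               \
    if (error_code != gpuSuccess) {                                     \
      fprintf(stderr, "GPU Error: %s:%d: %s\n", __FILE__, __LINE__,     \
              gpuGetErrorString(error_code));                           \
      exit(1);                                                          \
    }                                                                   \
  } while (0)

// A kernel launch itself returns no status; the launch error must be
// fetched afterwards, which is why GPU_CHECK_KERNEL follows every launch.
#define GPU_CHECK_KERNEL CHECK(gpuGetLastError())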
#include "ilp_tmd_sw.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -106,7 +107,7 @@ void Force::parse_potential( strcmp(potential_name, "nep4_temperature") == 0 || strcmp(potential_name, "nep4_zbl_temperature") == 0) { int num_gpus; - CHECK(cudaGetDeviceCount(&num_gpus)); + CHECK(gpuGetDeviceCount(&num_gpus)); #ifdef ZHEYONG num_gpus = 3; #endif @@ -226,7 +227,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* s_f[tid] = f; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; @@ -466,7 +467,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (multiple_potentials_mode_.compare("observe") == 0) { // If observing, calculate using main potential only @@ -516,7 +517,7 @@ void Force::compute( force_per_atom.data(), virial_per_atom.data(), (double)potentials.size()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n"); } @@ -552,7 +553,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -561,7 +562,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // always correct the force when using the FCP potential @@ -574,7 +575,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -583,7 +584,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } @@ -647,7 +648,7 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten s_t[tid] = t; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_t[tid] += s_t[tid + offset]; @@ -754,7 +755,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL temperature += delta_T; if (multiple_potentials_mode_.compare("observe") == 0) { @@ -805,7 +806,7 @@ void Force::compute( force_per_atom.data(), virial_per_atom.data(), (double)potentials.size()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n"); } @@ -841,7 +842,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -850,7 +851,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (compute_hnemdec_ == 0) { // the tensor: // xx xy xz 0 3 4 @@ -876,10 +877,10 @@ void Force::compute( virial_per_atom.data() + 8 * number_of_atoms, virial_per_atom.data() + 2 
* number_of_atoms, tensor_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sum_tensor<<<9, 1024>>>(number_of_atoms, tensor_per_atom.data(), tensor_tot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -901,7 +902,7 @@ void Force::compute( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (compute_hnemdec_ != -1) { gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( @@ -926,7 +927,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -935,7 +936,7 @@ void Force::compute( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, ftot.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/force/force_constant.cu b/src/force/force_constant.cu index 164306d24..90608601f 100644 --- a/src/force/force_constant.cu +++ b/src/force/force_constant.cu @@ -23,6 +23,7 @@ Use finite difference to calculate the seconod order force constants: #include "model/box.cuh" #include "model/group.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include static __global__ void gpu_shift_atom(const double dx, double* x) { x[0] += dx; } @@ -34,13 +35,13 @@ static void shift_atom( if (beta == 0) { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (beta == 1) { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms * 2 + n2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -67,10 +68,10 @@ static void get_f( box, position_per_atom, type, group, potential_per_atom, force_per_atom, virial_per_atom); size_t M = sizeof(double); - CHECK(cudaMemcpy(f + 0, force_per_atom.data() + n1, M, cudaMemcpyDeviceToHost)); - CHECK(cudaMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(f + 0, force_per_atom.data() + n1, M, gpuMemcpyDeviceToHost)); + CHECK(gpuMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, gpuMemcpyDeviceToHost)); CHECK( - cudaMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, cudaMemcpyDeviceToHost)); + gpuMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, gpuMemcpyDeviceToHost)); shift_atom(-dx, n2, beta, position_per_atom); } diff --git a/src/force/ilp_tmd_sw.cu b/src/force/ilp_tmd_sw.cu index 931f47348..079029e0d 100644 --- a/src/force/ilp_tmd_sw.cu +++ b/src/force/ilp_tmd_sw.cu @@ -22,6 +22,7 @@ TODO: #include "neighbor.cuh" #include "utilities/error.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 128 @@ -126,8 +127,7 @@ ILP_TMD_SW::ILP_TMD_SW(FILE* fid_ilp, FILE* fid_sw, int num_types, int num_atoms // init constant cutoff coeff float h_tap_coeff[8] = \ {1.0f, 0.0f, 0.0f, 0.0f, -35.0f, 84.0f, -70.0f, 20.0f}; - cudaMemcpyToSymbol(Tap_coeff_tmd, h_tap_coeff, 8 * sizeof(float)); - CUDA_CHECK_KERNEL + CHECK(gpuMemcpyToSymbol(Tap_coeff_tmd, h_tap_coeff, 8 * sizeof(float))); // set ilp_flag to 1 ilp_flag = 1; @@ -1437,7 +1437,7 @@ void ILP_TMD_SW::compute_ilp( number_of_atoms, N1, N2, box, big_ilp_NN, 
big_ilp_NL, \ type.data(), ilp_para, x, y, z, ilp_NN, \ ilp_NL, group[1].label.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // initialize force of ilp neighbor temporary vector ilp_data.f12x_ilp_neigh.fill(0); @@ -1485,7 +1485,7 @@ void ILP_TMD_SW::compute_ilp( g_f12x_ilp_neigh, g_f12y_ilp_neigh, g_f12z_ilp_neigh); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL reduce_force_many_body<<>>( number_of_atoms, @@ -1510,7 +1510,7 @@ void ILP_TMD_SW::compute_ilp( g_f12x_ilp_neigh, g_f12y_ilp_neigh, g_f12z_ilp_neigh); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // step 1: calculate the partial forces gpu_find_force_sw3_partial<<>>( @@ -1518,7 +1518,7 @@ void ILP_TMD_SW::compute_ilp( type.data(), position_per_atom.data(), position_per_atom.data() + number_of_atoms, position_per_atom.data() + number_of_atoms * 2, potential_per_atom.data(), sw2_data.f12x.data(), sw2_data.f12y.data(), sw2_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // step 2: calculate force and related quantities find_properties_many_body( diff --git a/src/force/lj.cu b/src/force/lj.cu index 4fafeeb61..88c4b553f 100644 --- a/src/force/lj.cu +++ b/src/force/lj.cu @@ -20,6 +20,7 @@ The class dealing with the Lennard-Jones (LJ) pairwise potentials. #include "lj.cuh" #include "neighbor.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" // best block size here: 128 #define BLOCK_SIZE_FORCE 128 @@ -233,5 +234,5 @@ void LJ::compute( force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/force/neighbor.cu b/src/force/neighbor.cu index 8b234415e..1a9580a98 100644 --- a/src/force/neighbor.cu +++ b/src/force/neighbor.cu @@ -19,6 +19,7 @@ neighbor list. #include "neighbor.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -183,18 +184,18 @@ void find_cell_list( cell_count_sum.resize(N_cells); } - CHECK(cudaMemset(cell_count.data(), 0, sizeof(int) * N_cells)); - CHECK(cudaMemset(cell_count_sum.data(), 0, sizeof(int) * N_cells)); - CHECK(cudaMemset(cell_contents.data(), 0, sizeof(int) * N)); + CHECK(gpuMemset(cell_count.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_count_sum.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_contents.data(), 0, sizeof(int) * N)); find_cell_counts<<>>( box, N, cell_count.data(), x, y, z, num_bins[0], num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( thrust::device, cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); - CHECK(cudaMemset(cell_count.data(), 0, sizeof(int) * N_cells)); + CHECK(gpuMemset(cell_count.data(), 0, sizeof(int) * N_cells)); find_cell_contents<<>>( box, @@ -209,7 +210,7 @@ void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static void __global__ set_to_zero(int size, int* data) @@ -221,7 +222,7 @@ static void __global__ set_to_zero(int size, int* data) } void find_cell_list( - cudaStream_t& stream, + gpuStream_t& stream, const double rc, const int* num_bins, Box& box, @@ -248,29 +249,33 @@ void find_cell_list( set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_count_sum.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count_sum.size(), cell_count_sum.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_contents.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_contents.size(), 
cell_contents.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_counts<<>>( box, N, cell_count.data(), x, y, z, num_bins[0], num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( +#ifdef USE_HIP + thrust::hip::par.on(stream), +#else thrust::cuda::par.on(stream), +#endif cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_contents<<>>( box, @@ -285,7 +290,7 @@ void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void find_neighbor( @@ -335,11 +340,11 @@ void find_neighbor( num_bins[2], rc_inv_cell_list, rc * rc); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // For ILP, the neighbor could not contain atoms in the same layer @@ -488,11 +493,11 @@ void find_neighbor_ilp( rc_inv_cell_list, rc * rc, big_ilp_cutoff_square); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list_ilp<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_neighbor_ON1_SW( @@ -624,9 +629,9 @@ void find_neighbor_SW( num_bins[2], rc_inv_cell_list, rc * rc); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const int MN = NL.size() / NN.size(); gpu_sort_neighbor_list<<>>(N, NN.data(), NL.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } \ No newline at end of file diff --git a/src/force/nep3.cu b/src/force/nep3.cu index 1f7729ccc..8fcec4206 100644 --- a/src/force/nep3.cu +++ b/src/force/nep3.cu @@ -25,6 +25,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
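One spot the gpu* macros cannot paper over is Thrust: the stream-bound execution policy lives in the thrust::cuda namespace under CUDA but in thrust::hip under rocThrust, hence the #ifdef around thrust::exclusive_scan above. The pattern in isolation, assuming the gpuStream_t alias sketched earlier and the usual Thrust/rocThrust header paths:

#include <thrust/scan.h>
#ifdef USE_HIP
#include <thrust/system/hip/execution_policy.h>
#else
#include <thrust/system/cuda/execution_policy.h>
#endif

// Exclusive prefix sum over per-cell counts, enqueued on 'stream' rather
// than the default stream so it can overlap work on other devices.
static void exclusive_scan_on_stream(const int* in, int* out, int n, gpuStream_t stream)
{
  thrust::exclusive_scan(
#ifdef USE_HIP
    thrust::hip::par.on(stream), // rocThrust execution policy
#else
    thrust::cuda::par.on(stream), // CUDA Thrust execution policy
#endif
    in, in + n, out);
}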
#include "nep3_small_box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -1176,7 +1177,7 @@ void NEP3::compute_large_box( nep_data.NL_radial.data(), nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL static int num_calls = 0; if (num_calls++ % 1000 == 0) { @@ -1202,11 +1203,11 @@ void NEP3::compute_large_box( gpu_sort_neighbor_list<<>>( N, nep_data.NN_radial.data(), nep_data.NL_radial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sort_neighbor_list<<>>( N, nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_polarizability = paramb.model_type == 2; find_descriptor<<>>( @@ -1233,7 +1234,7 @@ void NEP3::compute_large_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<>>( @@ -1258,7 +1259,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<>>( paramb, @@ -1282,7 +1283,7 @@ void NEP3::compute_large_box( nep_data.f12x.data(), nep_data.f12y.data(), nep_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_properties_many_body( box, @@ -1295,7 +1296,7 @@ void NEP3::compute_large_box( position_per_atom, force_per_atom, virial_per_atom); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ -1316,7 +1317,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1362,7 +1363,7 @@ void NEP3::compute_small_box( r12.data() + size_x12 * 3, r12.data() + size_x12 * 4, r12.data() + size_x12 * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const bool is_polarizability = paramb.model_type == 2; find_descriptor_small_box<<>>( @@ -1391,7 +1392,7 @@ void NEP3::compute_small_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial_small_box<<>>( @@ -1415,7 +1416,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular_small_box<<>>( paramb, @@ -1440,7 +1441,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL_small_box<<>>( @@ -1460,7 +1461,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1758,7 +1759,7 @@ void NEP3::compute_large_box( nep_data.NL_radial.data(), nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL static int num_calls = 0; if (num_calls++ % 1000 == 0) { @@ -1784,11 +1785,11 @@ void NEP3::compute_large_box( gpu_sort_neighbor_list<<>>( N, nep_data.NN_radial.data(), nep_data.NL_radial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sort_neighbor_list<<>>( N, nep_data.NN_angular.data(), nep_data.NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor<<>>( temperature, @@ -1814,7 +1815,7 @@ void NEP3::compute_large_box( nep_data.Fp.data(), 
virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<>>( @@ -1839,7 +1840,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<>>( paramb, @@ -1863,7 +1864,7 @@ void NEP3::compute_large_box( nep_data.f12x.data(), nep_data.f12y.data(), nep_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_properties_many_body( box, @@ -1876,7 +1877,7 @@ void NEP3::compute_large_box( position_per_atom, force_per_atom, virial_per_atom); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ -1897,7 +1898,7 @@ void NEP3::compute_large_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -1944,7 +1945,7 @@ void NEP3::compute_small_box( r12.data() + size_x12 * 3, r12.data() + size_x12 * 4, r12.data() + size_x12 * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor_small_box<<>>( temperature, @@ -1972,7 +1973,7 @@ void NEP3::compute_small_box( nep_data.Fp.data(), virial_per_atom.data(), nep_data.sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial_small_box<<>>( @@ -1996,7 +1997,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular_small_box<<>>( paramb, @@ -2021,7 +2022,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N, force_per_atom.data() + N * 2, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL_small_box<<>>( @@ -2041,7 +2042,7 @@ void NEP3::compute_small_box( force_per_atom.data() + N * 2, virial_per_atom.data(), potential_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/force/nep3_multigpu.cu b/src/force/nep3_multigpu.cu index 5cf5f4425..9f9342379 100644 --- a/src/force/nep3_multigpu.cu +++ b/src/force/nep3_multigpu.cu @@ -26,6 +26,7 @@ when there is NVlink, but is also not very bad when there is only PCI-E. 
#include "nep3_multigpu.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -347,7 +348,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( annmb[gpu].dim = annmb[0].dim; annmb[gpu].num_neurons1 = annmb[0].num_neurons1; #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif nep_data[gpu].parameters.resize(annmb[gpu].num_para); @@ -359,7 +360,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( nep_data[gpu].cell_count_sum.resize(num_atoms); nep_data[gpu].cell_contents.resize(num_atoms); - CHECK(cudaStreamCreate(&nep_data[gpu].stream)); + CHECK(gpuStreamCreate(&nep_data[gpu].stream)); #ifdef USE_TABLE nep_data[gpu].gn_radial.resize(table_length * paramb.num_types_sq * (paramb.n_max_radial + 1)); @@ -400,7 +401,7 @@ NEP3_MULTIGPU::NEP3_MULTIGPU( #endif } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); nep_temp_data.cell_count_sum_cpu.resize(num_atoms); nep_temp_data.cell_count.resize(num_atoms); @@ -417,7 +418,7 @@ void NEP3_MULTIGPU::allocate_memory() for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif nep_data[gpu].f12x.resize(nep_temp_data.num_atoms_per_gpu * paramb.MN_angular); @@ -438,7 +439,7 @@ void NEP3_MULTIGPU::allocate_memory() nep_data[gpu].virial.resize(nep_temp_data.num_atoms_per_gpu * 9); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); nep_temp_data.type.resize(nep_temp_data.num_atoms_per_gpu); nep_temp_data.position.resize(nep_temp_data.num_atoms_per_gpu * 3); @@ -450,7 +451,7 @@ void NEP3_MULTIGPU::allocate_memory() NEP3_MULTIGPU::~NEP3_MULTIGPU(void) { for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaStreamDestroy(nep_data[gpu].stream)); + CHECK(gpuStreamDestroy(nep_data[gpu].stream)); } } @@ -625,7 +626,7 @@ static void __global__ set_to_zero(int size, int* data) } static void find_cell_list( - cudaStream_t& stream, + gpuStream_t& stream, const int partition_direction, const double rc, const int* num_bins, @@ -653,15 +654,15 @@ static void find_cell_list( set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_count_sum.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count_sum.size(), cell_count_sum.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL set_to_zero<<<(cell_contents.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_contents.size(), cell_contents.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_counts<<>>( partition_direction, @@ -675,17 +676,21 @@ static void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thrust::exclusive_scan( +#ifdef USE_HIP + thrust::hip::par.on(stream), +#else thrust::cuda::par.on(stream), +#endif cell_count.data(), cell_count.data() + N_cells, cell_count_sum.data()); set_to_zero<<<(cell_count.size() - 1) / 64 + 1, 64, 0, stream>>>( cell_count.size(), cell_count.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_cell_contents<<>>( partition_direction, @@ -701,7 +706,7 @@ static void find_cell_list( num_bins[1], num_bins[2], rc_inv); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void find_neighbor_list_large_box( @@ -1719,19 +1724,19 @@ void NEP3_MULTIGPU::compute( position.data(), nep_temp_data.type.data(), nep_temp_data.position.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].type.data(), nep_temp_data.type.data(), 
sizeof(int) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].position.data() + nep_temp_data.num_atoms_per_gpu * d, nep_temp_data.position.data() + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } } @@ -1739,7 +1744,7 @@ void NEP3_MULTIGPU::compute( for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif find_cell_list( @@ -1779,7 +1784,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].NL_radial.data(), nep_data[gpu].NN_angular.data(), nep_data[gpu].NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_polarizability = paramb.model_type == 2; find_descriptor<<< @@ -1810,7 +1815,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].Fp.data(), nep_data[gpu].virial.data(), nep_data[gpu].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<< @@ -1839,7 +1844,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -1867,7 +1872,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].f12x.data(), nep_data[gpu].f12y.data(), nep_data[gpu].f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_force_many_body<<< (nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, @@ -1890,7 +1895,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<< @@ -1915,34 +1920,34 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data(), nep_data[gpu].potential.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); // serial for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.potential.data() + nep_data[gpu].N1, nep_data[gpu].potential.data() + nep_data[gpu].N1, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } for (int d = 0; d < 9; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } collect_properties<<<(nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, 64>>>( @@ -1958,7 +1963,7 @@ void NEP3_MULTIGPU::compute( force.data(), potential.data(), virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -2269,19 +2274,19 @@ void NEP3_MULTIGPU::compute( position.data(), 
nep_temp_data.type.data(), nep_temp_data.position.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].type.data(), nep_temp_data.type.data(), sizeof(int) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_data[gpu].position.data() + nep_temp_data.num_atoms_per_gpu * d, nep_temp_data.position.data() + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * nep_data[gpu].N3, - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } } @@ -2289,7 +2294,7 @@ void NEP3_MULTIGPU::compute( for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { #ifndef ZHEYONG - CHECK(cudaSetDevice(gpu)); + CHECK(gpuSetDevice(gpu)); #endif find_cell_list( @@ -2329,7 +2334,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].NL_radial.data(), nep_data[gpu].NN_angular.data(), nep_data[gpu].NL_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptor<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -2359,7 +2364,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].Fp.data(), nep_data[gpu].virial.data(), nep_data[gpu].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL bool is_dipole = paramb.model_type == 1; find_force_radial<<< @@ -2388,7 +2393,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_partial_force_angular<<< (nep_data[gpu].N5 - nep_data[gpu].N4 - 1) / 64 + 1, @@ -2416,7 +2421,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].f12x.data(), nep_data[gpu].f12y.data(), nep_data[gpu].f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_force_many_body<<< (nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, @@ -2439,7 +2444,7 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu, nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<< @@ -2464,39 +2469,39 @@ void NEP3_MULTIGPU::compute( nep_data[gpu].force.data() + nep_temp_data.num_atoms_per_gpu * 2, nep_data[gpu].virial.data(), nep_data[gpu].potential.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaSetDevice(gpu)); - CHECK(cudaDeviceSynchronize()); + CHECK(gpuSetDevice(gpu)); + CHECK(gpuDeviceSynchronize()); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); // serial for (int gpu = 0; gpu < paramb.num_gpus; ++gpu) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.potential.data() + nep_data[gpu].N1, nep_data[gpu].potential.data() + nep_data[gpu].N1, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); for (int d = 0; d < 3; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].force.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } for (int d = 0; d < 9; ++d) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( nep_temp_data.virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, nep_data[gpu].virial.data() + nep_data[gpu].N1 + nep_temp_data.num_atoms_per_gpu * d, sizeof(double) * (nep_data[gpu].N2 - 
nep_data[gpu].N1), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } collect_properties<<<(nep_data[gpu].N2 - nep_data[gpu].N1 - 1) / 64 + 1, 64>>>( @@ -2512,6 +2517,6 @@ void NEP3_MULTIGPU::compute( force.data(), potential.data(), virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/force/nep3_multigpu.cuh b/src/force/nep3_multigpu.cuh index 79280b70e..ecb70d954 100644 --- a/src/force/nep3_multigpu.cuh +++ b/src/force/nep3_multigpu.cuh @@ -57,7 +57,7 @@ struct NEP3_MULTIGPU_Data { int N1, N2, N3, N4, N5; // for local system int M0, M1, M2; // for global system - cudaStream_t stream; + gpuStream_t stream; }; struct NEP3_TEMP_Data { diff --git a/src/force/nep3_small_box.cuh b/src/force/nep3_small_box.cuh index f6d012ce4..cdbf5a7b8 100644 --- a/src/force/nep3_small_box.cuh +++ b/src/force/nep3_small_box.cuh @@ -17,6 +17,7 @@ #include "nep3.cuh" #include "utilities/common.cuh" #include "utilities/nep_utilities.cuh" +#include "utilities/gpu_macro.cuh" #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) static __device__ __inline__ double atomicAdd(double* address, double val) diff --git a/src/force/potential.cu b/src/force/potential.cu index c7eb2dc8e..b3d894141 100644 --- a/src/force/potential.cu +++ b/src/force/potential.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the potential classes. #include "potential.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 #include #include @@ -162,7 +163,7 @@ void Potential::find_properties_many_body( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_force_many_body( @@ -312,5 +313,5 @@ void Potential::find_properties_many_body( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms, virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/force/tersoff1988.cu b/src/force/tersoff1988.cu index 33405a166..823f18f15 100644 --- a/src/force/tersoff1988.cu +++ b/src/force/tersoff1988.cu @@ -23,6 +23,7 @@ The version of the Tersoff potential as described in #include "tersoff1988.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #define LDG(a, n) __ldg(a + n) @@ -538,7 +539,7 @@ void Tersoff1988::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_data.b.data(), tersoff_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_tersoff_step2<<>>( @@ -560,7 +561,7 @@ void Tersoff1988::compute( tersoff_data.f12x.data(), tersoff_data.f12y.data(), tersoff_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/force/tersoff1989.cu b/src/force/tersoff1989.cu index 0b8035c77..858f08684 100644 --- a/src/force/tersoff1989.cu +++ b/src/force/tersoff1989.cu @@ -23,6 +23,7 @@ The double-element version of the Tersoff potential as described in #include "tersoff1989.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 // 128 is also good @@ -556,7 +557,7 @@ void Tersoff1989::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_data.b.data(), tersoff_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_tersoff_step2<<>>( @@ -579,7 +580,7 @@ void 
Tersoff1989::compute( tersoff_data.f12x.data(), tersoff_data.f12y.data(), tersoff_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/force/tersoff_mini.cu b/src/force/tersoff_mini.cu index bad984625..65c30848c 100644 --- a/src/force/tersoff_mini.cu +++ b/src/force/tersoff_mini.cu @@ -26,6 +26,7 @@ J. Phys.: Condens. Matter 32, 135901 (2020). #include "tersoff_mini.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #define BLOCK_SIZE_FORCE 64 @@ -371,7 +372,7 @@ void Tersoff_mini::compute( position_per_atom.data() + number_of_atoms * 2, tersoff_mini_data.b.data(), tersoff_mini_data.bp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // pre-compute the partial forces find_force_step2<<>>( @@ -393,7 +394,7 @@ void Tersoff_mini::compute( tersoff_mini_data.f12x.data(), tersoff_mini_data.f12y.data(), tersoff_mini_data.f12z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // the final step: calculate force and related quantities find_properties_many_body( diff --git a/src/integrate/ensemble.cu b/src/integrate/ensemble.cu index cb77bbe3a..51c3d3f2d 100644 --- a/src/integrate/ensemble.cu +++ b/src/integrate/ensemble.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the ensemble classes. #include "ensemble.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define DIM 3 Ensemble::Ensemble(void) @@ -314,7 +315,7 @@ void Ensemble::velocity_verlet( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Find some thermodynamic properties: @@ -757,7 +758,7 @@ void Ensemble::find_thermo( thermo.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Scale the velocity of every particle in the systems by a factor @@ -782,7 +783,7 @@ void Ensemble::scale_velocity_global(const double factor, GPU_Vector& ve velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_find_vc_and_ke( @@ -837,7 +838,7 @@ static __global__ void gpu_find_vc_and_ke( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_mc[tid] += s_mc[tid + offset]; @@ -887,7 +888,7 @@ void Ensemble::find_vc_and_ke( vcy, vcz, ke); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_scale_velocity( @@ -963,5 +964,5 @@ void Ensemble::scale_velocity_local( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/integrate/ensemble_bao.cu b/src/integrate/ensemble_bao.cu index c30ee36ea..b1d0032cc 100644 --- a/src/integrate/ensemble_bao.cu +++ b/src/integrate/ensemble_bao.cu @@ -23,6 +23,7 @@ The Langevin thermostat with the BAOAB splitting: #include "ensemble_bao.cuh" #include "langevin_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include Ensemble_BAO::Ensemble_BAO(int t, int N, double T, double Tc) @@ -35,7 +36,7 @@ Ensemble_BAO::Ensemble_BAO(int t, int N, double T, double Tc) curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_BAO::Ensemble_BAO( @@ -69,9 +70,9 @@ Ensemble_BAO::Ensemble_BAO( int grid_size_sink 
= (N_sink - 1) / 128 + 1; initialize_curand_states<<>>( curand_states_source.data(), N_source, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_curand_states<<>>(curand_states_sink.data(), N_sink, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL energy_transferred[0] = 0.0; energy_transferred[1] = 0.0; } @@ -96,7 +97,7 @@ void Ensemble_BAO::integrate_nvt_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum<<<4, 1024>>>( number_of_atoms, @@ -104,14 +105,14 @@ void Ensemble_BAO::integrate_nvt_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // wrapper of the local Langevin thermostatting kernels @@ -136,7 +137,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] += ek2[source] * 0.5; @@ -153,7 +154,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_langevin<<<(N_sink - 1) / 128 + 1, 128>>>( curand_states_sink.data(), @@ -166,7 +167,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_ke<<>>( group[0].size.data(), @@ -177,7 +178,7 @@ void Ensemble_BAO::integrate_heat_lan( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] -= ek2[source] * 0.5; @@ -271,7 +272,7 @@ void Ensemble_BAO::operator_A( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_operator_A<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, @@ -288,7 +289,7 @@ void Ensemble_BAO::operator_A( force_per_atom.data(), force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -411,7 +412,7 @@ void Ensemble_BAO::operator_B( force_per_atom.data() + number_of_atoms, force_per_atom.data() + 2 * number_of_atoms); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_BAO::compute1( diff --git a/src/integrate/ensemble_bao.cuh b/src/integrate/ensemble_bao.cuh index 47a28561c..e462f5dbe 100644 --- a/src/integrate/ensemble_bao.cuh +++ b/src/integrate/ensemble_bao.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Ensemble_BAO : public Ensemble { @@ -41,9 +46,9 @@ public: protected: int N_source, N_sink, offset_source, offset_sink; double c1, c2, c2_source, c2_sink; - GPU_Vector curand_states; - GPU_Vector curand_states_source; - GPU_Vector curand_states_sink; + GPU_Vector curand_states; + GPU_Vector curand_states_source; + 
GPU_Vector curand_states_sink; void operator_A( const double time_step, diff --git a/src/integrate/ensemble_bdp.cu b/src/integrate/ensemble_bdp.cu index 78e27b675..148f89477 100644 --- a/src/integrate/ensemble_bdp.cu +++ b/src/integrate/ensemble_bdp.cu @@ -21,6 +21,7 @@ The Bussi-Donadio-Parrinello thermostat: #include "ensemble_bdp.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #define DIM 3 diff --git a/src/integrate/ensemble_ber.cu b/src/integrate/ensemble_ber.cu index 70f9ff56c..6523c412e 100644 --- a/src/integrate/ensemble_ber.cu +++ b/src/integrate/ensemble_ber.cu @@ -20,6 +20,7 @@ The Berendsen thermostat and barostat: #include "ensemble_ber.cuh" #include "npt_utilities.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_BER::Ensemble_BER(int t, int mg, double* mv, double T, double Tc) { @@ -95,7 +96,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (deform_x) { scale_factor[0] = box.cpu_h[0]; @@ -141,7 +142,7 @@ static void cpu_pressure_isotropic( Box& box, double* p0, double* p_coupling, double* thermo, double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); scale_factor = 1.0 - p_coupling[0] * (p0[0] - (p[0] + p[1] + p[2]) * 0.3333333333333333); box.cpu_h[0] *= scale_factor; box.cpu_h[1] *= scale_factor; @@ -156,7 +157,7 @@ cpu_pressure_triclinic(Box& box, double* p0, double* p_coupling, double* thermo, { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -233,7 +234,7 @@ void Ensemble_BER::compute2( atom.velocity_per_atom.data(), atom.velocity_per_atom.data() + number_of_atoms, atom.velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } if (type == 11) { @@ -266,7 +267,7 @@ void Ensemble_BER::compute2( atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { double mu[9]; cpu_pressure_triclinic(box, target_pressure, pressure_coupling, thermo.data(), mu); diff --git a/src/integrate/ensemble_lan.cu b/src/integrate/ensemble_lan.cu index 327b69277..6dae340b0 100644 --- a/src/integrate/ensemble_lan.cu +++ b/src/integrate/ensemble_lan.cu @@ -21,6 +21,7 @@ The Bussi-Parrinello integrator of the Langevin thermostat: #include "ensemble_lan.cuh" #include "langevin_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include Ensemble_LAN::Ensemble_LAN() {} @@ -35,7 +36,7 @@ Ensemble_LAN::Ensemble_LAN(int t, int N, double T, double Tc) curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_LAN::Ensemble_LAN( @@ -69,9 +70,9 @@ Ensemble_LAN::Ensemble_LAN( int grid_size_sink = (N_sink - 1) / 128 + 1; 
initialize_curand_states<<>>( curand_states_source.data(), N_source, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_curand_states<<>>(curand_states_sink.data(), N_sink, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL energy_transferred[0] = 0.0; energy_transferred[1] = 0.0; } @@ -98,7 +99,7 @@ void Ensemble_LAN::integrate_nvt_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum<<<4, 1024>>>( number_of_atoms, @@ -106,14 +107,14 @@ void Ensemble_LAN::integrate_nvt_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // wrapper of the local Langevin thermostatting kernels @@ -138,7 +139,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] += ek2[source] * 0.5; @@ -155,7 +156,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_langevin<<<(N_sink - 1) / 128 + 1, 128>>>( curand_states_sink.data(), @@ -168,7 +169,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data(), velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_ke<<>>( group[0].size.data(), @@ -179,7 +180,7 @@ void Ensemble_LAN::integrate_heat_lan_half( velocity_per_atom.data() + number_of_atoms, velocity_per_atom.data() + 2 * number_of_atoms, ke.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ke.copy_to_host(ek2.data()); energy_transferred[0] -= ek2[source] * 0.5; diff --git a/src/integrate/ensemble_lan.cuh b/src/integrate/ensemble_lan.cuh index 5968d8283..f769a6390 100644 --- a/src/integrate/ensemble_lan.cuh +++ b/src/integrate/ensemble_lan.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Ensemble_LAN : public Ensemble { @@ -42,9 +47,9 @@ public: protected: int N_source, N_sink, offset_source, offset_sink; double c1, c2, c2_source, c2_sink; - GPU_Vector curand_states; - GPU_Vector curand_states_source; - GPU_Vector curand_states_sink; + GPU_Vector curand_states; + GPU_Vector curand_states_source; + GPU_Vector curand_states_sink; void integrate_nvt_lan_half(const GPU_Vector& mass, GPU_Vector& velocity_per_atom); diff --git a/src/integrate/ensemble_msst.cu b/src/integrate/ensemble_msst.cu index 068f48825..040cb84d8 100644 --- a/src/integrate/ensemble_msst.cu +++ b/src/integrate/ensemble_msst.cu @@ -18,6 +18,7 @@ The NVE ensemble integrator. 
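Ensemble_BAO and Ensemble_LAN above keep one RNG state per atom in GPU_Vector<gpurandState>. The gpurandState type appears in this diff; a sketch of how the RNG portion of the macro scheme can be laid out is below, where the device-function aliases are assumptions based on what Langevin thermostats typically call rather than anything shown in the patch:

// Hypothetical RNG section of utilities/gpu_macro.cuh (illustrative only).
#ifdef USE_HIP
#include <hiprand/hiprand_kernel.h>
typedef hiprandState_t gpurandState;
#define gpurand_init hiprand_init
#define gpurand_normal_double hiprand_normal_double
#else
#include <curand_kernel.h>
typedef curandState gpurandState;
#define gpurand_init curand_init
#define gpurand_normal_double curand_normal_double
#endif

Only the state type and the handful of device functions need aliases; member and kernel names such as curand_states and initialize_curand_states can keep their historical spelling, which is exactly what this diff does.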
------------------------------------------------------------------------------*/ #include "ensemble_msst.cuh" +#include "utilities/gpu_macro.cuh" #include namespace @@ -200,7 +201,7 @@ void Ensemble_MSST::remap(double dilation) dilation, atom->position_per_atom.data() + shock_direction * N, atom->velocity_per_atom.data() + shock_direction * N); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_MSST::get_conserved() @@ -271,22 +272,22 @@ void Ensemble_MSST::compute1( get_omega(); get_vsum(); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( gpu_v_backup.data(), atom.velocity_per_atom.data(), sizeof(double) * gpu_v_backup.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); // propagate velocity sum 1/2 step by temporarily propagating the velocities msst_v(); get_vsum(); // reset the velocities - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( atom.velocity_per_atom.data(), gpu_v_backup.data(), sizeof(double) * gpu_v_backup.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); // propagate velocities 1/2 step using the new velocity sum msst_v(); diff --git a/src/integrate/ensemble_mttk.cu b/src/integrate/ensemble_mttk.cu index 04a3df07a..b59f6c7f2 100644 --- a/src/integrate/ensemble_mttk.cu +++ b/src/integrate/ensemble_mttk.cu @@ -21,6 +21,7 @@ P and T are both set -> NPT ensemable ------------------------------------------------------------------------------*/ #include "ensemble_mttk.cuh" +#include "utilities/gpu_macro.cuh" #include namespace diff --git a/src/integrate/ensemble_nhc.cu b/src/integrate/ensemble_nhc.cu index 7bd643304..8fc17e1f5 100644 --- a/src/integrate/ensemble_nhc.cu +++ b/src/integrate/ensemble_nhc.cu @@ -21,6 +21,7 @@ Oxford University Press, 2010. #include "ensemble_nhc.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #define DIM 3 Ensemble_NHC::Ensemble_NHC(int t, int mg, double* mv, int N, double T, double Tc, double dt) diff --git a/src/integrate/ensemble_nphug.cu b/src/integrate/ensemble_nphug.cu index 3d46399ac..e9c92dbf7 100644 --- a/src/integrate/ensemble_nphug.cu +++ b/src/integrate/ensemble_nphug.cu @@ -14,6 +14,7 @@ */ #include "ensemble_nphug.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_npt_scr.cu b/src/integrate/ensemble_npt_scr.cu index edcaaadd7..c5b104b2b 100644 --- a/src/integrate/ensemble_npt_scr.cu +++ b/src/integrate/ensemble_npt_scr.cu @@ -24,6 +24,7 @@ J. Chem. Phys. 153, 114107 (2020). 
#include "npt_utilities.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include void Ensemble_NPT_SCR::initialize_rng() @@ -85,7 +86,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (deform_x) { scale_factor[0] = box.cpu_h[0]; @@ -146,7 +147,7 @@ static void cpu_pressure_isotropic( double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); const double pressure_instant = (p[0] + p[1] + p[2]) * 0.3333333333333333; const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (target_pressure[0] - pressure_instant); @@ -174,7 +175,7 @@ static void cpu_pressure_triclinic( { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -295,7 +296,7 @@ void Ensemble_NPT_SCR::compute2( atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { double mu[9]; cpu_pressure_triclinic( diff --git a/src/integrate/ensemble_nve.cu b/src/integrate/ensemble_nve.cu index 05ad72c9a..9b0b51e6e 100644 --- a/src/integrate/ensemble_nve.cu +++ b/src/integrate/ensemble_nve.cu @@ -18,6 +18,7 @@ The NVE ensemble integrator. 
------------------------------------------------------------------------------*/ #include "ensemble_nve.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_NVE::Ensemble_NVE(int t) { type = t; } diff --git a/src/integrate/ensemble_pimd.cu b/src/integrate/ensemble_pimd.cu index 7e67a18bc..604357f65 100644 --- a/src/integrate/ensemble_pimd.cu +++ b/src/integrate/ensemble_pimd.cu @@ -27,6 +27,7 @@ References for implementation: #include "langevin_utilities.cuh" #include "svr_utilities.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -173,7 +174,7 @@ void Ensemble_PIMD::initialize(Atom& atom) curand_states.resize(number_of_atoms); int grid_size = (number_of_atoms - 1) / 128 + 1; initialize_curand_states<<>>(curand_states.data(), number_of_atoms, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } Ensemble_PIMD::~Ensemble_PIMD(void) @@ -288,7 +289,7 @@ static __global__ void gpu_langevin( const bool thermostat_centroid, const int number_of_atoms, const int number_of_beads, - curandState* g_state, + gpurandState* g_state, const double temperature, const double temperature_coupling, const double omega_n, @@ -315,7 +316,7 @@ static __global__ void gpu_langevin( } } - curandState state = g_state[n]; + gpurandState state = g_state[n]; for (int k = 0; k < number_of_beads; ++k) { if (k == 0 && !thermostat_centroid) { continue; @@ -592,7 +593,7 @@ static void cpu_pressure_orthogonal( double* scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); if (box.pbc_x == 1) { const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (p0[0] - p[0]); @@ -638,7 +639,7 @@ static void cpu_pressure_isotropic( double& scale_factor) { double p[3]; - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 3, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 3, gpuMemcpyDeviceToHost)); const double pressure_instant = (p[0] + p[1] + p[2]) * 0.3333333333333333; const double scale_factor_Berendsen = 1.0 - p_coupling[0] * (target_pressure[0] - pressure_instant); @@ -666,7 +667,7 @@ static void cpu_pressure_triclinic( { // p_coupling and p0 are in Voigt notation: xx, yy, zz, yz, xz, xy double p[6]; // but thermo is this order: xx, yy, zz, xy, xz, yz - CHECK(cudaMemcpy(p, thermo + 2, sizeof(double) * 6, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(p, thermo + 2, sizeof(double) * 6, gpuMemcpyDeviceToHost)); mu[0] = 1.0 - p_coupling[0] * (p0[0] - p[0]); // xx mu[4] = 1.0 - p_coupling[1] * (p0[1] - p[1]); // yy mu[8] = 1.0 - p_coupling[2] * (p0[2] - p[2]); // zz @@ -792,15 +793,15 @@ void Ensemble_PIMD::langevin(const double time_step, Atom& atom) transformation_matrix.data(), atom.mass.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_momentum_beads<<>>( number_of_atoms, atom.mass.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_momentum_beads<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, number_of_beads, velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -817,7 +818,7 @@ void Ensemble_PIMD::compute1( gpu_apply_pbc<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, number_of_atoms, number_of_beads, position_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_nve_1<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, @@ -829,7 +830,7 @@ void Ensemble_PIMD::compute1( force_beads.data(), position_beads.data(), velocity_beads.data()); - 
CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Ensemble_PIMD::compute2( @@ -848,13 +849,13 @@ atom.mass.data(), force_beads.data(), velocity_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL langevin(time_step, atom); gpu_apply_pbc<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, number_of_atoms, number_of_beads, position_beads.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_average<<<(number_of_atoms - 1) / 64 + 1, 64>>>( number_of_atoms, @@ -869,7 +870,7 @@ atom.potential_per_atom.data(), atom.force_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_kinetic_energy_virial_part<<<(number_of_atoms - 1) / 64 + 1, 64>>>( box, @@ -880,7 +881,7 @@ atom.position_per_atom.data(), kinetic_energy_virial_part.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_sum_1024<<<1024, 128>>>( number_of_atoms, @@ -888,11 +889,11 @@ atom.potential_per_atom.data(), atom.virial_per_atom.data(), sum_1024.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_thermo<<<8, 1024>>>( box.get_volume(), number_of_atoms * K_B * temperature, sum_1024.data(), thermo.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (num_target_pressure_components == 1) { double scale_factor; @@ -916,7 +917,7 @@ scale_factor[2], position_beads.data(), atom.position_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (num_target_pressure_components == 6) { double mu[9]; cpu_pressure_triclinic( diff --git a/src/integrate/ensemble_pimd.cuh b/src/integrate/ensemble_pimd.cuh index 498c46b69..3c7e6a806 100644 --- a/src/integrate/ensemble_pimd.cuh +++ b/src/integrate/ensemble_pimd.cuh @@ -15,7 +15,12 @@ #pragma once #include "ensemble.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif #include #include @@ -59,7 +64,7 @@ protected: bool thermostat_internal = false; bool thermostat_centroid = false; double omega_n; - GPU_Vector<curandState> curand_states; + GPU_Vector<gpurandState> curand_states; GPU_Vector<double> position_beads; GPU_Vector<double> velocity_beads; GPU_Vector<double> potential_beads; diff --git a/src/integrate/ensemble_ti.cu b/src/integrate/ensemble_ti.cu index f64478d1a..4a9f9849d 100644 --- a/src/integrate/ensemble_ti.cu +++ b/src/integrate/ensemble_ti.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -125,7 +126,7 @@ void Ensemble_TI::init() curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thermo_cpu.resize(thermo->size()); gpu_k.resize(N); @@ -139,11 +140,11 @@ gpu_k.copy_from_host(cpu_k.data()); gpu_espring.resize(N); position_0.resize(3 * N); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( position_0.data(), atom->position_per_atom.data(), sizeof(double) * position_0.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } void Ensemble_TI::find_thermo() diff --git a/src/integrate/ensemble_ti_as.cu b/src/integrate/ensemble_ti_as.cu index 352e94054..0cb06aea3 100644 --- a/src/integrate/ensemble_ti_as.cu +++ b/src/integrate/ensemble_ti_as.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_as.cuh" +#include "utilities/gpu_macro.cuh" Ensemble_TI_AS::Ensemble_TI_AS(const char** params, int num_params) { diff --git a/src/integrate/ensemble_ti_rs.cu b/src/integrate/ensemble_ti_rs.cu index
58440c697..918477fc4 100644 --- a/src/integrate/ensemble_ti_rs.cu +++ b/src/integrate/ensemble_ti_rs.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_rs.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_ti_spring.cu b/src/integrate/ensemble_ti_spring.cu index 0da21a50d..f633e0797 100644 --- a/src/integrate/ensemble_ti_spring.cu +++ b/src/integrate/ensemble_ti_spring.cu @@ -14,6 +14,7 @@ */ #include "ensemble_ti_spring.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -164,18 +165,18 @@ void Ensemble_TI_Spring::init() curand_states.resize(N); int grid_size = (N - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states.data(), N, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL thermo_cpu.resize(thermo->size()); gpu_k.resize(N, 0); cpu_k.resize(N, 0); gpu_espring.resize(N); position_0.resize(3 * N); - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( position_0.data(), atom->position_per_atom.data(), sizeof(double) * position_0.size(), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); if (!auto_k) { for (int i = 0; i < N; i++) { diff --git a/src/integrate/ensemble_wall_harmonic.cu b/src/integrate/ensemble_wall_harmonic.cu index adcc84df5..d7d073ba5 100644 --- a/src/integrate/ensemble_wall_harmonic.cu +++ b/src/integrate/ensemble_wall_harmonic.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_harmonic.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_wall_mirror.cu b/src/integrate/ensemble_wall_mirror.cu index 14f7685d0..8e615976b 100644 --- a/src/integrate/ensemble_wall_mirror.cu +++ b/src/integrate/ensemble_wall_mirror.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_mirror.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/ensemble_wall_piston.cu b/src/integrate/ensemble_wall_piston.cu index 7ace8a5f0..4e4982afb 100644 --- a/src/integrate/ensemble_wall_piston.cu +++ b/src/integrate/ensemble_wall_piston.cu @@ -14,6 +14,7 @@ */ #include "ensemble_wall_piston.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/integrate/integrate.cu b/src/integrate/integrate.cu index c9e815d0b..f92cc9a21 100644 --- a/src/integrate/integrate.cu +++ b/src/integrate/integrate.cu @@ -38,6 +38,7 @@ The driver class for the various integrators. #include "integrate.cuh" #include "model/atom.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" void Integrate::initialize( @@ -273,7 +274,7 @@ void Integrate::compute1( atom.position_temp.data(), atom.position_temp.data() + num_atoms, atom.position_temp.data() + num_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL ensemble->compute1(time_step, group, box, atom, thermo); @@ -288,7 +289,7 @@ atom.unwrapped_position.data(), atom.unwrapped_position.data() + num_atoms, atom.unwrapped_position.data() + num_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Integrate::compute2( diff --git a/src/integrate/langevin_utilities.cuh b/src/integrate/langevin_utilities.cuh index dd186d87d..311dc242c 100644 --- a/src/integrate/langevin_utilities.cuh +++ b/src/integrate/langevin_utilities.cuh @@ -18,20 +18,22 @@ Some CUDA kernels for Langevin thermostats.
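The curand-to-gpurand renames above and below follow the same aliasing scheme. The include lines lost in the extraction of ensemble_pimd.cuh above presumably select the hipRAND device header under USE_HIP and <curand_kernel.h> otherwise; a hedged sketch of the RNG aliases (exact hipRAND header path assumed):

    // Hypothetical sketch of the RNG aliasing in utilities/gpu_macro.cuh (assumed).
    #ifdef USE_HIP
      #include <hiprand/hiprand_kernel.h>
      #define gpurandState          hiprandState
      #define gpurand_init          hiprand_init
      #define gpurand_normal        hiprand_normal
      #define gpurand_normal_double hiprand_normal_double
    #else
      #include <curand_kernel.h>
      #define gpurandState          curandState
      #define gpurand_init          curand_init
      #define gpurand_normal        curand_normal
      #define gpurand_normal_double curand_normal_double
    #endif

Both libraries use the same init signature (seed, subsequence, offset, state pointer), which is why the kernel bodies in this diff need nothing beyond the rename.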
------------------------------------------------------------------------------*/ #pragma once -#define CURAND_NORMAL(a) curand_normal_double(a) +#include "utilities/gpu_macro.cuh" + +#define CURAND_NORMAL(a) gpurand_normal_double(a) // initialize curand states -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } // global Langevin thermostatting static __global__ void gpu_langevin( - curandState* g_state, + gpurandState* g_state, const int N, const double c1, const double c2, @@ -42,7 +44,7 @@ { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curandState state = g_state[n]; + gpurandState state = g_state[n]; double c2m = c2 * sqrt(1.0 / g_mass[n]); g_vx[n] = c1 * g_vx[n] + c2m * CURAND_NORMAL(&state); g_vy[n] = c1 * g_vy[n] + c2m * CURAND_NORMAL(&state); @@ -123,7 +125,7 @@ static __global__ void gpu_correct_momentum(const int N, double* g_vx, double* g // local Langevin thermostatting static __global__ void gpu_langevin( - curandState* g_state, + gpurandState* g_state, const int N, const int offset, const int* g_group_contents, @@ -136,7 +138,7 @@ { int m = blockIdx.x * blockDim.x + threadIdx.x; if (m < N) { - curandState state = g_state[m]; + gpurandState state = g_state[m]; int n = g_group_contents[offset + m]; double c2m = c2 * sqrt(1.0 / g_mass[n]); g_vx[n] = c1 * g_vx[n] + c2m * CURAND_NORMAL(&state); @@ -177,7 +179,7 @@ static __global__ void find_ke( } } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_ke[tid] += s_ke[tid + offset]; diff --git a/src/integrate/npt_utilities.cuh b/src/integrate/npt_utilities.cuh index 2fc01db0c..c7d7c0c24 100644 --- a/src/integrate/npt_utilities.cuh +++ b/src/integrate/npt_utilities.cuh @@ -17,6 +17,9 @@ Some CUDA kernels for position scaling in NPT ------------------------------------------------------------------------------*/ +#pragma once +#include "utilities/gpu_macro.cuh" + static __global__ void gpu_pressure_orthogonal( const int number_of_particles, const double scale_factor_x, diff --git a/src/integrate/svr_utilities.cuh b/src/integrate/svr_utilities.cuh index 6aee7985d..f7d8b7b78 100644 --- a/src/integrate/svr_utilities.cuh +++ b/src/integrate/svr_utilities.cuh @@ -22,6 +22,9 @@ [1] G. Bussi et al. J. Chem. Phys. 126, 014101 (2007). ------------------------------------------------------------------------------*/ +#pragma once +#include "utilities/gpu_macro.cuh" + static double gasdev(std::mt19937& rng) { std::uniform_real_distribution<double> rand1(0, 1); diff --git a/src/main_gpumd/add_efield.cu b/src/main_gpumd/add_efield.cu index 01bb595f5..e7a7256c4 100644 --- a/src/main_gpumd/add_efield.cu +++ b/src/main_gpumd/add_efield.cu @@ -20,6 +20,7 @@ Add electric field to a group of atoms.
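find_ke above is the first of several kernels in this diff that drop a #pragma unroll in front of the same shared-memory tree reduction (the gpu_sum_* and find_group_sum_* kernels below do the same). The loop's trip count depends on blockDim.x, which is only known at run time, so the unroll hint could never take effect; presumably it was removed because hipcc complains about it. The shared pattern, for reference:

    // Block-level tree reduction used by find_ke and the gpu_sum_* kernels.
    // The bound depends on blockDim.x at run time, so "#pragma unroll"
    // (removed throughout this diff) cannot apply here.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
      if (tid < offset) {
        s_ke[tid] += s_ke[tid + offset];
      }
      __syncthreads();
    }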
#include "add_efield.cuh" #include "model/atom.cuh" #include "model/group.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -67,7 +68,7 @@ void Add_Efield::compute(const int step, const std::vector& groups, Atom& atom.force_per_atom.data(), atom.force_per_atom.data() + num_atoms_total, atom.force_per_atom.data() + num_atoms_total * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/main_gpumd/add_force.cu b/src/main_gpumd/add_force.cu index 92bb952da..4b9f68a4f 100644 --- a/src/main_gpumd/add_force.cu +++ b/src/main_gpumd/add_force.cu @@ -20,6 +20,7 @@ Add force to a group of atoms. #include "add_force.cuh" #include "model/atom.cuh" #include "model/group.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -64,7 +65,7 @@ void Add_Force::compute(const int step, const std::vector& groups, Atom& atom.force_per_atom.data(), atom.force_per_atom.data() + num_atoms_total, atom.force_per_atom.data() + num_atoms_total * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/main_gpumd/add_random_force.cu b/src/main_gpumd/add_random_force.cu index d3672a2e3..7e76f19a9 100644 --- a/src/main_gpumd/add_random_force.cu +++ b/src/main_gpumd/add_random_force.cu @@ -19,33 +19,34 @@ Add random forces with zero mean and specified variance. #include "add_random_force.cuh" #include "model/atom.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include #include -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } static __global__ void add_random_force( const int N, const double force_variance, - curandState* g_state, + gpurandState* g_state, double* g_fx, double* g_fy, double* g_fz) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curandState state = g_state[n]; - g_fx[n] += force_variance * curand_normal_double(&state); - g_fy[n] += force_variance * curand_normal_double(&state); - g_fz[n] += force_variance * curand_normal_double(&state); + gpurandState state = g_state[n]; + g_fx[n] += force_variance * gpurand_normal_double(&state); + g_fy[n] += force_variance * gpurand_normal_double(&state); + g_fz[n] += force_variance * gpurand_normal_double(&state); g_state[n] = state; } } @@ -88,7 +89,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* s_f[tid] = f; __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; @@ -123,14 +124,14 @@ void Add_Random_Force::compute(const int step, Atom& atom) atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + atom.number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_sum_force<<<3, 1024>>>( atom.number_of_atoms, atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + 2 * atom.number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_correct_force<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, @@ -138,7 +139,7 @@ void Add_Random_Force::compute(const int step, Atom& atom) atom.force_per_atom.data(), atom.force_per_atom.data() + atom.number_of_atoms, atom.force_per_atom.data() + 2 * 
atom.number_of_atoms); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -168,7 +169,7 @@ void Add_Random_Force::parse(const char** param, int num_param, int number_of_at curand_states_.resize(number_of_atoms); int grid_size = (number_of_atoms - 1) / 128 + 1; initialize_curand_states<<<grid_size, 128>>>(curand_states_.data(), number_of_atoms, rand()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void Add_Random_Force::finalize() { num_calls_ = 0; } diff --git a/src/main_gpumd/add_random_force.cuh b/src/main_gpumd/add_random_force.cuh index 30fc6aaef..3bbc7fd55 100644 --- a/src/main_gpumd/add_random_force.cuh +++ b/src/main_gpumd/add_random_force.cuh @@ -15,7 +15,12 @@ #pragma once #include "utilities/gpu_vector.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif class Atom; @@ -27,7 +32,7 @@ public: void finalize(); private: - GPU_Vector<curandState> curand_states_; + GPU_Vector<gpurandState> curand_states_; int num_calls_ = 0; double force_variance_ = 0.0; }; diff --git a/src/main_gpumd/cohesive.cu b/src/main_gpumd/cohesive.cu index 96fb6ebcb..4c188db1e 100644 --- a/src/main_gpumd/cohesive.cu +++ b/src/main_gpumd/cohesive.cu @@ -25,6 +25,7 @@ Compute the cohesive energy curve with different deformations. #include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" static void __global__ deform_position( diff --git a/src/main_gpumd/electron_stop.cu b/src/main_gpumd/electron_stop.cu index 7e12e5000..12eb55ef5 100644 --- a/src/main_gpumd/electron_stop.cu +++ b/src/main_gpumd/electron_stop.cu @@ -21,6 +21,7 @@ Apply electron stopping. #include "model/atom.cuh" #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include #include @@ -171,21 +172,21 @@ void Electron_Stop::compute(double time_step, Atom& atom) stopping_force.data(), stopping_loss.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_average<<<3, 1024>>>(atom.number_of_atoms, stopping_force.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL apply_electron_stopping<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, stopping_force.data(), atom.force_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_power_loss<<<1, 1024>>>(atom.number_of_atoms, stopping_loss.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL double power_loss_host; - CHECK(cudaMemcpyFromSymbol( - &power_loss_host, device_power_loss, sizeof(double), 0, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpyFromSymbol( - &power_loss_host, device_power_loss, sizeof(double), 0, gpuMemcpyDeviceToHost)); stopping_power_loss += power_loss_host; } diff --git a/src/main_gpumd/main.cu b/src/main_gpumd/main.cu index 57fc332f8..e275cae1c 100644 --- a/src/main_gpumd/main.cu +++ b/src/main_gpumd/main.cu @@ -15,6 +15,7 @@ #include "run.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/main_common.cuh" #include #include @@ -33,12 +34,12 @@ int main(int argc, char* argv[]) printf("Started running GPUMD.\n"); print_line_2(); - CHECK(cudaDeviceSynchronize()); + CHECK(gpuDeviceSynchronize()); clock_t time_begin = clock(); Run run; - CHECK(cudaDeviceSynchronize()); + CHECK(gpuDeviceSynchronize()); clock_t time_finish = clock(); double time_used = (time_finish - time_begin) / double(CLOCKS_PER_SEC); diff --git a/src/main_gpumd/replicate.cu b/src/main_gpumd/replicate.cu index 5070220e1..aaa2ca009 100644 --- a/src/main_gpumd/replicate.cu +++ b/src/main_gpumd/replicate.cu @@ -14,6
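electron_stop.cu above also converts cudaMemcpyFromSymbol, and run.cu below does the same for cudaGetSymbolAddress. These two are the one place where the aliasing cannot be a bare rename: HIP expects __device__ symbols to be wrapped in HIP_SYMBOL(). A hedged sketch of how gpu_macro.cuh presumably bridges the difference:

    // Hypothetical sketch (assumed): symbol accesses differ between the two runtimes.
    #ifdef USE_HIP
      #define gpuMemcpyFromSymbol(dst, symbol, count, offset, kind) \
        hipMemcpyFromSymbol(dst, HIP_SYMBOL(symbol), count, offset, kind)
      #define gpuGetSymbolAddress(devPtr, symbol) \
        hipGetSymbolAddress(devPtr, HIP_SYMBOL(symbol))
    #else
      #define gpuMemcpyFromSymbol cudaMemcpyFromSymbol
      #define gpuGetSymbolAddress cudaGetSymbolAddress
    #endif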
+14,7 @@ */ #include "replicate.cuh" +#include "utilities/gpu_macro.cuh" void Replicate(const char** param, int num_param, Box& box, Atom& atoms, std::vector& groups) { diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu index 17db0bcbb..8c63d55d2 100644 --- a/src/main_gpumd/run.cu +++ b/src/main_gpumd/run.cu @@ -33,6 +33,7 @@ Run simulation according to the inputs in the run.in file. #include "replicate.cuh" #include "run.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include "velocity.cuh" @@ -83,7 +84,7 @@ static void calculate_time_step( } const int N = velocity_per_atom.size() / 3; double* gpu_v2_max; - CHECK(cudaGetSymbolAddress((void**)&gpu_v2_max, device_v2_max)); + CHECK(gpuGetSymbolAddress((void**)&gpu_v2_max, device_v2_max)); gpu_find_largest_v2<<<1, 1024>>>( N, (N - 1) / 1024 + 1, @@ -91,9 +92,9 @@ static void calculate_time_step( velocity_per_atom.data() + N, velocity_per_atom.data() + N * 2, gpu_v2_max); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL double cpu_v2_max[1] = {0.0}; - CHECK(cudaMemcpy(cpu_v2_max, gpu_v2_max, sizeof(double), cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_v2_max, gpu_v2_max, sizeof(double), gpuMemcpyDeviceToHost)); double cpu_v_max = sqrt(cpu_v2_max[0]); double time_step_min = max_distance_per_step / cpu_v_max; @@ -492,23 +493,26 @@ void Run::parse_one_keyword(std::vector& tokens) void Run::parse_velocity(const char** param, int num_param) { - int seed; + int seed = 0; bool use_seed = false; if (!(num_param == 2 || num_param == 4)) { PRINT_INPUT_ERROR("velocity should have 1 or 2 parameters.\n"); + } else if (num_param == 4) { + // See https://github.com/brucefan1983/GPUMD/pull/768 + // for the reason for putting this branch here. + use_seed = true; + if (!is_valid_int(param[3], &seed)) { + PRINT_INPUT_ERROR("seed should be a positive integer.\n"); + } } + if (!is_valid_real(param[1], &initial_temperature)) { PRINT_INPUT_ERROR("initial temperature should be a real number.\n"); } if (initial_temperature <= 0.0) { PRINT_INPUT_ERROR("initial temperature should be a positive number.\n"); } - if (num_param == 4) { - use_seed = true; - if (!is_valid_int(param[3], &seed)) { - PRINT_INPUT_ERROR("seed should be a positive integer.\n"); - } - } + velocity.initialize( has_velocity_in_xyz, initial_temperature, @@ -766,7 +770,7 @@ void Run::parse_change_box(const char** param, int num_param) atom.position_per_atom.data(), atom.position_per_atom.data() + number_of_atoms, atom.position_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (box.triclinic == 0) { printf(" Changed box lengths are\n"); diff --git a/src/main_gpumd/velocity.cu b/src/main_gpumd/velocity.cu index 058ca38d2..b69931c13 100644 --- a/src/main_gpumd/velocity.cu +++ b/src/main_gpumd/velocity.cu @@ -24,6 +24,7 @@ If DEBUG is off, the velocities are different in different runs. 
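The reordered parse_velocity above accepts the same run.in syntax as before; the change only moves the seed parsing ahead of the temperature checks (see the linked PR) and zero-initializes seed. For reference, the two accepted forms look like this (values illustrative; the four-token form inferred from param[3] being read as the seed):

    velocity 300
    velocity 300 seed 12345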
#include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "velocity.cuh" #include diff --git a/src/main_nep/dataset.cu b/src/main_nep/dataset.cu index 06ded508e..ab44a31de 100644 --- a/src/main_nep/dataset.cu +++ b/src/main_nep/dataset.cu @@ -18,6 +18,7 @@ #include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" void Dataset::copy_structures(std::vector& structures_input, int n1, int n2) @@ -290,7 +291,7 @@ void Dataset::find_neighbor(Parameters& para) r.data() + N * 2, NN_radial_gpu.data(), NN_angular_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL NN_radial_gpu.copy_to_host(NN_radial_cpu.data()); NN_angular_gpu.copy_to_host(NN_angular_cpu.data()); @@ -327,7 +328,7 @@ void Dataset::find_neighbor(Parameters& para) void Dataset::construct( Parameters& para, std::vector& structures_input, int n1, int n2, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); copy_structures(structures_input, n1, n2); find_has_type(para); error_cpu.resize(Nc); @@ -394,7 +395,7 @@ static __global__ void gpu_sum_force_error( std::vector Dataset::get_rmse_force(Parameters& para, const bool use_weight, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); const int block_size = 256; gpu_sum_force_error<<>>( use_weight, @@ -411,7 +412,7 @@ std::vector Dataset::get_rmse_force(Parameters& para, const bool use_weig force_ref_gpu.data() + N * 2, error_gpu.data()); int mem = sizeof(float) * Nc; - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -498,7 +499,7 @@ std::vector Dataset::get_rmse_energy( const bool do_shift, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); energy_shift_per_structure = 0.0f; const int block_size = 256; @@ -507,7 +508,7 @@ std::vector Dataset::get_rmse_energy( if (do_shift) { gpu_get_energy_shift<<>>( Na.data(), Na_sum.data(), energy.data(), energy_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); for (int n = 0; n < Nc; ++n) { energy_shift_per_structure += error_cpu[n]; } @@ -521,7 +522,7 @@ std::vector Dataset::get_rmse_energy( energy.data(), energy_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), error_gpu.data(), mem, gpuMemcpyDeviceToHost)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -589,7 +590,7 @@ static __global__ void gpu_sum_virial_error( std::vector Dataset::get_rmse_virial(Parameters& para, const bool use_weight, int device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); std::vector rmse_array(para.num_types + 1, 0.0f); std::vector count_array(para.num_types + 1, 0); @@ -607,7 +608,7 @@ std::vector Dataset::get_rmse_virial(Parameters& para, const bool use_wei virial.data(), virial_ref_gpu.data(), error_gpu.data()); - CHECK(cudaMemcpy(error_cpu.data(), error_gpu.data(), mem, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(error_cpu.data(), 
error_gpu.data(), mem, gpuMemcpyDeviceToHost)); for (int n = 0; n < Nc; ++n) { if (structures[n].has_virial) { float rmse_temp = use_weight ? weight_cpu[n] * weight_cpu[n] * error_cpu[n] : error_cpu[n]; diff --git a/src/main_nep/fitness.cu b/src/main_nep/fitness.cu index 2870d242e..c6b2c37cd 100644 --- a/src/main_nep/fitness.cu +++ b/src/main_nep/fitness.cu @@ -22,6 +22,7 @@ Get the fitness #include "parameters.cuh" #include "structure.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include #include @@ -34,7 +35,7 @@ Get the fitness Fitness::Fitness(Parameters& para) { int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); std::vector structures_train; read_structures(true, para, structures_train); @@ -63,7 +64,7 @@ Fitness::Fitness(Parameters& para) for (int device_id = 0; device_id < deviceCount; ++device_id) { print_line_1(); printf("Constructing train_set in device %d.\n", device_id); - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); train_set[batch_id][device_id].construct( para, structures_train, count - batch_size, count, device_id); print_line_2(); @@ -77,7 +78,7 @@ Fitness::Fitness(Parameters& para) for (int device_id = 0; device_id < deviceCount; ++device_id) { print_line_1(); printf("Constructing test_set in device %d.\n", device_id); - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); test_set[device_id].construct(para, structures_test, 0, structures_test.size(), device_id); print_line_2(); } @@ -133,7 +134,7 @@ void Fitness::compute( const int generation, Parameters& para, const float* population, float* fitness) { int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); int population_iter = (para.population_size - 1) / deviceCount + 1; if (generation == 0) { @@ -344,7 +345,7 @@ void Fitness::write_nep_txt(FILE* fid_nep, Parameters& para, float* elite) for (int m = 0; m < para.number_of_variables; ++m) { fprintf(fid_nep, "%15.7e\n", elite[m]); } - CHECK(cudaSetDevice(0)); + CHECK(gpuSetDevice(0)); para.q_scaler_gpu[0].copy_to_host(para.q_scaler_cpu.data()); for (int d = 0; d < para.q_scaler_cpu.size(); ++d) { fprintf(fid_nep, "%15.7e\n", para.q_scaler_cpu[d]); diff --git a/src/main_nep/main.cu b/src/main_nep/main.cu index 3cfc7a616..3157432a6 100644 --- a/src/main_nep/main.cu +++ b/src/main_nep/main.cu @@ -17,6 +17,7 @@ #include "parameters.cuh" #include "snes.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/main_common.cuh" #include #include diff --git a/src/main_nep/nep3.cu b/src/main_nep/nep3.cu index 0f59dbb7a..ece3d8d69 100644 --- a/src/main_nep/nep3.cu +++ b/src/main_nep/nep3.cu @@ -26,6 +26,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
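dataset.cu and fitness.cu above repeat one dispatch idiom for multi-GPU training: query the device count once, then select each device explicitly before touching its copy of the data. Distilled from the hunks above:

    // Per-device dispatch pattern used throughout main_nep.
    int deviceCount;
    CHECK(gpuGetDeviceCount(&deviceCount));
    for (int device_id = 0; device_id < deviceCount; ++device_id) {
      CHECK(gpuSetDevice(device_id));
      // ... launch kernels on this device's dataset[device_id] ...
    }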
#include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/nep_utilities.cuh" @@ -292,7 +293,7 @@ NEP3::NEP3( } for (int device_id = 0; device_id < deviceCount; device_id++) { - cudaSetDevice(device_id); + gpuSetDevice(device_id); annmb[device_id].dim = para.dim; annmb[device_id].num_neurons1 = para.num_neurons1; annmb[device_id].num_para = para.number_of_variables; @@ -857,14 +858,14 @@ void NEP3::find_force( { for (int device_id = 0; device_id < device_in_this_iter; ++device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); nep_data[device_id].parameters.copy_from_host( parameters + device_id * para.number_of_variables); update_potential(para, nep_data[device_id].parameters.data(), annmb[device_id]); } for (int device_id = 0; device_id < device_in_this_iter; ++device_id) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); const int block_size = 32; const int grid_size = (dataset[device_id].N - 1) / block_size + 1; @@ -894,7 +895,7 @@ void NEP3::find_force( nep_data[device_id].x12_angular.data(), nep_data[device_id].y12_angular.data(), nep_data[device_id].z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } find_descriptors_radial<<>>( @@ -908,7 +909,7 @@ void NEP3::find_force( nep_data[device_id].y12_radial.data(), nep_data[device_id].z12_radial.data(), nep_data[device_id].descriptors.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_descriptors_angular<<>>( dataset[device_id].N, @@ -922,14 +923,14 @@ void NEP3::find_force( nep_data[device_id].z12_angular.data(), nep_data[device_id].descriptors.data(), nep_data[device_id].sum_fxyz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (calculate_q_scaler) { find_max_min<<>>( dataset[device_id].N, nep_data[device_id].descriptors.data(), para.q_scaler_gpu[device_id].data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } zero_force<<>>( @@ -940,7 +941,7 @@ void NEP3::find_force( dataset[device_id].virial.data(), dataset[device_id].virial.data() + dataset[device_id].N, dataset[device_id].virial.data() + dataset[device_id].N * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (para.train_mode == 2) { apply_ann_pol<<>>( @@ -952,7 +953,7 @@ void NEP3::find_force( para.q_scaler_gpu[device_id].data(), dataset[device_id].virial.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else if (para.train_mode == 3) { apply_ann_temperature<<>>( dataset[device_id].N, @@ -964,7 +965,7 @@ void NEP3::find_force( dataset[device_id].temperature_ref_gpu.data(), dataset[device_id].energy.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { apply_ann<<>>( dataset[device_id].N, @@ -975,7 +976,7 @@ void NEP3::find_force( para.q_scaler_gpu[device_id].data(), dataset[device_id].energy.data(), nep_data[device_id].Fp.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } bool is_dipole = para.train_mode == 1; @@ -995,7 +996,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N, dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_force_angular<<>>( is_dipole, @@ -1014,7 +1015,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N, dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_force_ZBL<<>>( @@ 
-1032,7 +1033,7 @@ void NEP3::find_force( dataset[device_id].force.data() + dataset[device_id].N * 2, dataset[device_id].virial.data(), dataset[device_id].energy.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/main_nep/parameters.cu b/src/main_nep/parameters.cu index c0e76a47a..b2e8ac22c 100644 --- a/src/main_nep/parameters.cu +++ b/src/main_nep/parameters.cu @@ -16,6 +16,7 @@ #include "parameters.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -221,9 +222,9 @@ void Parameters::calculate_parameters() } int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); for (int device_id = 0; device_id < deviceCount; device_id++) { - CHECK(cudaSetDevice(device_id)); + CHECK(gpuSetDevice(device_id)); q_scaler_gpu[device_id].resize(dim); q_scaler_gpu[device_id].copy_from_host(q_scaler_cpu.data()); } @@ -932,7 +933,7 @@ void Parameters::parse_population(const char** param, int num_param) } int deviceCount; - CHECK(cudaGetDeviceCount(&deviceCount)); + CHECK(gpuGetDeviceCount(&deviceCount)); int fully_used_device = population_size % deviceCount; int population_should_increase; if (fully_used_device != 0) { diff --git a/src/main_nep/snes.cu b/src/main_nep/snes.cu index 75a04df93..2915560a5 100644 --- a/src/main_nep/snes.cu +++ b/src/main_nep/snes.cu @@ -27,15 +27,16 @@ https://doi.org/10.1145/2001576.2001692 #include "parameters.cuh" #include "snes.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include -static __global__ void initialize_curand_states(curandState* state, int N, int seed) +static __global__ void initialize_curand_states(gpurandState* state, int N, int seed) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { - curand_init(seed, n, 0, &state[n]); + gpurand_init(seed, n, 0, &state[n]); } } @@ -61,7 +62,7 @@ SNES::SNES(Parameters& para, Fitness* fitness_function) type_of_variable.resize(number_of_variables, para.num_types); initialize_rng(); - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_type_of_variable.resize(number_of_variables); gpu_index.resize(population_size * (para.num_types + 1)); gpu_utility.resize(number_of_variables); @@ -73,7 +74,7 @@ SNES::SNES(Parameters& para, Fitness* fitness_function) gpu_population.resize(N); curand_states.resize(N); initialize_curand_states<<<(N - 1) / 128 + 1, 128>>>(curand_states.data(), N, 1234567); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL initialize_mu_and_sigma(para); calculate_utility(); @@ -106,7 +107,7 @@ void SNES::initialize_mu_and_sigma(Parameters& para) } fclose(fid_restart); } - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_mu.copy_from_host(mu.data()); gpu_sigma.copy_from_host(sigma.data()); } @@ -275,15 +276,15 @@ static __global__ void gpu_create_population( const int number_of_variables, const float* g_mu, const float* g_sigma, - curandState* g_state, + gpurandState* g_state, float* g_s, float* g_population) { int n = blockIdx.x * blockDim.x + threadIdx.x; if (n < N) { int v = n % number_of_variables; - curandState state = g_state[n]; - float s = curand_normal(&state); + gpurandState state = g_state[n]; + float s = gpurand_normal(&state); g_s[n] = s; g_population[n] = g_sigma[v] * s + g_mu[v]; g_state[n] = state; @@ -292,7 +293,7 @@ static __global__ void gpu_create_population( void SNES::create_population(Parameters& para) { - 
cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 const int N = population_size * number_of_variables; gpu_create_population<<<(N - 1) / 128 + 1, 128>>>( N, @@ -302,7 +303,7 @@ void SNES::create_population(Parameters& para) curand_states.data(), gpu_s.data(), gpu_population.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_population.copy_to_host(population.data()); } @@ -346,7 +347,7 @@ static __global__ void gpu_find_L1_L2_NEP4( void SNES::regularize_NEP4(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 for (int t = 0; t <= para.num_types; ++t) { float num_variables = float(para.number_of_variables) / para.num_types; @@ -362,7 +363,7 @@ gpu_population.data(), gpu_cost_L1reg.data(), gpu_cost_L2reg.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_cost_L1reg.copy_to_host(cost_L1reg.data()); gpu_cost_L2reg.copy_to_host(cost_L2reg.data()); @@ -414,10 +415,10 @@ static __global__ void gpu_find_L1_L2( void SNES::regularize(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_find_L1_L2<<>>( number_of_variables, gpu_population.data(), gpu_cost_L1reg.data(), gpu_cost_L2reg.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_cost_L1reg.copy_to_host(cost_L1reg.data()); gpu_cost_L2reg.copy_to_host(cost_L2reg.data()); @@ -495,7 +496,7 @@ static __global__ void gpu_update_mu_and_sigma( void SNES::update_mu_and_sigma(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_type_of_variable.copy_from_host(type_of_variable.data()); gpu_index.copy_from_host(index.data()); gpu_utility.copy_from_host(utility.data()); @@ -510,12 +511,12 @@ gpu_s.data(), gpu_mu.data(), gpu_sigma.data()); - CUDA_CHECK_KERNEL; + GPU_CHECK_KERNEL; } void SNES::output_mu_and_sigma(Parameters& para) { - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 gpu_mu.copy_to_host(mu.data()); gpu_sigma.copy_to_host(sigma.data()); FILE* fid_restart = my_fopen("nep.restart", "w"); diff --git a/src/main_nep/snes.cuh b/src/main_nep/snes.cuh index 4f9352a98..4b4e7dd84 100644 --- a/src/main_nep/snes.cuh +++ b/src/main_nep/snes.cuh @@ -15,7 +15,12 @@ #pragma once #include "utilities/gpu_vector.cuh" -#include +#include "utilities/gpu_macro.cuh" +#ifdef USE_HIP + #include +#else + #include +#endif #include #include class Fitness; @@ -43,7 +48,7 @@ protected: std::vector<float> cost_L2reg; std::vector<int> type_of_variable; - GPU_Vector<curandState> curand_states; + GPU_Vector<gpurandState> curand_states; GPU_Vector<int> gpu_type_of_variable; GPU_Vector<int> gpu_index; GPU_Vector<float> gpu_utility; diff --git a/src/main_nep/structure.cu b/src/main_nep/structure.cu index 47379cad6..423065ac6 100644 --- a/src/main_nep/structure.cu +++ b/src/main_nep/structure.cu @@ -16,6 +16,7 @@ #include "parameters.cuh" #include "structure.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/makefile.hip b/src/makefile.hip new file mode 100644 index 000000000..7fcfc6d37 --- /dev/null +++ b/src/makefile.hip @@ -0,0 +1,110 @@ +########################################################### +# Note: +# 1) You can modify gfx90a according to your GPU +# architecture. +# 2) Do not remove -DUSE_HIP. +# 3) Add -DUSE_PLUMED to CFLAGS when using the PLUMED plugin +# and remove it otherwise.
+# 4) Add -DUSE_TABLE to speed up MD simulations with NEP +# using pre-computed radial functions in the descriptors +########################################################### + + +########################################################### +# some flags +########################################################### +CC = hipcc +CFLAGS = -std=c++14 -O3 --offload-arch=gfx90a -DUSE_HIP +INC = -I./ +LDFLAGS = +LIBS = -lhipblas -lhipsolver + + +########################################################### +# source files +########################################################### +SOURCES_GPUMD = \ + $(wildcard main_gpumd/*.cu) \ + $(wildcard minimize/*.cu) \ + $(wildcard phonon/*.cu) \ + $(wildcard integrate/*.cu) \ + $(wildcard mc/*.cu) \ + $(wildcard force/*.cu) \ + $(wildcard measure/*.cu) \ + $(wildcard model/*.cu) \ + $(wildcard utilities/*.cu) +SOURCES_NEP = \ + $(wildcard main_nep/*.cu) \ + $(wildcard utilities/*.cu) + + +########################################################### +# object files +########################################################### +OBJ_GPUMD = $(SOURCES_GPUMD:.cu=.o) +OBJ_NEP = $(SOURCES_NEP:.cu=.o) + + +########################################################### +# headers +########################################################### +HEADERS = \ + $(wildcard utilities/*.cuh) \ + $(wildcard main_gpumd/*.cuh) \ + $(wildcard integrate/*.cuh) \ + $(wildcard mc/*.cuh) \ + $(wildcard minimize/*.cuh) \ + $(wildcard force/*.cuh) \ + $(wildcard measure/*.cuh) \ + $(wildcard model/*.cuh) \ + $(wildcard phonon/*.cuh) \ + $(wildcard main_nep/*.cuh) + + +########################################################### +# executables +########################################################### +all: gpumd nep +gpumd: $(OBJ_GPUMD) + $(CC) $(LDFLAGS) $^ -o $@ $(LIBS) + @echo ================================================= + @echo The gpumd executable is successfully compiled! + @echo ================================================= +nep: $(OBJ_NEP) + $(CC) $(LDFLAGS) $^ -o $@ $(LIBS) + @echo ================================================= + @echo The nep executable is successfully compiled! + @echo ================================================= + + +########################################################### +# rules for building object files +########################################################### +integrate/%.o: integrate/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +mc/%.o: mc/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +minimize/%.o: minimize/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +force/%.o: force/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +measure/%.o: measure/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +main_gpumd/%.o: main_gpumd/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +utilities/%.o: utilities/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +model/%.o: model/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +phonon/%.o: phonon/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ +main_nep/%.o: main_nep/%.cu $(HEADERS) + $(CC) $(CFLAGS) $(INC) -c $< -o $@ + + +########################################################### +# clean up +########################################################### +clean: + rm -f */*.o gpumd nep + diff --git a/src/mc/mc.cu b/src/mc/mc.cu index f078c2aef..8cf2746a6 100644 --- a/src/mc/mc.cu +++ b/src/mc/mc.cu @@ -22,6 +22,7 @@ The driver class for the various MC ensembles. 
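As a usage note (assumed, not part of the diff): from src/, running something like

    make -f makefile.hip -j

should build both the gpumd and nep executables against the HIP toolchain; adjust --offload-arch=gfx90a in CFLAGS to the target architecture (for instance gfx908 or gfx942), per note 1 in the header above.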
#include "mc_ensemble_sgc.cuh" #include "model/atom.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu index 7fc22271c..05def352f 100644 --- a/src/mc/mc_ensemble.cu +++ b/src/mc/mc_ensemble.cu @@ -19,6 +19,7 @@ The abstract base class (ABC) for the MC_Ensemble classes. #include "mc_ensemble.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu index 6488c4d20..5a08b8c53 100644 --- a/src/mc/mc_ensemble_canonical.cu +++ b/src/mc/mc_ensemble_canonical.cu @@ -18,6 +18,7 @@ The canonical ensemble for MCMD. ------------------------------------------------------------------------------*/ #include "mc_ensemble_canonical.cuh" +#include "utilities/gpu_macro.cuh" MC_Ensemble_Canonical::MC_Ensemble_Canonical( const char** param, int num_param, int num_steps_mc_input) @@ -243,7 +244,7 @@ void MC_Ensemble_Canonical::compute( type_j = atom.cpu_type[j]; } - CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int))); + CHECK(gpuMemset(NN_ij.data(), 0, sizeof(int))); get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, box, @@ -255,7 +256,7 @@ void MC_Ensemble_Canonical::compute( atom.position_per_atom.data() + atom.number_of_atoms * 2, NN_ij.data(), NL_ij.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL int NN_ij_cpu; NN_ij.copy_to_host(&NN_ij_cpu); @@ -269,7 +270,7 @@ void MC_Ensemble_Canonical::compute( atom.type.data(), type_before.data(), type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_local_types<<<(NN_ij_cpu - 1) / 64 + 1, 64>>>( NN_ij_cpu, @@ -278,10 +279,10 @@ void MC_Ensemble_Canonical::compute( type_after.data(), local_type_before.data(), local_type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); - CHECK(cudaMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); + CHECK(gpuMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); + CHECK(gpuMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, NN_ij_cpu, @@ -306,7 +307,7 @@ void MC_Ensemble_Canonical::compute( x12_angular.data(), y12_angular.data(), z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL nep_energy.find_energy( NN_ij_cpu, diff --git a/src/mc/mc_ensemble_sgc.cu b/src/mc/mc_ensemble_sgc.cu index fa3e5f62a..d68edea39 100644 --- a/src/mc/mc_ensemble_sgc.cu +++ b/src/mc/mc_ensemble_sgc.cu @@ -27,6 +27,7 @@ integration across phase boundaries, Phys. Rev. B 86, 134204 (2012). 
------------------------------------------------------------------------------*/ #include "mc_ensemble_sgc.cuh" +#include "utilities/gpu_macro.cuh" #include const std::map MASS_TABLE{ @@ -359,7 +360,7 @@ void MC_Ensemble_SGC::compute( type_j = types[index_new_species]; } - CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int))); + CHECK(gpuMemset(NN_ij.data(), 0, sizeof(int))); get_neighbors_of_i<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, box, @@ -370,14 +371,14 @@ void MC_Ensemble_SGC::compute( atom.position_per_atom.data() + atom.number_of_atoms * 2, NN_ij.data(), NL_ij.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL int NN_ij_cpu; NN_ij.copy_to_host(&NN_ij_cpu); get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, i, type_j, atom.type.data(), type_before.data(), type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_local_types<<<(NN_ij_cpu - 1) / 64 + 1, 64>>>( NN_ij_cpu, @@ -386,10 +387,10 @@ void MC_Ensemble_SGC::compute( type_after.data(), local_type_before.data(), local_type_after.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - CHECK(cudaMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); - CHECK(cudaMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); + CHECK(gpuMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size())); + CHECK(gpuMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size())); create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>( atom.number_of_atoms, NN_ij_cpu, @@ -414,7 +415,7 @@ void MC_Ensemble_SGC::compute( x12_angular.data(), y12_angular.data(), z12_angular.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL nep_energy.find_energy( NN_ij_cpu, diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu index da0c9575c..34c579bf7 100644 --- a/src/mc/nep_energy.cu +++ b/src/mc/nep_energy.cu @@ -23,6 +23,7 @@ heat transport, Phys. Rev. B. 104, 104309 (2021). 
#include "nep_energy.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/nep_utilities.cuh" #include #include @@ -470,7 +471,7 @@ void NEP_Energy::find_energy( g_y12_angular, g_z12_angular, g_pe); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (zbl.enabled) { find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>( @@ -484,6 +485,6 @@ void NEP_Energy::find_energy( g_y12_angular, g_z12_angular, g_pe); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } diff --git a/src/measure/active.cu b/src/measure/active.cu index b50f57a08..90f0cf318 100644 --- a/src/measure/active.cu +++ b/src/measure/active.cu @@ -23,6 +23,7 @@ Run active learning on-the-fly during MD #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -193,7 +194,7 @@ void Active::process( // Reset mean vectors to zero initialize_mean_vectors<<<(3 * number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, mean_force_.data(), mean_force_sq_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Loop backwards over files to evaluate the main potential last, keeping it's properties intact for (int potential_index = number_of_potentials - 1; potential_index >= 0; potential_index--) { @@ -205,7 +206,7 @@ void Active::process( atom.force_per_atom.data() + number_of_atoms * 2, atom.potential_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute new potential properties force.potentials[potential_index]->compute( box, @@ -223,12 +224,12 @@ void Active::process( atom.force_per_atom.data(), atom.force_per_atom.data() + number_of_atoms, atom.force_per_atom.data() + number_of_atoms * 2); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Sum mean and mean_sq on GPU, move sum to CPU compute_uncertainty<<<(number_of_atoms - 1) / 128 + 1, 128>>>( number_of_atoms, mean_force_.data(), mean_force_sq_.data(), gpu_uncertainty_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_uncertainty_.copy_to_host(cpu_uncertainty_.data()); double uncertainty = -1.0; for (int i = 0; i < number_of_atoms; i++) { diff --git a/src/measure/compute.cu b/src/measure/compute.cu index 999ab0e51..31745ad87 100644 --- a/src/measure/compute.cu +++ b/src/measure/compute.cu @@ -20,6 +20,7 @@ Compute block (space) averages of various per-atom quantities. 
#include "compute.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -186,7 +187,7 @@ static __global__ void find_group_sum_1( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -232,7 +233,7 @@ static __global__ void find_group_sum_3( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_fx[tid] += s_fx[tid + offset]; @@ -278,14 +279,14 @@ void Compute::process( velocity_per_atom.data() + N, velocity_per_atom.data() + 2 * N, gpu_per_atom_x.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_1<<>>( group[grouping_method].size.data(), group[grouping_method].size_sum.data(), group[grouping_method].contents.data(), gpu_per_atom_x.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng; } if (compute_potential) { @@ -295,7 +296,7 @@ void Compute::process( group[grouping_method].contents.data(), potential_per_atom.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng; } if (compute_force) { @@ -307,7 +308,7 @@ void Compute::process( force_per_atom.data() + N, force_per_atom.data() + 2 * N, gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_virial) { @@ -319,7 +320,7 @@ void Compute::process( virial_per_atom.data() + N, virial_per_atom.data() + N * 2, gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_jp) { @@ -344,7 +345,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -354,7 +355,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_jk) { @@ -368,7 +369,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -378,7 +379,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } if (compute_momentum) { @@ -391,7 +392,7 @@ void Compute::process( gpu_per_atom_x.data(), gpu_per_atom_y.data(), gpu_per_atom_z.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_group_sum_3<<>>( group[grouping_method].size.data(), @@ -401,7 +402,7 @@ void Compute::process( gpu_per_atom_y.data(), gpu_per_atom_z.data(), gpu_group_sum.data() + offset); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL offset += Ng * 3; } diff --git a/src/measure/compute_heat.cu b/src/measure/compute_heat.cu index f7103a40d..50af54d4a 100644 --- a/src/measure/compute_heat.cu +++ b/src/measure/compute_heat.cu @@ -26,6 +26,7 @@ https://doi.org/10.1103/PhysRevB.92.094301 #include "compute_heat.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" namespace { @@ -90,7 +91,7 @@ void compute_heat( heat_per_atom.data() + N * 2, heat_per_atom.data() + N * 3, heat_per_atom.data() + N * 4); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } namespace @@ -160,5 +161,5 @@ void compute_heat( heat_per_atom.data(), heat_per_atom.data() + N, heat_per_atom.data() + N * 2); - 
CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } diff --git a/src/measure/dos.cu b/src/measure/dos.cu index 910daee9b..f8e53cc0f 100644 --- a/src/measure/dos.cu +++ b/src/measure/dos.cu @@ -29,6 +29,7 @@ Reference for DOS: #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -225,7 +226,7 @@ void DOS::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU normalize_vac(); output_vac(); @@ -301,7 +302,7 @@ void DOS::copy_mass(const GPU_Vector& mass) const int offset = (group_id_ < 0) ? 0 : group_->cpu_size_sum[group_id_]; gpu_copy_mass<<<(num_atoms_ - 1) / 128 + 1, 128>>>( num_atoms_, group_->contents.data() + offset, mass.data(), mass_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -338,7 +339,7 @@ void DOS::copy_velocity(const int correlation_step, const GPU_Vector& ve } } } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void DOS::find_vac(const int correlation_step) @@ -378,7 +379,7 @@ void DOS::find_vac(const int correlation_step) vacy_.data(), vacz_.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void DOS::normalize_vac() diff --git a/src/measure/dump_beads.cu b/src/measure/dump_beads.cu index 35f020b0f..25d59a195 100644 --- a/src/measure/dump_beads.cu +++ b/src/measure/dump_beads.cu @@ -23,6 +23,7 @@ Dump bead data in PIMD-related run #include "utilities/common.cuh" #include "utilities/error.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" void Dump_Beads::parse(const char** param, int num_param) diff --git a/src/measure/dump_dipole.cu b/src/measure/dump_dipole.cu index a9f29f126..ee0d6282e 100644 --- a/src/measure/dump_dipole.cu +++ b/src/measure/dump_dipole.cu @@ -23,6 +23,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. 
#include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -51,7 +52,7 @@ static __global__ void sum_dipole( __syncthreads(); // aggregate the patches in parallel -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_d[tid] += s_d[tid + offset]; @@ -167,7 +168,7 @@ void Dump_Dipole::process( atom_copy.potential_per_atom.data(), atom_copy.virial_per_atom.data(), gpu_dipole_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute the dipole // Use the positions and types from the existing atoms object, @@ -188,7 +189,7 @@ void Dump_Dipole::process( number_of_atoms_per_thread, atom_copy.virial_per_atom.data(), gpu_dipole_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Transfer gpu_sum to the CPU gpu_dipole_.copy_to_host(cpu_dipole_.data()); diff --git a/src/measure/dump_exyz.cu b/src/measure/dump_exyz.cu index 1620866de..079120a2b 100644 --- a/src/measure/dump_exyz.cu +++ b/src/measure/dump_exyz.cu @@ -22,6 +22,7 @@ Dump some data to dump.xyz in the extended XYZ format #include "model/box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" diff --git a/src/measure/dump_force.cu b/src/measure/dump_force.cu index e238cd4a6..136c1a22d 100644 --- a/src/measure/dump_force.cu +++ b/src/measure/dump_force.cu @@ -22,6 +22,7 @@ Dump force data to a file at a given interval. #include "parse_utilities.cuh" #include "utilities/error.cuh" #include "utilities/gpu_vector.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -124,7 +125,7 @@ void Dump_Force::process( for (int d = 0; d < 3; ++d) { double* cpu_f = cpu_force_per_atom.data() + group_size * d; double* gpu_f = gpu_force_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_f, gpu_f, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_f, gpu_f, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } for (int n = 0; n < group_size; n++) { fprintf( diff --git a/src/measure/dump_netcdf.cu b/src/measure/dump_netcdf.cu index faddc4ab6..8f74d8674 100644 --- a/src/measure/dump_netcdf.cu +++ b/src/measure/dump_netcdf.cu @@ -36,6 +36,7 @@ http://ambermd.org/netcdf/nctraj.xhtml #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/dump_observer.cu b/src/measure/dump_observer.cu index cf5968f9e..49d49e85e 100644 --- a/src/measure/dump_observer.cu +++ b/src/measure/dump_observer.cu @@ -21,6 +21,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. 
#include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -182,7 +183,7 @@ void Dump_Observer::process( atom.force_per_atom.data() + number_of_atoms * 2, atom.potential_per_atom.data(), atom.virial_per_atom.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute new potential properties force.potentials[potential_index]->compute( box, diff --git a/src/measure/dump_polarizability.cu b/src/measure/dump_polarizability.cu index a601b180b..89b8b46c0 100644 --- a/src/measure/dump_polarizability.cu +++ b/src/measure/dump_polarizability.cu @@ -22,6 +22,7 @@ Dump energy/force/virial with all loaded potentials at a given interval. #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -55,7 +56,7 @@ static __global__ void sum_polarizability( __syncthreads(); // aggregate the patches in parallel -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_p[tid] += s_p[tid + offset]; @@ -168,7 +169,7 @@ void Dump_Polarizability::process( atom_copy.potential_per_atom.data(), atom_copy.virial_per_atom.data(), gpu_pol_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Compute the dipole // Use the positions and types from the existing atoms object, @@ -187,7 +188,7 @@ void Dump_Polarizability::process( const int number_of_atoms_per_thread = (number_of_atoms - 1) / number_of_threads + 1; sum_polarizability<<<6, number_of_threads>>>( number_of_atoms, number_of_atoms_per_thread, atom_copy.virial_per_atom.data(), gpu_pol_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Transfer gpu_sum to the CPU gpu_pol_.copy_to_host(cpu_pol_.data()); diff --git a/src/measure/dump_position.cu b/src/measure/dump_position.cu index 177be6f22..40446e51a 100644 --- a/src/measure/dump_position.cu +++ b/src/measure/dump_position.cu @@ -22,6 +22,7 @@ Dump position data to movie.xyz. 
#include "model/group.cuh" #include "parse_utilities.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" @@ -169,7 +170,7 @@ void Dump_Position::process( for (int d = 0; d < 3; ++d) { double* cpu_data = cpu_position_per_atom.data() + num_atoms_total * d; double* gpu_data = gpu_position_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_data, gpu_data, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_data, gpu_data, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } fprintf(fid_, "%d\n", group_size); output_line2(box, cpu_atom_symbol); diff --git a/src/measure/dump_restart.cu b/src/measure/dump_restart.cu index 990edb0b3..e91d2531f 100644 --- a/src/measure/dump_restart.cu +++ b/src/measure/dump_restart.cu @@ -22,6 +22,7 @@ Dump a restart file #include "model/group.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/dump_shock_nemd.cu b/src/measure/dump_shock_nemd.cu index d94110054..cdef34860 100644 --- a/src/measure/dump_shock_nemd.cu +++ b/src/measure/dump_shock_nemd.cu @@ -14,6 +14,7 @@ */ #include "dump_shock_nemd.cuh" +#include "utilities/gpu_macro.cuh" #include namespace diff --git a/src/measure/dump_shock_nemd.cuh b/src/measure/dump_shock_nemd.cuh index fee72bb19..5d235e089 100644 --- a/src/measure/dump_shock_nemd.cuh +++ b/src/measure/dump_shock_nemd.cuh @@ -20,7 +20,6 @@ #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" -#include #include #include diff --git a/src/measure/dump_thermo.cu b/src/measure/dump_thermo.cu index f7877dd56..00ead6de6 100644 --- a/src/measure/dump_thermo.cu +++ b/src/measure/dump_thermo.cu @@ -21,6 +21,7 @@ Dump thermo data to a file at a given interval. #include "model/box.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" diff --git a/src/measure/dump_velocity.cu b/src/measure/dump_velocity.cu index dec3d69a4..faf1421bd 100644 --- a/src/measure/dump_velocity.cu +++ b/src/measure/dump_velocity.cu @@ -22,6 +22,7 @@ Dump velocity data to a file at a given interval. #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" #include @@ -129,7 +130,7 @@ void Dump_Velocity::process( for (int d = 0; d < 3; ++d) { double* cpu_v = cpu_velocity_per_atom.data() + num_atoms_total * d; double* gpu_v = gpu_velocity_tmp.data() + group_size * d; - CHECK(cudaMemcpy(cpu_v, gpu_v, sizeof(double) * group_size, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(cpu_v, gpu_v, sizeof(double) * group_size, gpuMemcpyDeviceToHost)); } for (int n = 0; n < group_size; n++) { fprintf( diff --git a/src/measure/hac.cu b/src/measure/hac.cu index b80634ed2..61bf3785e 100644 --- a/src/measure/hac.cu +++ b/src/measure/hac.cu @@ -20,6 +20,7 @@ Calculate the heat current autocorrelation (HAC) function. 
#include "compute_heat.cuh" #include "hac.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -56,7 +57,7 @@ gpu_sum_heat(const int N, const int Nd, const int nd, const double* g_heat, doub } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -88,7 +89,7 @@ void HAC::process( int nd = (step + 1) / sample_interval - 1; int Nd = number_of_steps / sample_interval; gpu_sum_heat<<>>(N, Nd, nd, heat_per_atom.data(), heat_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Calculate the Heat current Auto-Correlation function (HAC) @@ -129,7 +130,7 @@ __global__ void gpu_find_hac(const int Nc, const int Nd, const double* g_heat, d } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_hac_xi[tid] += s_hac_xi[tid + offset]; @@ -182,7 +183,7 @@ void HAC::postprocess( // Here, the block size is fixed to 128, which is a good choice gpu_find_hac<<>>(Nc, Nd, heat_all.data(), hac_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL hac_gpu.copy_to_host(hac_cpu.data()); diff --git a/src/measure/hnemd_kappa.cu b/src/measure/hnemd_kappa.cu index 5dffa5539..1cc09ede8 100644 --- a/src/measure/hnemd_kappa.cu +++ b/src/measure/hnemd_kappa.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). #include "hnemd_kappa.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -55,7 +56,7 @@ gpu_sum_heat(const int N, const int step, const double* g_heat, double* g_heat_s } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -86,7 +87,7 @@ void HNEMD::process( compute_heat(virial_per_atom, velocity_per_atom, heat_per_atom); gpu_sum_heat<<>>(N, step, heat_per_atom.data(), heat_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (output_flag) { const int num = NUM_OF_HEAT_COMPONENTS * output_interval; diff --git a/src/measure/hnemdec_kappa.cu b/src/measure/hnemdec_kappa.cu index a19f2618a..fd4d99497 100644 --- a/src/measure/hnemdec_kappa.cu +++ b/src/measure/hnemdec_kappa.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). 
#include "hnemdec_kappa.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -98,7 +99,7 @@ static __global__ void gpu_sum_heat_and_diffusion( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -122,7 +123,7 @@ static __global__ void gpu_sum_heat_and_diffusion( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data[tid] += s_data[tid + offset]; @@ -166,7 +167,7 @@ void HNEMDEC::process( heat_per_atom.data(), heat_all.data(), diffusion_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (output_flag) { const int heat_num = NUM_OF_HEAT_COMPONENTS * output_interval; diff --git a/src/measure/lsqt.cu b/src/measure/lsqt.cu index 17df92f6e..19ac73cd6 100644 --- a/src/measure/lsqt.cu +++ b/src/measure/lsqt.cu @@ -18,6 +18,7 @@ #include "model/atom.cuh" #include "model/box.cuh" #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include @@ -322,37 +323,37 @@ void find_moments_chebyshev( double Em_inv = 1.0 / Em; double *s0r, *s1r, *s2r, *s0i, *s1i, *s2i, *moments_tmp; - cudaMalloc((void**)&s0r, sizeof(double) * N); - cudaMalloc((void**)&s1r, sizeof(double) * N); - cudaMalloc((void**)&s2r, sizeof(double) * N); - cudaMalloc((void**)&s0i, sizeof(double) * N); - cudaMalloc((void**)&s1i, sizeof(double) * N); - cudaMalloc((void**)&s2i, sizeof(double) * N); - cudaMalloc((void**)&moments_tmp, memory_moments_tmp); + gpuMalloc((void**)&s0r, sizeof(double) * N); + gpuMalloc((void**)&s1r, sizeof(double) * N); + gpuMalloc((void**)&s2r, sizeof(double) * N); + gpuMalloc((void**)&s0i, sizeof(double) * N); + gpuMalloc((void**)&s1i, sizeof(double) * N); + gpuMalloc((void**)&s2i, sizeof(double) * N); + gpuMalloc((void**)&moments_tmp, memory_moments_tmp); // T_0(H) gpu_copy_state<<>>(N, srr, sri, s0r, s0i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s0r, s0i, slr, sli, moments_tmp, 0 * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_1(H) gpu_apply_hamiltonian<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s1r, s1i, slr, sli, moments_tmp, 1 * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_m(H) (m >= 2) for (int m = 2; m < Nm; ++m) { gpu_kernel_polynomial<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i, s2r, s2i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_inner_product_1<<>>( N, s2r, s2i, slr, sli, moments_tmp, m * grid_size); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // permute the pointers; do not need to copy the data double* temp_real; double* temp_imag; @@ -368,15 +369,15 @@ void find_moments_chebyshev( gpu_find_inner_product_2<<>>( number_of_blocks, number_of_patches, moments_tmp, moments); - CUDA_CHECK_KERNEL - - cudaFree(s0r); - cudaFree(s0i); - cudaFree(s1r); - cudaFree(s1i); - cudaFree(s2r); - cudaFree(s2i); - cudaFree(moments_tmp); + GPU_CHECK_KERNEL + + gpuFree(s0r); + gpuFree(s0i); + gpuFree(s1r); + gpuFree(s1i); + gpuFree(s2r); + gpuFree(s2i); + gpuFree(moments_tmp); } // Jackson damping @@ -435,28 +436,28 @@ void evolve( double* s0i; double* s1i; double* s2i; - cudaMalloc((void**)&s0r, sizeof(double) * N); - cudaMalloc((void**)&s0i, sizeof(double) * N); - cudaMalloc((void**)&s1r, sizeof(double) * N); - cudaMalloc((void**)&s1i, 
sizeof(double) * N); - cudaMalloc((void**)&s2r, sizeof(double) * N); - cudaMalloc((void**)&s2i, sizeof(double) * N); + gpuMalloc((void**)&s0r, sizeof(double) * N); + gpuMalloc((void**)&s0i, sizeof(double) * N); + gpuMalloc((void**)&s1r, sizeof(double) * N); + gpuMalloc((void**)&s1i, sizeof(double) * N); + gpuMalloc((void**)&s2r, sizeof(double) * N); + gpuMalloc((void**)&s2i, sizeof(double) * N); // T_0(H) |psi> = |psi> gpu_copy_state<<>>(N, sr, si, s0r, s0i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // T_1(H) |psi> = H |psi> gpu_apply_hamiltonian<<>>( N, Em_inv, NN, NL, U, Hr, Hi, sr, si, s1r, s1i); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // |final_state> = c_0 * T_0(H) |psi> + c_1 * T_1(H) |psi> double bessel_0 = j0(time_step_scaled); double bessel_1 = 2.0 * j1(time_step_scaled); gpu_chebyshev_01<<>>( N, s0r, s0i, s1r, s1i, sr, si, bessel_0, bessel_1, direction); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL for (int m = 2; m < 1000000; ++m) { double bessel_m = jn(m, time_step_scaled); @@ -477,7 +478,7 @@ void evolve( } gpu_chebyshev_2<<>>( N, Em_inv, NN, NL, U, Hr, Hi, s0r, s0i, s1r, s1i, s2r, s2i, sr, si, bessel_m, label); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // permute the pointers; do not need to copy the data double *temp_real, *temp_imag; @@ -490,12 +491,12 @@ void evolve( s2r = temp_real; s2i = temp_imag; } - cudaFree(s0r); - cudaFree(s0i); - cudaFree(s1r); - cudaFree(s1i); - cudaFree(s2r); - cudaFree(s2i); + gpuFree(s0r); + gpuFree(s0i); + gpuFree(s1r); + gpuFree(s1i); + gpuFree(s2r); + gpuFree(s2i); } #ifdef USE_GRAPHENE_TB @@ -756,7 +757,7 @@ void LSQT::process(Atom& atom, Box& box, const int step) Hr.data(), Hi.data(), xx.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL find_dos_and_velocity(atom, box); find_sigma(atom, box, step); @@ -854,7 +855,7 @@ void LSQT::find_sigma(Atom& atom, Box& box, const int step) sli.data(), srr.data(), sri.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { evolve( number_of_orbitals, @@ -894,7 +895,7 @@ void LSQT::find_sigma(Atom& atom, Box& box, const int step) sli.data(), scr.data(), sci.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL std::vector vac(number_of_energy_points); diff --git a/src/measure/measure.cu b/src/measure/measure.cu index f1057e35e..9441c5c92 100644 --- a/src/measure/measure.cu +++ b/src/measure/measure.cu @@ -20,6 +20,7 @@ The driver class dealing with measurement. 
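The "permute the pointers" blocks in the lsqt.cu hunks above are the usual three-buffer Chebyshev recursion: each iteration builds T_m from T_{m-1} and T_{m-2}, and instead of copying state vectors, the scratch buffers rotate roles. Spelled out with the same s0/s1/s2 names as the code:

// After computing T_m into (s2r, s2i) from T_{m-1} in (s1r, s1i) and
// T_{m-2} in (s0r, s0i), rotate the pointers so the next iteration
// overwrites the oldest buffer. No device memory moves.
double* temp_real = s0r;
double* temp_imag = s0i;
s0r = s1r; s0i = s1i; // T_{m-1} becomes the new T_{m-2}
s1r = s2r; s1i = s2i; // T_m becomes the new T_{m-1}
s2r = temp_real;      // the old T_{m-2} storage will receive T_{m+1}
s2i = temp_imag;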
#include "measure.cuh" #include "model/atom.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #define NUM_OF_HEAT_COMPONENTS 5 diff --git a/src/measure/modal_analysis.cu b/src/measure/modal_analysis.cu index 7c0b51b57..ba79f3490 100644 --- a/src/measure/modal_analysis.cu +++ b/src/measure/modal_analysis.cu @@ -31,6 +31,7 @@ GPUMD Contributing author: Alexander Gabourie (Stanford University) #include "modal_analysis.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #define NUM_OF_HEAT_COMPONENTS 5 @@ -89,7 +90,7 @@ static __device__ void gpu_bin_reduce( } __syncthreads(); -#pragma unroll + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_data_xin[tid] += s_data_xin[tid + offset]; @@ -252,7 +253,7 @@ void MODAL_ANALYSIS::compute_heat( mvx.data(), mvy.data(), mvz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // Scale stress tensor by inv(sqrt(mass)) prepare_sm<<>>( @@ -271,16 +272,16 @@ void MODAL_ANALYSIS::compute_heat( smx.data(), smy.data(), smz.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL const float alpha = 1.0; const float beta = 0.0; int stride = 1; // Calculate modal velocities - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -291,9 +292,9 @@ void MODAL_ANALYSIS::compute_heat( &beta, xdotx.data(), stride); - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -304,9 +305,9 @@ void MODAL_ANALYSIS::compute_heat( &beta, xdoty.data(), stride); - cublasSgemv( + gpublasSgemv( ma_handle, - CUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, num_participating, &alpha, @@ -320,10 +321,10 @@ void MODAL_ANALYSIS::compute_heat( // Calculate intermediate value // (i.e. 
heat current without modal velocities) - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -335,10 +336,10 @@ void MODAL_ANALYSIS::compute_heat( &beta, jmx.data(), num_modes); - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -350,10 +351,10 @@ void MODAL_ANALYSIS::compute_heat( &beta, jmy.data(), num_modes); - cublasSgemm( + gpublasSgemm( ma_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, + GPUBLAS_OP_N, + GPUBLAS_OP_N, num_modes, 3, num_participating, @@ -367,9 +368,9 @@ void MODAL_ANALYSIS::compute_heat( num_modes); // calculate modal heat current - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmx.data(), @@ -378,9 +379,9 @@ void MODAL_ANALYSIS::compute_heat( stride, jmx.data(), num_modes); - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmy.data(), @@ -389,9 +390,9 @@ void MODAL_ANALYSIS::compute_heat( stride, jmy.data(), num_modes); - cublasSdgmm( + gpublasSdgmm( ma_handle, - CUBLAS_SIDE_LEFT, + GPUBLAS_SIDE_LEFT, num_modes, 3, jmz.data(), @@ -410,7 +411,7 @@ void MODAL_ANALYSIS::compute_heat( gpu_update_jm <<>>(num_modes, jmx.data(), jmy.data(), jmz.data(), jm.data()); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void MODAL_ANALYSIS::setN(const std::vector& cpu_type_size) @@ -543,9 +544,9 @@ void MODAL_ANALYSIS::preprocess( rsqrtmass.resize(num_participating, Memory_Type::managed); gpu_set_mass_terms<<<(num_participating - 1) / BLOCK_SIZE + 1, BLOCK_SIZE>>>( num_participating, N1, mass.data(), sqrtmass.data(), rsqrtmass.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL - cublasCreate(&ma_handle); + gpublasCreate(&ma_handle); } void MODAL_ANALYSIS::process( @@ -568,18 +569,18 @@ void MODAL_ANALYSIS::process( gpu_bin_modes<<>>( num_modes, bin_count.data(), bin_sum.data(), num_bins, jm.data(), bin_out.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL if (method == HNEMA_METHOD) { float factor = KAPPA_UNIT_CONVERSION / (volume * temperature * fe * (float)samples_per_output); int num_bins_stored = num_bins * NUM_OF_HEAT_COMPONENTS; gpu_scale_jm<<<(num_bins_stored - 1) / BLOCK_SIZE + 1, BLOCK_SIZE>>>( num_bins_stored, factor, bin_out.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } // Compute thermal conductivity and output - cudaDeviceSynchronize(); // ensure GPU ready to move data to CPU + gpuDeviceSynchronize(); // ensure GPU ready to move data to CPU FILE* fid = fopen(output_file_position, "a"); for (int i = 0; i < num_bins; i++) { fprintf( @@ -597,7 +598,7 @@ void MODAL_ANALYSIS::process( if (method == HNEMA_METHOD) { int grid_size = (num_heat_stored - 1) / BLOCK_SIZE + 1; gpu_reset_data<<>>(num_heat_stored, jm.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -605,5 +606,5 @@ void MODAL_ANALYSIS::postprocess() { if (!compute) return; - cublasDestroy(ma_handle); + gpublasDestroy(ma_handle); } diff --git a/src/measure/modal_analysis.cuh b/src/measure/modal_analysis.cuh index db52f8ebd..d05dc4ba6 100644 --- a/src/measure/modal_analysis.cuh +++ b/src/measure/modal_analysis.cuh @@ -21,7 +21,11 @@ GPUMD Contributing author: Alexander Gabourie (Stanford University) #include "utilities/common.cuh" #include "utilities/gpu_vector.cuh" -#include +#ifdef USE_HIP + #include +#else + #include +#endif #include #include #include @@ -97,7 +101,7 @@ private: int num_participating; // Number of particles participating int num_heat_stored; 
// Number of stored heat current elements - cublasHandle_t ma_handle; + gpublasHandle_t ma_handle; // stress by by square root mass (intermediate term) GPU_Vector smx; diff --git a/src/measure/msd.cu b/src/measure/msd.cu index b62e0e018..0eb4d7755 100644 --- a/src/measure/msd.cu +++ b/src/measure/msd.cu @@ -24,6 +24,7 @@ Calculate: #include "parse_utilities.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -184,7 +185,7 @@ void MSD::process(const int step, const std::vector& groups, const GPU_Ve y_.data() + step_offset, z_.data() + step_offset); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // start to calculate the MSD when we have enough frames if (sample_step >= num_correlation_steps_ - 1) { @@ -202,7 +203,7 @@ void MSD::process(const int step, const std::vector& groups, const GPU_Ve msdx_.data(), msdy_.data(), msdz_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -211,7 +212,7 @@ void MSD::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU // normalize by the number of atoms and number of time origins const double msd_scaler = 1.0 / ((double)num_atoms_ * (double)num_time_origins_); diff --git a/src/measure/parse_utilities.cu b/src/measure/parse_utilities.cu index 8bd34956d..2af131525 100644 --- a/src/measure/parse_utilities.cu +++ b/src/measure/parse_utilities.cu @@ -19,6 +19,7 @@ A function parsing the "group" option in some keywords #include "model/group.cuh" #include "parse_utilities.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/measure/plumed.cu b/src/measure/plumed.cu index d5e150f2b..e195338ce 100644 --- a/src/measure/plumed.cu +++ b/src/measure/plumed.cu @@ -22,6 +22,7 @@ Interface to the PLUMED plugin: https://www.plumed.org #include "plumed.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/gpu_vector.cuh" #include "utilities/read_file.cuh" @@ -183,7 +184,7 @@ void PLUMED::process( } gpu_sum<<<6, 1024>>>(n_atom, virial.data(), gpu_v_vector.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_v_vector.copy_to_host(tmp.data()); fill(cpu_v_vector.begin(), cpu_v_vector.end(), 0.0); @@ -223,7 +224,7 @@ void PLUMED::process( virial.data() + n_atom * 3, virial.data() + n_atom * 4, virial.data() + n_atom * 5); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } void PLUMED::postprocess(void) diff --git a/src/measure/rdf.cu b/src/measure/rdf.cu index d0392e707..295453ecd 100644 --- a/src/measure/rdf.cu +++ b/src/measure/rdf.cu @@ -26,6 +26,7 @@ Calculate: #include "rdf.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -315,7 +316,7 @@ void RDF::find_rdf( rdf_g_ind, rdf_bins_, r_step_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else { gpu_find_rdf_ON1<<>>( @@ -342,7 +343,7 @@ void RDF::find_rdf( rdf_g_ind, rdf_bins_, r_step_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -499,12 +500,12 @@ void RDF::postprocess(const bool is_pimd, const int number_of_beads) if (is_pimd) { - CHECK(cudaMemcpy( + CHECK(gpuMemcpy( rdf_.data(), rdf_g_.data(), sizeof(double) * number_of_beads * num_atoms_ * rdf_bins_, - cudaMemcpyDeviceToHost)); - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + gpuMemcpyDeviceToHost)); + CHECK(gpuDeviceSynchronize()); // needed for 
pre-Pascal GPU std::vector rdf_average(number_of_beads * rdf_atom_count * rdf_bins_, 0.0); for (int k = 0; k < number_of_beads; k++) { @@ -559,9 +560,9 @@ void RDF::postprocess(const bool is_pimd, const int number_of_beads) } else { - CHECK(cudaMemcpy( - rdf_.data(), rdf_g_.data(), sizeof(double) * num_atoms_ * rdf_bins_, cudaMemcpyDeviceToHost)); - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuMemcpy( + rdf_.data(), rdf_g_.data(), sizeof(double) * num_atoms_ * rdf_bins_, gpuMemcpyDeviceToHost)); + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU std::vector rdf_average(rdf_atom_count * rdf_bins_, 0.0); for (int a = 0; a < rdf_atom_count; a++) { diff --git a/src/measure/sdc.cu b/src/measure/sdc.cu index 66e191945..fe047a59e 100644 --- a/src/measure/sdc.cu +++ b/src/measure/sdc.cu @@ -24,6 +24,7 @@ Calculate: #include "sdc.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -182,7 +183,7 @@ void SDC::process( vy_.data() + step_offset, vz_.data() + step_offset); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL // start to calculate the VAC when we have enough frames if (sample_step >= num_correlation_steps_ - 1) { @@ -200,7 +201,7 @@ void SDC::process( vacx_.data(), vacy_.data(), vacz_.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } @@ -209,7 +210,7 @@ void SDC::postprocess() if (!compute_) return; - CHECK(cudaDeviceSynchronize()); // needed for pre-Pascal GPU + CHECK(gpuDeviceSynchronize()); // needed for pre-Pascal GPU // normalize by the number of atoms and number of time origins const double vac_scaler = 1.0 / ((double)num_atoms_ * (double)num_time_origins_); diff --git a/src/measure/shc.cu b/src/measure/shc.cu index 52d0d6e87..1710121f9 100644 --- a/src/measure/shc.cu +++ b/src/measure/shc.cu @@ -25,6 +25,7 @@ with many-body potentials, Phys. Rev. B 99, 064308 (2019). 
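Stepping back to the modal_analysis.cu hunks above before the shc.cu changes continue: every cuBLAS entry point now goes through a gpublas* alias, so the call sites change only in spelling and resolve to hipBLAS under USE_HIP. The handle lifecycle the class follows, with an illustrative single-precision matrix-vector product (d_A, d_x, d_y, m, n are placeholders, not names from the diff):

gpublasHandle_t handle;
gpublasCreate(&handle); // done once, in preprocess()

const float alpha = 1.0f;
const float beta = 0.0f;
// y = alpha * A * x + beta * y, with A column-major, m rows and n columns
gpublasSgemv(handle, GPUBLAS_OP_N, m, n, &alpha, d_A, m, d_x, 1, &beta, d_y, 1);

gpublasDestroy(handle); // done once, in postprocess()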
#include "shc.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include @@ -178,12 +179,12 @@ void SHC::process( const double* vz_tmp = velocity_per_atom.data() + N * 2; if (-1 == group_method) { - CHECK(cudaMemcpy(sx.data() + offset, sx_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(sy.data() + offset, sy_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(sz.data() + offset, sz_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vx.data() + offset, vx_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vy.data() + offset, vy_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); - CHECK(cudaMemcpy(vz.data() + offset, vz_tmp, sizeof(double) * N, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sx.data() + offset, sx_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sy.data() + offset, sy_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(sz.data() + offset, sz_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vx.data() + offset, vx_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vy.data() + offset, vy_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(vz.data() + offset, vz_tmp, sizeof(double) * N, gpuMemcpyDeviceToDevice)); } else { if (group_id == -1) { for (int n = 1; n < group_num; ++n) { @@ -207,7 +208,7 @@ void SHC::process( vx_tmp, vy_tmp, vz_tmp); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } else { gpu_copy_data<<<(group_size - 1) / BLOCK_SIZE_SHC + 1, BLOCK_SIZE_SHC>>>( @@ -227,7 +228,7 @@ void SHC::process( vy_tmp, vz_tmp); } - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } if (sample_step >= Nc - 1) { @@ -248,7 +249,7 @@ void SHC::process( vz.data() + offset_s, ki_negative.data() + Nc * n, ko_negative.data() + Nc * n); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_k<<>>( group[group_method].cpu_size[n], @@ -261,7 +262,7 @@ void SHC::process( sz.data() + offset_s, ki_positive.data() + Nc * n, ko_positive.data() + Nc * n); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } else { gpu_find_k<<>>( @@ -275,7 +276,7 @@ void SHC::process( vz.data(), ki_negative.data(), ko_negative.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_k<<>>( group_size, @@ -288,7 +289,7 @@ void SHC::process( sz.data(), ki_positive.data(), ko_positive.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } } } diff --git a/src/measure/viscosity.cu b/src/measure/viscosity.cu index ce13758c6..a64a8283d 100644 --- a/src/measure/viscosity.cu +++ b/src/measure/viscosity.cu @@ -18,6 +18,7 @@ Calculate the stress autocorrelation function and viscosity. 
------------------------------------------------------------------------------*/ #include "utilities/common.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include "viscosity.cuh" #include @@ -92,7 +93,7 @@ void Viscosity::process( int Nd = number_of_steps / sample_interval; gpu_sum_stress<<>>( N, Nd, nd, mass.data(), velocity.data(), virial.data(), stress_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } static __global__ void gpu_correct_stress(const int Nd, double* g_stress_all) @@ -196,9 +197,9 @@ void Viscosity::postprocess( std::vector correlation_cpu(Nc * NUM_OF_COMPONENTS); gpu_correct_stress<<>>(Nd, stress_all.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL gpu_find_correlation<<>>(Nc, Nd, stress_all.data(), correlation_gpu.data()); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL correlation_gpu.copy_to_host(correlation_cpu.data()); diff --git a/src/minimize/minimize.cu b/src/minimize/minimize.cu index c41db2a4a..4d2bc3205 100644 --- a/src/minimize/minimize.cu +++ b/src/minimize/minimize.cu @@ -22,6 +22,7 @@ The driver class for minimizers. #include "minimizer_fire.cuh" #include "minimizer_sd.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include #include diff --git a/src/minimize/minimizer.cu b/src/minimize/minimizer.cu index 1e48e589e..304404c9d 100644 --- a/src/minimize/minimizer.cu +++ b/src/minimize/minimizer.cu @@ -18,6 +18,7 @@ The abstract base class (ABC) for the minimizer classes. ------------------------------------------------------------------------------*/ #include "minimizer.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/minimize/minimizer_fire.cu b/src/minimize/minimizer_fire.cu index d695c5a29..d53087d3f 100644 --- a/src/minimize/minimizer_fire.cu +++ b/src/minimize/minimizer_fire.cu @@ -20,6 +20,7 @@ Reference: PhysRevLett 97, 170201 (2006) ------------------------------------------------------------------------------*/ #include "minimizer_fire.cuh" +#include "utilities/gpu_macro.cuh" namespace { diff --git a/src/minimize/minimizer_sd.cu b/src/minimize/minimizer_sd.cu index 6d61a3bb7..968a243d7 100644 --- a/src/minimize/minimizer_sd.cu +++ b/src/minimize/minimizer_sd.cu @@ -19,6 +19,7 @@ The SD (steepest decent) minimizer. #include "force/force.cuh" #include "minimizer_sd.cuh" +#include "utilities/gpu_macro.cuh" const double decreasing_factor = 0.2; const double increasing_factor = 1.2; diff --git a/src/model/atom.cu b/src/model/atom.cu index 450b9aff2..783aa68fb 100644 --- a/src/model/atom.cu +++ b/src/model/atom.cu @@ -19,6 +19,7 @@ The class defining the simulation box. #include "atom.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include int Atom::number_of_type(std::string& symbol) diff --git a/src/model/box.cu b/src/model/box.cu index cf10997db..4cbecd36e 100644 --- a/src/model/box.cu +++ b/src/model/box.cu @@ -19,6 +19,7 @@ The class defining the simulation box. 
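The cusolver_wrapper.cu changes a little further below are the densest part of the renaming, so the shared shape of those routines is worth stating up front: each one follows the same query-then-solve pattern, now spelled with gpusolver* aliases that resolve to hipSOLVER under USE_HIP. A sketch along the lines of the Hermitian QR path in that file (error checking omitted there as well; N, A, W as in the wrapper):

gpusolverDnHandle_t handle = NULL;
gpusolverDnCreate(&handle);
gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; // eigenvalues only
gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER;

int lwork = 0; // step 1: query the workspace size
gpusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork);
GPU_Vector<gpuDoubleComplex> work(lwork);

GPU_Vector<int> info(1); // step 2: solve; a nonzero info signals failure
gpusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data());

gpusolverDnDestroy(handle);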
#include "box.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include static float get_area_one_direction(const double* a, const double* b) diff --git a/src/model/group.cu b/src/model/group.cu index d05ac1731..78f4746eb 100644 --- a/src/model/group.cu +++ b/src/model/group.cu @@ -18,6 +18,7 @@ The class defining the grouping methods ------------------------------------------------------------------------------*/ #include "group.cuh" +#include "utilities/gpu_macro.cuh" #include void Group::find_size(const int N, const int k) diff --git a/src/model/read_xyz.cu b/src/model/read_xyz.cu index 98f59d390..d53d2cea7 100644 --- a/src/model/read_xyz.cu +++ b/src/model/read_xyz.cu @@ -23,6 +23,7 @@ The class defining the simulation model. #include "read_xyz.cuh" #include "utilities/common.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include #include #include diff --git a/src/phonon/hessian.cu b/src/phonon/hessian.cu index d16c175f3..20f085bc2 100644 --- a/src/phonon/hessian.cu +++ b/src/phonon/hessian.cu @@ -25,6 +25,7 @@ Then calculate the dynamical matrices with different k points. #include "utilities/common.cuh" #include "utilities/cusolver_wrapper.cuh" #include "utilities/error.cuh" +#include "utilities/gpu_macro.cuh" #include "utilities/read_file.cuh" #include diff --git a/src/utilities/cusolver_wrapper.cu b/src/utilities/cusolver_wrapper.cu index 23358fec4..df8ac56cc 100644 --- a/src/utilities/cusolver_wrapper.cu +++ b/src/utilities/cusolver_wrapper.cu @@ -19,16 +19,21 @@ Some wrappers for the cuSOLVER library #include "cusolver_wrapper.cuh" #include "error.cuh" +#include "gpu_macro.cuh" #include "gpu_vector.cuh" -#include +#ifdef USE_HIP + #include +#else + #include +#endif #include void eig_hermitian_QR(size_t N, double* AR, double* AI, double* W_cpu) { // get A size_t N2 = N * N; - GPU_Vector A(N2); - std::vector A_cpu(N2); + GPU_Vector A(N2); + std::vector A_cpu(N2); for (size_t n = 0; n < N2; ++n) { A_cpu[n].x = AR[n]; @@ -40,31 +45,31 @@ void eig_hermitian_QR(size_t N, double* AR, double* AI, double* W_cpu) GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // get work int lwork = 0; - cusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork); - GPU_Vector work(lwork); + gpusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork); + GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data()); + gpusolverDnZheevd(handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data()); W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); + gpusolverDnDestroy(handle); } void eig_hermitian_Jacobi(size_t N, double* AR, double* AI, double* W_cpu) { // get A size_t N2 = N * N; - GPU_Vector A(N2); - std::vector A_cpu(N2); + GPU_Vector A(N2); + std::vector A_cpu(N2); for (size_t n = 0; n < N2; ++n) { A_cpu[n].x = AR[n]; A_cpu[n].y = AI[n]; @@ -75,29 +80,29 @@ void eig_hermitian_Jacobi(size_t N, double* AR, double* AI, double* W_cpu) GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = 
CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnZheevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); - GPU_Vector work(lwork); + gpusolverDnZheevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); + GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnZheevj( + gpusolverDnZheevj( handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data(), para); W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } void eigenvectors_symmetric_Jacobi(size_t N, double* A_cpu, double* W_cpu, double* eigenvectors_cpu) @@ -111,38 +116,38 @@ void eigenvectors_symmetric_Jacobi(size_t N, double* A_cpu, double* W_cpu, doubl GPU_Vector W(N); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_VECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnDsyevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); + gpusolverDnDsyevj_bufferSize(handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para); GPU_Vector work(lwork); // get W GPU_Vector info(1); - cusolverDnDsyevj( + gpusolverDnDsyevj( handle, jobz, uplo, N, A.data(), N, W.data(), work.data(), lwork, info.data(), para); W.copy_to_host(W_cpu); A.copy_to_host(eigenvectors_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* AI, double* W_cpu) { // get A size_t M = N * N * batch_size; - GPU_Vector A(M); - std::vector A_cpu(M); + GPU_Vector A(M); + std::vector A_cpu(M); for (size_t n = 0; n < M; ++n) { A_cpu[n].x = AR[n]; A_cpu[n].y = AI[n]; @@ -153,24 +158,24 @@ void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* GPU_Vector W(N * batch_size); // get handle - cusolverDnHandle_t handle = NULL; - cusolverDnCreate(&handle); - cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + gpusolverDnHandle_t handle = NULL; + gpusolverDnCreate(&handle); + gpusolverEigMode_t jobz = GPUSOLVER_EIG_MODE_NOVECTOR; + gpusolverFillMode_t uplo = GPUSOLVER_FILL_MODE_LOWER; // some parameters for the Jacobi method - syevjInfo_t para = NULL; - cusolverDnCreateSyevjInfo(¶); + gpusolverSyevjInfo_t para = NULL; + gpusolverDnCreateSyevjInfo(¶); // get work int lwork = 0; - cusolverDnZheevjBatched_bufferSize( + gpusolverDnZheevjBatched_bufferSize( handle, jobz, uplo, N, A.data(), N, W.data(), &lwork, para, batch_size); - GPU_Vector work(lwork); + GPU_Vector work(lwork); // get W 
GPU_Vector info(batch_size); - cusolverDnZheevjBatched( + gpusolverDnZheevjBatched( handle, jobz, uplo, @@ -186,6 +191,6 @@ void eig_hermitian_Jacobi_batch(size_t N, size_t batch_size, double* AR, double* W.copy_to_host(W_cpu); // free - cusolverDnDestroy(handle); - cusolverDnDestroySyevjInfo(para); + gpusolverDnDestroy(handle); + gpusolverDnDestroySyevjInfo(para); } diff --git a/src/utilities/error.cuh b/src/utilities/error.cuh index f82f4297e..ea2fd396e 100644 --- a/src/utilities/error.cuh +++ b/src/utilities/error.cuh @@ -14,6 +14,7 @@ */ #pragma once +#include "gpu_macro.cuh" #include #include #include @@ -21,13 +22,13 @@ #define CHECK(call) \ do { \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) { \ + const gpuError_t error_code = call; \ + if (error_code != gpuSuccess) { \ fprintf(stderr, "CUDA Error:\n"); \ fprintf(stderr, " File: %s\n", __FILE__); \ fprintf(stderr, " Line: %d\n", __LINE__); \ fprintf(stderr, " Error code: %d\n", error_code); \ - fprintf(stderr, " Error text: %s\n", cudaGetErrorString(error_code)); \ + fprintf(stderr, " Error text: %s\n", gpuGetErrorString(error_code)); \ exit(1); \ } \ } while (0) @@ -62,15 +63,15 @@ } while (0) #ifdef STRONG_DEBUG -#define CUDA_CHECK_KERNEL \ +#define GPU_CHECK_KERNEL \ { \ - CHECK(cudaGetLastError()); \ - CHECK(cudaDeviceSynchronize()); \ + CHECK(gpuGetLastError()); \ + CHECK(gpuDeviceSynchronize()); \ } #else -#define CUDA_CHECK_KERNEL \ +#define GPU_CHECK_KERNEL \ { \ - CHECK(cudaGetLastError()); \ + CHECK(gpuGetLastError()); \ } #endif diff --git a/src/utilities/gpu_macro.cuh b/src/utilities/gpu_macro.cuh new file mode 100644 index 000000000..75a5cbd3a --- /dev/null +++ b/src/utilities/gpu_macro.cuh @@ -0,0 +1,168 @@ +/* + Copyright 2017 Zheyong Fan and GPUMD development team + This file is part of GPUMD. + GPUMD is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + GPUMD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with GPUMD. If not, see . 
+*/ + +#pragma once + +#ifdef USE_HIP // HIP for AMD card + +#include + +// memory manipulation +#define gpuMalloc hipMalloc +#define gpuMallocManaged hipMallocManaged +#define gpuFree hipFree +#define gpuMemcpy hipMemcpy +#define gpuMemcpyFromSymbol hipMemcpyFromSymbol +#define gpuMemcpyToSymbol hipMemcpyToSymbol +#define gpuGetSymbolAddress hipGetSymbolAddress +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToHost hipMemcpyHostToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemset hipMemset + +// error handling +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuGetErrorString hipGetErrorString +#define gpuGetLastError hipGetLastError + +// device manipulation +#define gpuSetDevice hipSetDevice +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuDeviceProp hipDeviceProp_t +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuDeviceCanAccessPeer hipDeviceCanAccessPeer +#define gpuDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define gpuDeviceSynchronize hipDeviceSynchronize + +// stream +#define gpuStream_t hipStream_t +#define gpuStreamCreate hipStreamCreate +#define gpuStreamDestroy hipStreamDestroy + +// random numbers +#define gpurandState hiprandState +#define gpurand_normal_double hiprand_normal_double +#define gpurand_normal hiprand_normal +#define gpurand_init hiprand_init + +// blas +#define gpublasHandle_t hipblasHandle_t +#define gpublasSgemv hipblasSgemv +#define gpublasSgemm hipblasSgemm +#define gpublasSdgmm hipblasSdgmm +#define gpublasDestroy hipblasDestroy +#define gpublasCreate hipblasCreate +#define GPUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define GPUBLAS_OP_N HIPBLAS_OP_N + +// lapack +#define gpuDoubleComplex hipDoubleComplex +#define gpusolverDnHandle_t hipsolverDnHandle_t +#define gpusolverDnCreate hipsolverDnCreate +#define gpusolverDnDestroy hipsolverDnDestroy +#define gpusolverEigMode_t hipsolverEigMode_t +#define gpusolverFillMode_t hipsolverFillMode_t +#define GPUSOLVER_EIG_MODE_NOVECTOR HIPSOLVER_EIG_MODE_NOVECTOR +#define GPUSOLVER_EIG_MODE_VECTOR HIPSOLVER_EIG_MODE_VECTOR +#define GPUSOLVER_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#define gpusolverSyevjInfo_t hipsolverSyevjInfo_t +#define gpusolverDnCreateSyevjInfo hipsolverDnCreateSyevjInfo +#define gpusolverDnDestroySyevjInfo hipsolverDnDestroySyevjInfo +#define gpusolverDnZheevj_bufferSize hipsolverDnZheevj_bufferSize +#define gpusolverDnZheevj hipsolverDnZheevj +#define gpusolverDnZheevd_bufferSize hipsolverDnZheevd_bufferSize +#define gpusolverDnZheevd hipsolverDnZheevd +#define gpusolverDnDsyevj_bufferSize hipsolverDnDsyevj_bufferSize +#define gpusolverDnDsyevj hipsolverDnDsyevj +#define gpusolverDnZheevjBatched_bufferSize hipsolverDnZheevjBatched_bufferSize +#define gpusolverDnZheevjBatched hipsolverDnZheevjBatched + +#else // CUDA for Nvidia card + +// memory manipulation +#define gpuMalloc cudaMalloc +#define gpuMallocManaged cudaMallocManaged +#define gpuFree cudaFree +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyFromSymbol cudaMemcpyFromSymbol +#define gpuMemcpyToSymbol cudaMemcpyToSymbol +#define gpuGetSymbolAddress cudaGetSymbolAddress +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToHost cudaMemcpyHostToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemset cudaMemset + +// error handling +#define gpuError_t cudaError_t +#define 
gpuSuccess cudaSuccess +#define gpuGetErrorString cudaGetErrorString +#define gpuGetLastError cudaGetLastError + +// device manipulation +#define gpuSetDevice cudaSetDevice +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuDeviceProp cudaDeviceProp +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuDeviceCanAccessPeer cudaDeviceCanAccessPeer +#define gpuDeviceEnablePeerAccess cudaDeviceEnablePeerAccess +#define gpuDeviceSynchronize cudaDeviceSynchronize + +// stream +#define gpuStream_t cudaStream_t +#define gpuStreamCreate cudaStreamCreate +#define gpuStreamDestroy cudaStreamDestroy + +// random numbers +#define gpurandState curandState +#define gpurand_normal_double curand_normal_double +#define gpurand_normal curand_normal +#define gpurand_init curand_init + +// blas +#define gpublasHandle_t cublasHandle_t +#define gpublasSgemv cublasSgemv +#define gpublasSgemm cublasSgemm +#define gpublasSdgmm cublasSdgmm +#define gpublasDestroy cublasDestroy +#define gpublasCreate cublasCreate +#define GPUBLAS_SIDE_LEFT CUBLAS_SIDE_LEFT +#define GPUBLAS_OP_N CUBLAS_OP_N + +// lapack +#define gpuDoubleComplex cuDoubleComplex +#define gpusolverDnHandle_t cusolverDnHandle_t +#define gpusolverDnCreate cusolverDnCreate +#define gpusolverDnDestroy cusolverDnDestroy +#define gpusolverEigMode_t cusolverEigMode_t +#define gpusolverFillMode_t cublasFillMode_t // cuSOLVER's dense API takes its fill-mode enum from cuBLAS +#define GPUSOLVER_EIG_MODE_NOVECTOR CUSOLVER_EIG_MODE_NOVECTOR +#define GPUSOLVER_EIG_MODE_VECTOR CUSOLVER_EIG_MODE_VECTOR +#define GPUSOLVER_FILL_MODE_LOWER CUBLAS_FILL_MODE_LOWER // fill modes likewise come from cuBLAS +#define gpusolverSyevjInfo_t syevjInfo_t // CUDA exposes this type under the unprefixed name syevjInfo_t +#define gpusolverDnCreateSyevjInfo cusolverDnCreateSyevjInfo +#define gpusolverDnDestroySyevjInfo cusolverDnDestroySyevjInfo +#define gpusolverDnZheevj_bufferSize cusolverDnZheevj_bufferSize +#define gpusolverDnZheevj cusolverDnZheevj +#define gpusolverDnZheevd_bufferSize cusolverDnZheevd_bufferSize +#define gpusolverDnZheevd cusolverDnZheevd +#define gpusolverDnDsyevj_bufferSize cusolverDnDsyevj_bufferSize +#define gpusolverDnDsyevj cusolverDnDsyevj +#define gpusolverDnZheevjBatched_bufferSize cusolverDnZheevjBatched_bufferSize +#define gpusolverDnZheevjBatched cusolverDnZheevjBatched + +#endif diff --git a/src/utilities/gpu_vector.cuh b/src/utilities/gpu_vector.cuh index 6aef232d8..839bc86a8 100644 --- a/src/utilities/gpu_vector.cuh +++ b/src/utilities/gpu_vector.cuh @@ -16,6 +16,7 @@ #pragma once #include "error.cuh" +#include "gpu_macro.cuh" namespace { @@ -64,7 +65,7 @@ public: ~GPU_Vector() { if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } } @@ -76,14 +77,14 @@ public: memory_ = size_ * sizeof(T); memory_type_ = memory_type; if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } if (memory_type_ == Memory_Type::global) { - CHECK(cudaMalloc((void**)&data_, memory_)); + CHECK(gpuMalloc((void**)&data_, memory_)); allocated_ = true; } else { - CHECK(cudaMallocManaged((void**)&data_, memory_)); + CHECK(gpuMallocManaged((void**)&data_, memory_)); allocated_ = true; } } @@ -95,14 +96,14 @@ public: memory_ = size_ * sizeof(T); memory_type_ = memory_type; if (allocated_) { - CHECK(cudaFree(data_)); + CHECK(gpuFree(data_)); allocated_ = false; } if (memory_type == Memory_Type::global) { - CHECK(cudaMalloc((void**)&data_, memory_)); + CHECK(gpuMalloc((void**)&data_, memory_)); allocated_ = true; } else { - CHECK(cudaMallocManaged((void**)&data_, memory_)); +
CHECK(gpuMallocManaged((void**)&data_, memory_)); allocated_ = true; } fill(value); @@ -111,53 +112,53 @@ public: // copy data from host with the default size void copy_from_host(const T* h_data) { - CHECK(cudaMemcpy(data_, h_data, memory_, cudaMemcpyHostToDevice)); + CHECK(gpuMemcpy(data_, h_data, memory_, gpuMemcpyHostToDevice)); } // copy data from host with a given size void copy_from_host(const T* h_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(data_, h_data, memory, cudaMemcpyHostToDevice)); + CHECK(gpuMemcpy(data_, h_data, memory, gpuMemcpyHostToDevice)); } // copy data from device with the default size void copy_from_device(const T* d_data) { - CHECK(cudaMemcpy(data_, d_data, memory_, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(data_, d_data, memory_, gpuMemcpyDeviceToDevice)); } // copy data from device with a given size void copy_from_device(const T* d_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(data_, d_data, memory, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(data_, d_data, memory, gpuMemcpyDeviceToDevice)); } // copy data to host with the default size void copy_to_host(T* h_data) { - CHECK(cudaMemcpy(h_data, data_, memory_, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(h_data, data_, memory_, gpuMemcpyDeviceToHost)); } // copy data to host with a given size void copy_to_host(T* h_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(h_data, data_, memory, cudaMemcpyDeviceToHost)); + CHECK(gpuMemcpy(h_data, data_, memory, gpuMemcpyDeviceToHost)); } // copy data to device with the default size void copy_to_device(T* d_data) { - CHECK(cudaMemcpy(d_data, data_, memory_, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(d_data, data_, memory_, gpuMemcpyDeviceToDevice)); } // copy data to device with a given size void copy_to_device(T* d_data, const size_t size) { const size_t memory = sizeof(T) * size; - CHECK(cudaMemcpy(d_data, data_, memory, cudaMemcpyDeviceToDevice)); + CHECK(gpuMemcpy(d_data, data_, memory, gpuMemcpyDeviceToDevice)); } // give "value" to each element @@ -167,7 +168,7 @@ public: const int block_size = 128; const int grid_size = (size_ + block_size - 1) / block_size; gpu_fill<<>>(size_, value, data_); - CUDA_CHECK_KERNEL + GPU_CHECK_KERNEL } else // managed (or unified) memory { for (int i = 0; i < size_; ++i) diff --git a/src/utilities/main_common.cu b/src/utilities/main_common.cu index 368df0533..4152f0204 100644 --- a/src/utilities/main_common.cu +++ b/src/utilities/main_common.cu @@ -14,6 +14,7 @@ */ #include "error.cuh" +#include "gpu_macro.cuh" #include "main_common.cuh" #include #include @@ -40,12 +41,12 @@ void print_gpu_information(void) print_line_2(); int num_gpus; - CHECK(cudaGetDeviceCount(&num_gpus)); + CHECK(gpuGetDeviceCount(&num_gpus)); printf("number of GPUs = %d\n", num_gpus); for (int device_id = 0; device_id < num_gpus; ++device_id) { - cudaDeviceProp prop; - CHECK(cudaGetDeviceProperties(&prop, device_id)); + gpuDeviceProp prop; + CHECK(gpuGetDeviceProperties(&prop, device_id)); printf("Device id: %d\n", device_id); printf(" Device name: %s\n", prop.name); @@ -55,13 +56,13 @@ void print_gpu_information(void) } for (int i = 0; i < num_gpus; i++) { - cudaSetDevice(i); + gpuSetDevice(i); for (int j = 0; j < num_gpus; j++) { int can_access; if (i != j) { - CHECK(cudaDeviceCanAccessPeer(&can_access, i, j)); + CHECK(gpuDeviceCanAccessPeer(&can_access, i, j)); if (can_access) { - CHECK(cudaDeviceEnablePeerAccess(j, 0)); + 
CHECK(gpuDeviceEnablePeerAccess(j, 0)); printf("GPU-%d can access GPU-%d.\n", i, j); } else { printf("GPU-%d cannot access GPU-%d.\n", i, j); @@ -70,5 +71,5 @@ void print_gpu_information(void) } } - cudaSetDevice(0); // normally use GPU-0 + gpuSetDevice(0); // normally use GPU-0 } diff --git a/src/utilities/read_file.cu b/src/utilities/read_file.cu index 3acd6b96f..7bfd6cae2 100644 --- a/src/utilities/read_file.cu +++ b/src/utilities/read_file.cu @@ -24,7 +24,9 @@ Some functions for dealing with text files. Written by Mikko Ervasti. int is_valid_int(const char* s, int* result) { - if (s == NULL || *s == '\0') { + if (s == NULL) { + return 0; + } else if (*s == '\0') { return 0; } char* p; @@ -39,7 +41,9 @@ int is_valid_int(const char* s, int* result) int is_valid_real(const char* s, double* result) { - if (s == NULL || *s == '\0') { + if (s == NULL) { + return 0; + } else if (*s == '\0') { return 0; } char* p;
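Taken together, the new src/utilities/gpu_macro.cuh and the reworked CHECK / GPU_CHECK_KERNEL macros let a translation unit written against the gpu* names compile for either backend, selected by whether USE_HIP is defined. A minimal self-contained illustration; this is not a file from the diff, and the build commands are an assumption based on the USE_HIP guard (for example nvcc check_gpus.cu versus hipcc -DUSE_HIP check_gpus.cu):

// check_gpus.cu
#include "utilities/error.cuh"     // CHECK and GPU_CHECK_KERNEL
#include "utilities/gpu_macro.cuh" // gpu* aliases (also pulled in by error.cuh)
#include <cstdio>

int main()
{
  int num_gpus = 0;
  CHECK(gpuGetDeviceCount(&num_gpus)); // cudaGetDeviceCount or hipGetDeviceCount
  printf("number of GPUs = %d\n", num_gpus);
  for (int id = 0; id < num_gpus; ++id) {
    gpuDeviceProp prop;
    CHECK(gpuGetDeviceProperties(&prop, id));
    printf("Device %d: %s\n", id, prop.name);
  }
  return 0;
}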