Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

From CUDA only to CUDA+HIP #768

Merged
merged 32 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6b0a062
change cudaMalloc and cudaFree
brucefan1983 Oct 23, 2024
a841d35
change cudaMemcpy
brucefan1983 Oct 23, 2024
0ad0cd3
more cuda to gpu
brucefan1983 Oct 23, 2024
12b1e7c
device cuda to gpu
brucefan1983 Oct 23, 2024
707dd3b
deviceSynchronize done
brucefan1983 Oct 23, 2024
3d3736a
memset cuda to gpu
brucefan1983 Oct 23, 2024
bc5fcb7
stream cuda to gpu
brucefan1983 Oct 23, 2024
cb87370
thrust cuda to gpu
brucefan1983 Oct 23, 2024
ee8859f
curand to gpurand
brucefan1983 Oct 23, 2024
8c3ce73
blas cuda to gpu
brucefan1983 Oct 23, 2024
c2c1892
remove unused header
brucefan1983 Oct 23, 2024
c7031c0
lapack cuda to gpu
brucefan1983 Oct 23, 2024
cb43c43
add hip makefile
brucefan1983 Oct 23, 2024
785f9ba
enable hip
brucefan1983 Oct 23, 2024
2511a9e
merge master
brucefan1983 Oct 24, 2024
6fa70a8
fix a macro for hip
brucefan1983 Oct 24, 2024
83b67f1
GPU_KERNEL_CHECK
brucefan1983 Oct 24, 2024
8e9f570
better include
brucefan1983 Oct 24, 2024
364fa28
add more headers
brucefan1983 Oct 24, 2024
f023bb3
remove pragma unroll
brucefan1983 Oct 25, 2024
99fe2bf
try to force to check NULL
brucefan1983 Oct 25, 2024
fdc19ea
another istance of NULL
brucefan1983 Oct 25, 2024
b06796a
O1 optimization level for part of the code
brucefan1983 Oct 25, 2024
9fc234e
simplify makefile for hip
brucefan1983 Oct 25, 2024
f33417d
merge master
brucefan1983 Oct 25, 2024
9fe7f4f
restore makefile for cuda
brucefan1983 Oct 25, 2024
d42f11a
report success message for hip makefile
brucefan1983 Oct 25, 2024
1ac1a4c
O1 for hip
brucefan1983 Oct 27, 2024
e39e6f0
merge master
brucefan1983 Oct 27, 2024
816bd74
initialize a variable
brucefan1983 Oct 28, 2024
dbc91ed
CHECK
brucefan1983 Oct 30, 2024
96e2b59
follow Daniel suggestion
brucefan1983 Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/force/dftd3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ J. Comput. Chem., 32, 1456 (2011).
#include "model/box.cuh"
#include "neighbor.cuh"
#include "utilities/common.cuh"
#include "utilities/gpu_macro.cuh"
#include <algorithm>
#include <cctype>
#include <iostream>
Expand Down Expand Up @@ -947,7 +948,7 @@ void DFTD3::compute_small_box(
r12.data() + size_x12 * 3,
r12.data() + size_x12 * 4,
r12.data() + size_x12 * 5);
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

find_dftd3_coordination_number_small_box<<<(N - 1) / 64 + 1, 64>>>(
dftd3_para,
Expand All @@ -959,7 +960,7 @@ void DFTD3::compute_small_box(
r12.data() + size_x12 * 4,
r12.data() + size_x12 * 5,
cn.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

add_dftd3_force_small_box<<<(N - 1) / 64 + 1, 64>>>(
dftd3_para,
Expand All @@ -979,7 +980,7 @@ void DFTD3::compute_small_box(
virial_per_atom.data(),
dc6_sum.data(),
dc8_sum.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

add_dftd3_force_extra_small_box<<<(N - 1) / 64 + 1, 64>>>(
dftd3_para,
Expand All @@ -996,7 +997,7 @@ void DFTD3::compute_small_box(
force_per_atom.data() + N,
force_per_atom.data() + N * 2,
virial_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}

void DFTD3::compute_large_box(
Expand Down Expand Up @@ -1058,7 +1059,7 @@ void DFTD3::compute_large_box(
position_per_atom.data() + N,
position_per_atom.data() + N * 2,
cn.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

find_dftd3_force_large_box<<<(N - 1) / 64 + 1, 64>>>(
dftd3_para,
Expand All @@ -1084,7 +1085,7 @@ void DFTD3::compute_large_box(
virial_per_atom.data(),
dc6_sum.data(),
dc8_sum.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

find_dftd3_force_extra_large_box<<<(N - 1) / 64 + 1, 64>>>(
dftd3_para,
Expand All @@ -1107,7 +1108,7 @@ void DFTD3::compute_large_box(
force_per_atom.data() + N,
force_per_atom.data() + N * 2,
virial_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}

void DFTD3::compute(
Expand Down
2 changes: 2 additions & 0 deletions src/force/dftd3para.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

#pragma once

#include "utilities/gpu_macro.cuh"

namespace
{
#define Bohr 0.5291772575069165f
Expand Down
9 changes: 5 additions & 4 deletions src/force/eam.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The EAM potential. Currently two analytical versions:
#include "eam.cuh"
#include "neighbor.cuh"
#include "utilities/error.cuh"
#include "utilities/gpu_macro.cuh"
#include <cstring>
#define BLOCK_SIZE_FORCE 64

Expand Down Expand Up @@ -514,7 +515,7 @@ void EAM::compute(
position_per_atom.data() + number_of_atoms * 2,
eam_data.Fp.data(),
potential_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

find_force_eam_step2<0><<<grid_size, BLOCK_SIZE_FORCE>>>(
eam2004zhou,
Expand All @@ -535,7 +536,7 @@ void EAM::compute(
force_per_atom.data() + 2 * number_of_atoms,
virial_per_atom.data(),
potential_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}

if (potential_model == 1) {
Expand All @@ -554,7 +555,7 @@ void EAM::compute(
position_per_atom.data() + number_of_atoms * 2,
eam_data.Fp.data(),
potential_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

find_force_eam_step2<1><<<grid_size, BLOCK_SIZE_FORCE>>>(
eam2004zhou,
Expand All @@ -575,6 +576,6 @@ void EAM::compute(
force_per_atom.data() + 2 * number_of_atoms,
virial_per_atom.data(),
potential_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}
}
5 changes: 3 additions & 2 deletions src/force/fcp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ The force constant potential (FCP)

#include "fcp.cuh"
#include "utilities/error.cuh"
#include "utilities/gpu_macro.cuh"
#include <cstring>
#include <vector>

Expand Down Expand Up @@ -1038,7 +1039,7 @@ void FCP::compute(
position_per_atom.data() + number_of_atoms * 2,
fcp_data.r0.data(),
fcp_data.u.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

fcp_data.pfv.fill(0.0f);

Expand Down Expand Up @@ -1125,5 +1126,5 @@ void FCP::compute(
force_per_atom.data() + 2 * number_of_atoms,
virial_per_atom.data());

CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}
37 changes: 19 additions & 18 deletions src/force/force.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The driver class calculating force and related quantities.
#include "ilp_tmd_sw.cuh"
#include "utilities/common.cuh"
#include "utilities/error.cuh"
#include "utilities/gpu_macro.cuh"
#include "utilities/read_file.cuh"
#include <cstring>
#include <iostream>
Expand Down Expand Up @@ -106,7 +107,7 @@ void Force::parse_potential(
strcmp(potential_name, "nep4_temperature") == 0 ||
strcmp(potential_name, "nep4_zbl_temperature") == 0) {
int num_gpus;
CHECK(cudaGetDeviceCount(&num_gpus));
CHECK(gpuGetDeviceCount(&num_gpus));
#ifdef ZHEYONG
num_gpus = 3;
#endif
Expand Down Expand Up @@ -226,7 +227,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double*
s_f[tid] = f;
__syncthreads();

#pragma unroll

for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_f[tid] += s_f[tid + offset];
Expand Down Expand Up @@ -466,7 +467,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms * 2,
potential_per_atom.data(),
virial_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

if (multiple_potentials_mode_.compare("observe") == 0) {
// If observing, calculate using main potential only
Expand Down Expand Up @@ -516,7 +517,7 @@ void Force::compute(
force_per_atom.data(),
virial_per_atom.data(),
(double)potentials.size());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
} else {
PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n");
}
Expand Down Expand Up @@ -552,7 +553,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
number_of_atoms,
Expand All @@ -561,7 +562,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}

// always correct the force when using the FCP potential
Expand All @@ -574,7 +575,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
number_of_atoms,
Expand All @@ -583,7 +584,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}
}
}
Expand Down Expand Up @@ -647,7 +648,7 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten
s_t[tid] = t;
__syncthreads();

#pragma unroll

for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if (tid < offset) {
s_t[tid] += s_t[tid + offset];
Expand Down Expand Up @@ -754,7 +755,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms * 2,
potential_per_atom.data(),
virial_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

temperature += delta_T;
if (multiple_potentials_mode_.compare("observe") == 0) {
Expand Down Expand Up @@ -805,7 +806,7 @@ void Force::compute(
force_per_atom.data(),
virial_per_atom.data(),
(double)potentials.size());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
} else {
PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n");
}
Expand Down Expand Up @@ -841,7 +842,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
number_of_atoms,
Expand All @@ -850,7 +851,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
} else if (compute_hnemdec_ == 0) {
// the tensor:
// xx xy xz 0 3 4
Expand All @@ -876,10 +877,10 @@ void Force::compute(
virial_per_atom.data() + 8 * number_of_atoms,
virial_per_atom.data() + 2 * number_of_atoms,
tensor_per_atom.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_sum_tensor<<<9, 1024>>>(number_of_atoms, tensor_per_atom.data(), tensor_tot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
number_of_atoms,
Expand All @@ -901,7 +902,7 @@ void Force::compute(
force_per_atom.data(),
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms);
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

} else if (compute_hnemdec_ != -1) {
gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
Expand All @@ -926,7 +927,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL

gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
number_of_atoms,
Expand All @@ -935,7 +936,7 @@ void Force::compute(
force_per_atom.data() + number_of_atoms,
force_per_atom.data() + 2 * number_of_atoms,
ftot.data());
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}
}
}
13 changes: 7 additions & 6 deletions src/force/force_constant.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Use finite difference to calculate the seconod order force constants:
#include "model/box.cuh"
#include "model/group.cuh"
#include "utilities/error.cuh"
#include "utilities/gpu_macro.cuh"
#include <vector>

static __global__ void gpu_shift_atom(const double dx, double* x) { x[0] += dx; }
Expand All @@ -34,13 +35,13 @@ static void shift_atom(

if (beta == 0) {
gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + n2);
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
} else if (beta == 1) {
gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms + n2);
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
} else {
gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms * 2 + n2);
CUDA_CHECK_KERNEL
GPU_CHECK_KERNEL
}
}

Expand All @@ -67,10 +68,10 @@ static void get_f(
box, position_per_atom, type, group, potential_per_atom, force_per_atom, virial_per_atom);

size_t M = sizeof(double);
CHECK(cudaMemcpy(f + 0, force_per_atom.data() + n1, M, cudaMemcpyDeviceToHost));
CHECK(cudaMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, cudaMemcpyDeviceToHost));
CHECK(gpuMemcpy(f + 0, force_per_atom.data() + n1, M, gpuMemcpyDeviceToHost));
CHECK(gpuMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, gpuMemcpyDeviceToHost));
CHECK(
cudaMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, cudaMemcpyDeviceToHost));
gpuMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, gpuMemcpyDeviceToHost));

shift_atom(-dx, n2, beta, position_per_atom);
}
Expand Down
Loading
Loading