Skip to content

Commit

Permalink
Merge pull request #1724 from CEED/jeremy/fix-hip-shared
Browse files Browse the repository at this point in the history
Fix hip/shared NonTensor
  • Loading branch information
jeremylt authored Jan 13, 2025
2 parents 83153ff + 2d217ac commit d01feaa
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 28 deletions.
10 changes: 6 additions & 4 deletions backends/hip-shared/ceed-hip-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);
void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};

{
CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
Expand All @@ -554,7 +554,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
CeedInt block_size = data->block_sizes[2];

CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

{
Expand Down Expand Up @@ -723,8 +723,10 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
data->block_sizes[0]));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
Expand Down
21 changes: 11 additions & 10 deletions include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScala
// Interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
}
Expand All @@ -54,8 +55,8 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri
// Interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
CeedScalar *__restrict__ r_V) {
inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
r_V[comp] = 0.0;
ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
Expand All @@ -65,8 +66,8 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
//------------------------------------------------------------------------------
// Derivatives at quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
template <int NUM_COMP, int DIM, int P, int Q>
inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
for (CeedInt dim = 0; dim < DIM; dim++) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
Expand All @@ -77,9 +78,9 @@ inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict
//------------------------------------------------------------------------------
// Derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
CeedScalar *__restrict__ r_V) {
template <int NUM_COMP, int DIM, int P, int Q>
inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
for (CeedInt dim = 0; dim < DIM; dim++) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
Expand All @@ -92,6 +93,6 @@ inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *_
// Quadrature weights
//------------------------------------------------------------------------------
template <int Q>
inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
*w = (data.t_id_x < Q) ? q_weight_1d[data.t_id_x] : 0.0;
inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
*w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
}
27 changes: 13 additions & 14 deletions include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

Expand All @@ -114,8 +114,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
Expand All @@ -126,7 +126,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
}
}

extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

Expand All @@ -141,8 +141,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
Expand All @@ -153,24 +153,23 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
}
}

extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
CeedScalar *__restrict__ d_V) {
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

SharedData_Hip data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
data.slice = &slice[data.t_id_z * T_1D];
data.slice = slice + data.t_id_z * T_1D;

CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
Expand All @@ -184,8 +183,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
extern __shared__ CeedScalar slice[];

SharedData_Hip data;
Expand Down

0 comments on commit d01feaa

Please sign in to comment.