From 2d217acfcdb3fc2c258895deb77c68448814c687 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson
Date: Mon, 13 Jan 2025 11:05:25 -0700
Subject: [PATCH] hip - fix missing template, compile values, fn names

---
 backends/hip-shared/ceed-hip-shared-basis.c | 10 ++++---
 .../hip-shared-basis-nontensor-templates.h  | 21 ++++++++-------
 .../hip/hip-shared-basis-nontensor.h        | 27 +++++++++----------
 3 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 63b4c47a99..144af79c2f 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -533,7 +533,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
-      void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
+      void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
 
       {
         CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
@@ -554,7 +554,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       CeedInt block_size = data->block_sizes[2];
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
@@ -723,8 +723,10 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
   const char basis_kernel_source[] = "// Non-tensor basis source\n#include \n";
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
-                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
+  CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
+                                  data->block_sizes[0]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 68e2767090..d394179dfe 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -44,7 +44,8 @@ inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScala
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
 template
-inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     Contract1d(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
@@ -54,8 +55,8 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri
 // Interpolate transpose
 //------------------------------------------------------------------------------
 template
-inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                          CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     r_V[comp] = 0.0;
     ContractTranspose1d(data, &r_U[comp], c_B, &r_V[comp]);
   }
@@ -65,8 +66,8 @@
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template
-inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+template
+inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       Contract1d(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
     }
@@ -77,9 +78,9 @@
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template
-inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                        CeedScalar *__restrict__ r_V) {
+template
+inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -92,6 +93,6 @@ inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *_
 // Quadrature weights
 //------------------------------------------------------------------------------
 template
-inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  *w = (data.t_id_x < Q) ? q_weight_1d[data.t_id_x] : 0.0;
+inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index f9ce1398d8..c892a9c939 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -99,7 +99,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Grad kernels
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
@@ -114,8 +114,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -126,7 +126,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
@@ -141,8 +141,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_V[BASIS_NUM_COMP];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -153,9 +153,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                          CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;
@@ -163,14 +162,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice = &slice[data.t_id_z * T_1D];
+  data.slice = slice + data.t_id_z * T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -184,8 +183,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Weight kernel
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
-    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;