Nontensor gen operators #1735

Merged · 4 commits · Feb 3, 2025
236 changes: 143 additions & 93 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Large diffs are not rendered by default.

76 changes: 65 additions & 11 deletions backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -11,6 +11,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stddef.h>
+#include <string.h>

 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -98,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool is_at_points;
+  bool is_at_points, is_tensor;
   Ceed ceed;
   Ceed_Cuda *cuda_data;
   CeedInt num_elem, num_input_fields, num_output_fields;
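
(Note: `is_tensor` is declared at function scope because the launch-geometry code in the final hunk below sets it via `CeedOperatorHasTensorBases` and uses it to choose between the tensor and nontensor kernel launch shapes.)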
@@ -110,16 +111,62 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen *data;

-  // Check for tensor-product bases
+  // Check for shared bases
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
-    bool has_tensor_bases;
+    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;

+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
-    CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
-    // -- Fallback to ref if not all bases are tensor-product
-    if (!has_tensor_bases) {
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
       CeedOperator op_fallback;

-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to unsupported bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
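
The input and output loops above are line-for-line identical apart from the field array they traverse: the gen backend now proceeds only when every basis comes from `/gpu/cuda/shared` and the fields are homogeneous (all tensor-product or all nontensor); anything else falls back to `/gpu/cuda/ref`. A condensed sketch of the same check factored into one helper (the helper name `CheckFieldBases` is hypothetical; every `Ceed*` call is taken verbatim from the diff):

static int CheckFieldBases(CeedInt num_fields, CeedOperatorField *fields, bool *has_shared_bases, bool *is_all_tensor,
                           bool *is_all_nontensor) {
  for (CeedInt i = 0; i < num_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool        is_tensor = true;
      const char *resource;
      char       *resource_root;
      Ceed        basis_ceed;

      // Track whether the bases are homogeneous (all tensor or all nontensor) ...
      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
      *is_all_tensor &= is_tensor;
      *is_all_nontensor &= !is_tensor;
      // ... and whether every basis was created by the /gpu/cuda/shared backend
      CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
      CeedCallBackend(CeedGetResource(basis_ceed, &resource));
      CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
      *has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
      CeedCallBackend(CeedFree(&resource_root));
      CeedCallBackend(CeedDestroy(&basis_ceed));
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  return CEED_ERROR_SUCCESS;
}
// Usage: CheckFieldBases(num_input_fields, op_input_fields, &has_shared_bases, &is_all_tensor, &is_all_nontensor);
//        CheckFieldBases(num_output_fields, op_output_fields, &has_shared_bases, &is_all_tensor, &is_all_nontensor);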
@@ -132,7 +179,6 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

   // Creation of the operator
@@ -232,11 +278,19 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   int           max_threads_per_block, min_grid_size, grid;

+  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
+  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};

-  CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
-                                     cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  } else {
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+
+    grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+    block[2] = elems_per_block;
+  }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);

   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
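
For context on the new else branch: the nontensor path assigns one thread per basis point (`thread_1d` here is presumably the total point count rather than points per dimension, which is why `block[1]` is forced to 1) and stacks elements along the block z-dimension until roughly 512 threads per block are reached. A standalone sketch of that arithmetic with made-up numbers (P, Q, element count, and the z-dimension limit are all hypothetical, not taken from the PR):

#include <stdio.h>

int main(void) {
  const int thread_1d     = 35;    // CeedIntMax(Q_1d, P_1d) for a basis with P = 35 nodes, Q = 24 points
  const int max_threads_z = 64;    // stands in for cuda_data->device_prop.maxThreadsDim[2]
  const int num_elem      = 1000;

  int elems_per_block = 512 / thread_1d;                                // 14
  if (elems_per_block < 1) elems_per_block = 1;                         // CeedIntMax(512 / thread_1d, 1)
  if (elems_per_block > max_threads_z) elems_per_block = max_threads_z; // CeedIntMin(maxThreadsDim[2], ...)

  // Ceiling division over elements, as in the diff
  const int  grid       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  // 72
  const int  block[3]   = {thread_1d, 1, elems_per_block};                                // {35, 1, 14} = 490 threads
  const long shared_mem = (long)block[0] * block[1] * block[2] * sizeof(double);          // 3920 B if CeedScalar is double

  printf("grid %d, block {%d, %d, %d}, shared %ld B\n", grid, block[0], block[1], block[2], shared_mem);
  return 0;
}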
2 changes: 1 addition & 1 deletion backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -132,7 +132,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
   // Input CEED_VECTOR_ACTIVE
   // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
   // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
-  // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+  // Input passive vector with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
   CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
   is_active = l_vec == CEED_VECTOR_ACTIVE;
   CeedCallBackend(CeedVectorDestroy(&l_vec));