Skip to content

Commit

Permalink
Merge pull request #1241 from CEED/jed/fix-vec-size-loop-vars
Browse files Browse the repository at this point in the history
CeedVector/Preconditioning: fix CeedInt loop vars to CeedSize
  • Loading branch information
jedbrown authored Jul 7, 2023
2 parents 3f46b22 + 05c335c commit b3d4ed2
Show file tree
Hide file tree
Showing 14 changed files with 471 additions and 264 deletions.
38 changes: 25 additions & 13 deletions backends/cuda-ref/ceed-cuda-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pb
//------------------------------------------------------------------------------
// Assemble diagonal setup
//------------------------------------------------------------------------------
static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool pointBlock) {
static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool pointBlock, CeedInt use_ceedsize_idx) {
Ceed ceed;
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedQFunction qf;
Expand Down Expand Up @@ -729,8 +729,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const
CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts));
diag->nnodes = nnodes;
CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 5, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES",
nnodes, "NQPTS", nqpts, "NCOMP", ncomp));
CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES",
nnodes, "NQPTS", nqpts, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx));
CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal));
CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock));
CeedCallBackend(CeedFree(&diagonal_kernel_path));
Expand Down Expand Up @@ -798,9 +798,15 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request));
CeedCallBackend(CeedElemRestrictionDestroy(&rstr));

CeedSize assembled_length = 0, assembledqf_length = 0;
CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
CeedCallBackend(CeedVectorGetLength(assembledqf, &assembledqf_length));
CeedInt use_ceedsize_idx = 0;
if ((assembled_length > INT_MAX) || (assembledqf_length > INT_MAX)) use_ceedsize_idx = 1;

// Setup
if (!impl->diag) {
CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock));
CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock, use_ceedsize_idx));
}
CeedOperatorDiag_Cuda *diag = impl->diag;
assert(diag != NULL);
Expand Down Expand Up @@ -873,7 +879,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
//------------------------------------------------------------------------------
// Single operator assembly setup
//------------------------------------------------------------------------------
static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) {
static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
Ceed ceed;
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedOperator_Cuda *impl;
Expand Down Expand Up @@ -985,8 +991,9 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) {
asmb->block_size_x = esize;
asmb->block_size_y = esize;
}
CeedCallCuda(ceed, CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 7, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT",
num_emode_out, "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp));
CeedCallCuda(
ceed, CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", num_emode_out,
"NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx));
CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble));
CeedCallBackend(CeedFree(&assembly_kernel_path));
CeedCallBackend(CeedFree(&assembly_kernel_source));
Expand Down Expand Up @@ -1053,12 +1060,6 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
CeedOperator_Cuda *impl;
CeedCallBackend(CeedOperatorGetData(op, &impl));

// Setup
if (!impl->asmb) {
CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op));
assert(impl->asmb != NULL);
}

// Assemble QFunction
CeedVector assembled_qf = NULL;
CeedElemRestriction rstr_q = NULL;
Expand All @@ -1070,6 +1071,17 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
const CeedScalar *qf_array;
CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array));

CeedSize values_length = 0, assembled_qf_length = 0;
CeedCallBackend(CeedVectorGetLength(values, &values_length));
CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
CeedInt use_ceedsize_idx = 0;
if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
// Setup
if (!impl->asmb) {
CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx));
assert(impl->asmb != NULL);
}

// Compute B^T D B
const CeedInt nelem = impl->asmb->nelem;
const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock;
Expand Down
151 changes: 122 additions & 29 deletions backends/cuda-ref/ceed-cuda-ref-vector.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,18 +277,18 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
//------------------------------------------------------------------------------
// Set host array to value
//------------------------------------------------------------------------------
static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedInt length, CeedScalar val) {
for (int i = 0; i < length; i++) h_array[i] = val;
static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) {
for (CeedSize i = 0; i < length; i++) h_array[i] = val;
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Set device array to value (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedInt length, CeedScalar val);
int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val);

//------------------------------------------------------------------------------
// Set a vector to a value,
// Set a vector to a value
//------------------------------------------------------------------------------
static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
Ceed ceed;
Expand Down Expand Up @@ -449,36 +449,129 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
cublasHandle_t handle;
CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));

#if CUDA_VERSION < 12000
// With CUDA 12, we can use the 64-bit integer interface. Prior to that,
// we need to check if the vector is too long to handle with int32,
// and if so, divide it into subsections for repeated cuBLAS calls
CeedSize num_calls = length / INT_MAX;
if (length % INT_MAX > 0) num_calls += 1;
#endif

// Compute norm
const CeedScalar *d_array;
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
switch (type) {
case CEED_NORM_1: {
*norm = 0.0;
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
CeedCallCublas(ceed, cublasSasum(handle, length, (float *)d_array, 1, (float *)norm));
#if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers
CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
#else
float sub_norm = 0.0;
float *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
*norm += sub_norm;
}
#endif
} else {
CeedCallCublas(ceed, cublasDasum(handle, length, (double *)d_array, 1, (double *)norm));
#if CUDA_VERSION >= 12000
CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
#else
double sub_norm = 0.0;
double *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
*norm += sub_norm;
}
#endif
}
break;
}
case CEED_NORM_2: {
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
CeedCallCublas(ceed, cublasSnrm2(handle, length, (float *)d_array, 1, (float *)norm));
#if CUDA_VERSION >= 12000
CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
#else
float sub_norm = 0.0, norm_sum = 0.0;
float *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
norm_sum += sub_norm * sub_norm;
}
*norm = sqrt(norm_sum);
#endif
} else {
CeedCallCublas(ceed, cublasDnrm2(handle, length, (double *)d_array, 1, (double *)norm));
#if CUDA_VERSION >= 12000
CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
#else
double sub_norm = 0.0, norm_sum = 0.0;
double *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
norm_sum += sub_norm * sub_norm;
}
*norm = sqrt(norm_sum);
#endif
}
break;
}
case CEED_NORM_MAX: {
CeedInt indx;
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
CeedCallCublas(ceed, cublasIsamax(handle, length, (float *)d_array, 1, &indx));
#if CUDA_VERSION >= 12000
int64_t indx;
CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &indx));
CeedScalar normNoAbs;
CeedCallCuda(ceed, cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
*norm = fabs(normNoAbs);
#else
CeedInt indx;
float sub_max = 0.0, current_max = 0.0;
float *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &indx));
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
}
*norm = current_max;
#endif
} else {
CeedCallCublas(ceed, cublasIdamax(handle, length, (double *)d_array, 1, &indx));
#if CUDA_VERSION >= 12000
int64_t indx;
CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &indx));
CeedScalar normNoAbs;
CeedCallCuda(ceed, cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
*norm = fabs(normNoAbs);
#else
CeedInt indx;
double sub_max = 0.0, current_max = 0.0;
double *d_array_start;
for (CeedInt i = 0; i < num_calls; i++) {
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &indx));
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
}
*norm = current_max;
#endif
}
CeedScalar normNoAbs;
CeedCallCuda(ceed, cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
*norm = fabs(normNoAbs);
break;
}
}
Expand All @@ -490,8 +583,8 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
//------------------------------------------------------------------------------
// Take reciprocal of a vector on host
//------------------------------------------------------------------------------
static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedInt length) {
for (int i = 0; i < length; i++) {
static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) {
for (CeedSize i = 0; i < length; i++) {
if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
}
return CEED_ERROR_SUCCESS;
Expand All @@ -500,7 +593,7 @@ static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedInt length) {
//------------------------------------------------------------------------------
// Take reciprocal of a vector on device (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedInt length);
int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length);

//------------------------------------------------------------------------------
// Take reciprocal of a vector
Expand All @@ -523,15 +616,15 @@ static int CeedVectorReciprocal_Cuda(CeedVector vec) {
//------------------------------------------------------------------------------
// Compute x = alpha x on the host
//------------------------------------------------------------------------------
static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedInt length) {
for (int i = 0; i < length; i++) x_array[i] *= alpha;
static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedInt length);
int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length);

//------------------------------------------------------------------------------
// Compute x = alpha x
Expand All @@ -554,15 +647,15 @@ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
//------------------------------------------------------------------------------
// Compute y = alpha x + y on the host
//------------------------------------------------------------------------------
static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) {
for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length);
int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);

//------------------------------------------------------------------------------
// Compute y = alpha x + y
Expand Down Expand Up @@ -592,15 +685,15 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on the host
//------------------------------------------------------------------------------
static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedInt length) {
for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i] + beta * y_array[i];
static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i] + beta * y_array[i];
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedInt length);
int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y
Expand Down Expand Up @@ -630,15 +723,15 @@ static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta,
//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on the host
//------------------------------------------------------------------------------
static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) {
for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
//------------------------------------------------------------------------------
int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length);
int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y
Expand Down
Loading

0 comments on commit b3d4ed2

Please sign in to comment.