From f5aa42b10e4e122931999a8b5c2bb8c6a39d9667 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:08:14 -0700 Subject: [PATCH 01/27] remove assertion --- src/utilities/memory_tracker.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utilities/memory_tracker.c b/src/utilities/memory_tracker.c index 7d35302584..d37afe6937 100644 --- a/src/utilities/memory_tracker.c +++ b/src/utilities/memory_tracker.c @@ -621,10 +621,12 @@ hypre_PrintMemoryTracker( size_t *totl_bytes_o, } + /* for (t = hypre_MEMORY_HOST; t <= hypre_MEMORY_UNIFIED; t++) { hypre_assert(curr_bytes[t] == 0); } + */ } //HYPRE_Real t1 = hypre_MPI_Wtime() - t0; From 2d764b9545288c09bb48309dd9c1f3315f5f7853 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:09:40 -0700 Subject: [PATCH 02/27] add assert(0) when CUDA errors --- src/utilities/_hypre_utilities.hpp | 1 + src/utilities/device_utils.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 67f80446ad..38f23be720 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -558,6 +558,7 @@ using hypre_DeviceItem = sycl::nd_item<3>; if (cudaSuccess != err) { \ printf("CUDA ERROR (code = %d, %s) at %s:%d\n", err, cudaGetErrorString(err), \ __FILE__, __LINE__); \ + hypre_assert(0); \ } } while(0) #elif defined(HYPRE_USING_HIP) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 86e4be87cf..b8702aeee0 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -363,6 +363,7 @@ using hypre_DeviceItem = sycl::nd_item<3>; if (cudaSuccess != err) { \ printf("CUDA ERROR (code = %d, %s) at %s:%d\n", err, cudaGetErrorString(err), \ __FILE__, __LINE__); \ + hypre_assert(0); \ } } while(0) #elif defined(HYPRE_USING_HIP) From d15d70fdc5ac8cad14426c13c22134f55d383d94 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:10:19 -0700 Subject: [PATCH 03/27] update assembly driver --- src/test/ij_assembly.c | 746 +++++++---------------------------------- 1 file changed, 118 insertions(+), 628 deletions(-) diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index 1f2c1d9c34..f3e58f8725 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -36,42 +36,7 @@ HYPRE_Int getParCSRMatrixData(HYPRE_ParCSRMatrix A, HYPRE_Int base, HYPRE_Int * HYPRE_Real checkMatrix(HYPRE_ParCSRMatrix parcsr_ref, HYPRE_IJMatrix ij_A); -HYPRE_Int test_Set(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); - -HYPRE_Int test_AddTranspose(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_AT_ptr); - -HYPRE_Int test_SetSet(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_AT_ptr); - -HYPRE_Int test_AddSet(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_AT_ptr); - -HYPRE_Int test_SetAdd(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_AT_ptr); - -HYPRE_Int test_SetAddSet(MPI_Comm comm, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_AT_ptr); - +HYPRE_Int test_all(MPI_Comm comm, char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, HYPRE_Real shrink_threshold, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); hypre_int main( hypre_int argc, @@ -109,6 +74,10 @@ main( hypre_int argc, HYPRE_Int option, base; HYPRE_Int stencil; HYPRE_Int print_matrix; + HYPRE_Int init_alloc = -1; + HYPRE_Int early_assemble = 0; + HYPRE_Real grow_factor = -1.0; + HYPRE_Real shrink_threshold = -1.0; /* Initialize MPI */ hypre_MPI_Init(&argc, &argv); @@ -151,7 +120,7 @@ main( hypre_int argc, default_exec_policy = HYPRE_EXEC_DEVICE; #endif memory_location = HYPRE_MEMORY_DEVICE; - mode = (1 << 6) - 1; + mode = (1 << 7) - 1; option = 1; nchunks = 3; base = 0; @@ -221,6 +190,26 @@ main( hypre_int argc, arg_index++; base = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-init") == 0 ) + { + arg_index++; + init_alloc = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-early") == 0 ) + { + arg_index++; + early_assemble = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-grow") == 0 ) + { + arg_index++; + grow_factor = (HYPRE_Real) atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-shrink") == 0 ) + { + arg_index++; + shrink_threshold = (HYPRE_Real) atof(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-print") == 0 ) { arg_index++; @@ -275,6 +264,8 @@ main( hypre_int argc, return (0); } + hypre_MemoryTrackerSetPrint(1); + /*----------------------------------------------------------- * Print driver parameters *-----------------------------------------------------------*/ @@ -362,8 +353,8 @@ main( hypre_int argc, /* Test Set */ if (mode & 1) { - test_Set(comm, memory_location, option, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + test_all(comm, "set", memory_location, option, "sA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -373,11 +364,14 @@ main( hypre_int argc, HYPRE_IJMatrixDestroy(ij_A); } - /* Test AddTranspose */ + /* Test AddTranspose + * set values with (row, col) reversed, i.e., the transpose of A + * in this way, we can test off-proc add to values + */ if (mode & 2) { - test_AddTranspose(comm, memory_location, 2, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, rows_coo, cols, coefs, &ij_AT); + test_all(comm, "addtrans", memory_location, 2, "aA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, cols, rows_coo, coefs, &ij_AT); hypre_ParCSRMatrixTranspose(parcsr_ref, &parcsr_trans, 1); @@ -393,8 +387,8 @@ main( hypre_int argc, /* Test Set/Set */ if (mode & 4) { - test_SetSet(comm, memory_location, option, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + test_all(comm, "set/set", memory_location, option, "ssA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -407,8 +401,8 @@ main( hypre_int argc, /* Test Add/Set */ if (mode & 8) { - test_AddSet(comm, memory_location, option, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + test_all(comm, "add/set", memory_location, option, "asA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -421,8 +415,8 @@ main( hypre_int argc, /* Test Set/Add */ if (mode & 16) { - test_SetAdd(comm, memory_location, option, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + test_all(comm, "set/add", memory_location, option, "saA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1); hypre_ParCSRMatrixScale(parcsr_ref2, 2.0); @@ -436,20 +430,38 @@ main( hypre_int argc, HYPRE_ParCSRMatrixDestroy(parcsr_ref2); } - /* Test Set/Add/Set */ + /* Test Set/Add/Assemble/Set */ if (mode & 32) { - test_SetAddSet(comm, memory_location, option, ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + test_all(comm, "set/add/assemble/set", memory_location, option, "saAsA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) { - HYPRE_IJMatrixPrint(ij_A, "ij_SetAddSet"); + HYPRE_IJMatrixPrint(ij_A, "ij_SetAddAssembleSet"); } HYPRE_IJMatrixDestroy(ij_A); } + /* Test Adds */ + if (mode & 64) + { + test_all(comm, "5adds/set", memory_location, option, "aaaaasA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + + hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1); + hypre_ParCSRMatrixScale(parcsr_ref2, 1.); + + ierr += checkMatrix(parcsr_ref2, ij_A) > tol; + if (print_matrix) + { + HYPRE_IJMatrixPrint(ij_A, "ij_5AddsSet"); + } + HYPRE_IJMatrixDestroy(ij_A); + HYPRE_ParCSRMatrixDestroy(parcsr_ref2); + } + /* Print the error code */ hypre_ParPrintf(comm, "Test error code = %d\n", ierr); @@ -726,9 +738,11 @@ checkMatrix(HYPRE_ParCSRMatrix h_parcsr_ref, HYPRE_IJMatrix ij_A) /* set values */ HYPRE_Int -test_Set(MPI_Comm comm, +test_all(MPI_Comm comm, + char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, + char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, @@ -736,6 +750,10 @@ test_Set(MPI_Comm comm, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, + HYPRE_Int init_alloc, + HYPRE_Int early_assemble, + HYPRE_Real grow_factor, + HYPRE_Real shrink_threshold, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, @@ -744,102 +762,10 @@ test_Set(MPI_Comm comm, HYPRE_IJMatrix *ij_A_ptr) { HYPRE_IJMatrix ij_A; - HYPRE_Int i, chunk, chunk_size; - HYPRE_Int time_index; - HYPRE_Int *h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); - - for (i = 1; i < nrows + 1; i++) - { - h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1]; - } - hypre_assert(h_rowptr[nrows] == num_nonzeros); - - HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A); - HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); - HYPRE_IJMatrixSetOMPFlag(ij_A, 1); - - chunk_size = nrows / nchunks; - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStart(); -#endif -#endif - - time_index = hypre_InitializeTiming("Test SetValues"); - hypre_BeginTiming(time_index); - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStop(); -#endif -#endif - - // Finalize timer - hypre_EndTiming(time_index); - hypre_PrintTiming("Test SetValues", hypre_MPI_COMM_WORLD); - hypre_FinalizeTiming(time_index); - hypre_ClearTiming(); - - // Free memory - hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST); - - // Set pointer to matrix - *ij_A_ptr = ij_A; - - return hypre_error_flag; -} - -/* set values with (row, col) reversed, i.e., the transpose of A - * in this way, we can test off-proc set values */ -HYPRE_Int -test_AddTranspose(MPI_Comm comm, - HYPRE_MemoryLocation memory_location, - HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, - HYPRE_BigInt jlower, - HYPRE_BigInt jupper, - HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, - HYPRE_Int nchunks, - HYPRE_Int *h_nnzrow, - HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, - HYPRE_BigInt *cols, - HYPRE_Real *coefs, - HYPRE_IJMatrix *ij_AT_ptr) - - -{ - hypre_assert(option == 2); - - HYPRE_IJMatrix ij_AT; - HYPRE_Int i, chunk, chunk_size; + HYPRE_Int i, j, chunk, chunk_size; HYPRE_Int time_index; HYPRE_Int *h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); + HYPRE_Int cmd_len = strlen(cmd_sequence); for (i = 1; i < nrows + 1; i++) { @@ -847,344 +773,23 @@ test_AddTranspose(MPI_Comm comm, } hypre_assert(h_rowptr[nrows] == num_nonzeros); - HYPRE_IJMatrixCreate(comm, jlower, jupper, ilower, iupper, &ij_AT); - HYPRE_IJMatrixSetObjectType(ij_AT, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize_v2(ij_AT, memory_location); - HYPRE_IJMatrixSetOMPFlag(ij_AT, 1); - - chunk_size = nrows / nchunks; - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStart(); -#endif -#endif - - time_index = hypre_InitializeTiming("Test SetValuesTranspose"); - hypre_BeginTiming(time_index); - - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - HYPRE_IJMatrixAddToValues(ij_AT, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &cols[h_rowptr[chunk]], - &rows[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_AT); - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStop(); -#endif -#endif - - // Finalize timer - hypre_EndTiming(time_index); - hypre_PrintTiming("Test SetValuesTranspose", hypre_MPI_COMM_WORLD); - hypre_FinalizeTiming(time_index); - hypre_ClearTiming(); - - // Free memory - hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST); - - // Set pointer to output - *ij_AT_ptr = ij_AT; - - return hypre_error_flag; -} - -HYPRE_Int -test_SetSet(MPI_Comm comm, - HYPRE_MemoryLocation memory_location, - HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, - HYPRE_BigInt jlower, - HYPRE_BigInt jupper, - HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, - HYPRE_Int nchunks, - HYPRE_Int *h_nnzrow, - HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, - HYPRE_BigInt *cols, - HYPRE_Real *coefs, - HYPRE_IJMatrix *ij_A_ptr) -{ - HYPRE_IJMatrix ij_A; - HYPRE_Int i, chunk, chunk_size; - HYPRE_Int time_index; - HYPRE_Int *h_rowptr; - HYPRE_Real *new_coefs; - - HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A); - HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); - HYPRE_IJMatrixSetOMPFlag(ij_A, 1); - - h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); - for (i = 1; i < nrows + 1; i++) - { - h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1]; - } - hypre_assert(h_rowptr[nrows] == num_nonzeros); - - chunk_size = nrows / nchunks; - new_coefs = hypre_TAlloc(HYPRE_Real, num_nonzeros, memory_location); - - if (hypre_GetActualMemLocation(memory_location) == hypre_MEMORY_HOST) - { - for (i = 0; i < num_nonzeros; i++) - { - new_coefs[i] = 2.0 * coefs[i]; - } - } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - hypre_TMemcpy(new_coefs, coefs, HYPRE_Real, num_nonzeros, memory_location, memory_location); - hypreDevice_ComplexScalen(new_coefs, num_nonzeros, new_coefs, 2.0); - } -#endif - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStart(); -#endif -#endif - - // First Set - time_index = hypre_InitializeTiming("Test Set/Set"); - hypre_BeginTiming(time_index); - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &new_coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &new_coefs[h_rowptr[chunk]]); - } - } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - - // Second set - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStop(); -#endif -#endif - - // Finalize timer - hypre_EndTiming(time_index); - hypre_PrintTiming("Test Set/Set", hypre_MPI_COMM_WORLD); - hypre_FinalizeTiming(time_index); - hypre_ClearTiming(); - - // Free memory - hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST); - hypre_TFree(new_coefs, memory_location); - - // Set pointer to matrix - *ij_A_ptr = ij_A; - - return hypre_error_flag; -} - -HYPRE_Int -test_AddSet(MPI_Comm comm, - HYPRE_MemoryLocation memory_location, - HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, - HYPRE_BigInt jlower, - HYPRE_BigInt jupper, - HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, - HYPRE_Int nchunks, - HYPRE_Int *h_nnzrow, - HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, - HYPRE_BigInt *cols, - HYPRE_Real *coefs, - HYPRE_IJMatrix *ij_A_ptr) -{ - HYPRE_IJMatrix ij_A; - HYPRE_Int i, chunk, chunk_size; - HYPRE_Int time_index; - HYPRE_Int *h_rowptr; - HYPRE_Real *new_coefs; - HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A); HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); HYPRE_IJMatrixSetOMPFlag(ij_A, 1); - - h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); - for (i = 1; i < nrows + 1; i++) - { - h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1]; - } - hypre_assert(h_rowptr[nrows] == num_nonzeros); - - chunk_size = nrows / nchunks; - new_coefs = hypre_TAlloc(HYPRE_Real, num_nonzeros, memory_location); - - if (hypre_GetActualMemLocation(memory_location) == hypre_MEMORY_HOST) - { - for (i = 0; i < num_nonzeros; i++) - { - new_coefs[i] = 2.0 * coefs[i]; - } - } -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else - { - hypre_TMemcpy(new_coefs, coefs, HYPRE_Real, num_nonzeros, memory_location, memory_location); - hypreDevice_ComplexScalen(new_coefs, num_nonzeros, new_coefs, 2.0); - } -#endif - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStart(); -#endif -#endif - - // First Add - time_index = hypre_InitializeTiming("Test Add/Set"); - hypre_BeginTiming(time_index); - for (chunk = 0; chunk < nrows; chunk += chunk_size) + if (init_alloc >= 0) { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixAddToValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &new_coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixAddToValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &new_coefs[h_rowptr[chunk]]); - } + HYPRE_IJMatrixSetInitAllocation(ij_A, init_alloc); } - - // Then Set - for (chunk = 0; chunk < nrows; chunk += chunk_size) + HYPRE_IJMatrixSetEarlyAssemble(ij_A, early_assemble); + if (grow_factor > 0) { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } + HYPRE_IJMatrixSetGrowFactor(ij_A, grow_factor); } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStop(); -#endif -#endif - - // Finalize timer - hypre_EndTiming(time_index); - hypre_PrintTiming("Test Add/Set", hypre_MPI_COMM_WORLD); - hypre_FinalizeTiming(time_index); - hypre_ClearTiming(); - - // Free memory - hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST); - hypre_TFree(new_coefs, memory_location); - - // Set pointer to matrix - *ij_A_ptr = ij_A; - - return hypre_error_flag; -} - -HYPRE_Int -test_SetAdd(MPI_Comm comm, - HYPRE_MemoryLocation memory_location, - HYPRE_Int option, - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, - HYPRE_BigInt jlower, - HYPRE_BigInt jupper, - HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, - HYPRE_Int nchunks, - HYPRE_Int *h_nnzrow, - HYPRE_Int *nnzrow, - HYPRE_BigInt *rows, - HYPRE_BigInt *cols, - HYPRE_Real *coefs, - HYPRE_IJMatrix *ij_A_ptr) -{ - HYPRE_IJMatrix ij_A; - HYPRE_Int i, chunk, chunk_size; - HYPRE_Int time_index; - HYPRE_Int *h_rowptr; - - HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A); - HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); - HYPRE_IJMatrixSetOMPFlag(ij_A, 1); - - h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); - for (i = 1; i < nrows + 1; i++) + if (shrink_threshold >= 0) { - h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1]; + HYPRE_IJMatrixSetShrinkThreshold(ij_A, shrink_threshold); } - hypre_assert(h_rowptr[nrows] == num_nonzeros); chunk_size = nrows / nchunks; @@ -1195,47 +800,52 @@ test_SetAdd(MPI_Comm comm, #endif #endif - // First Set - time_index = hypre_InitializeTiming("Test Set/Add"); + time_index = hypre_InitializeTiming(test_name); hypre_BeginTiming(time_index); - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - - // Second Add - for (chunk = 0; chunk < nrows; chunk += chunk_size) + for (j = 0; j < cmd_len; j++) { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) + if (cmd_sequence[j] == 's' || cmd_sequence[j] == 'a') { - HYPRE_IJMatrixAddToValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + for (chunk = 0; chunk < nrows; chunk += chunk_size) + { + chunk_size = hypre_min(chunk_size, nrows - chunk); + if (1 == option) + { + if (cmd_sequence[j] == 's') + { + HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], + &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + } + else + { + HYPRE_IJMatrixAddToValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], + &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + } + } + else + { + if (cmd_sequence[j] == 's') + { + HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], + NULL, &rows[h_rowptr[chunk]], + &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + } + else + { + HYPRE_IJMatrixAddToValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], + NULL, &rows[h_rowptr[chunk]], + &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + } + } + } } - else + else if (cmd_sequence[j] == 'A') { - HYPRE_IJMatrixAddToValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); + HYPRE_IJMatrixAssemble(ij_A); } } - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - #if defined(HYPRE_USING_GPU) hypre_SyncCudaDevice(hypre_handle()); #if defined(CUDA_PROFILER) @@ -1245,7 +855,7 @@ test_SetAdd(MPI_Comm comm, // Finalize timer hypre_EndTiming(time_index); - hypre_PrintTiming("Test Set/Add", hypre_MPI_COMM_WORLD); + hypre_PrintTiming(test_name, hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); @@ -1258,127 +868,7 @@ test_SetAdd(MPI_Comm comm, return hypre_error_flag; } -HYPRE_Int -test_SetAddSet(MPI_Comm comm, - HYPRE_MemoryLocation memory_location, - HYPRE_Int option, /* 1 or 2 */ - HYPRE_BigInt ilower, - HYPRE_BigInt iupper, - HYPRE_BigInt jlower, - HYPRE_BigInt jupper, - HYPRE_Int nrows, - HYPRE_BigInt num_nonzeros, - HYPRE_Int nchunks, - HYPRE_Int *h_nnzrow, - HYPRE_Int *nnzrow, - HYPRE_BigInt - *rows, /* option = 1: length of nrows, = 2: length of num_nonzeros */ - HYPRE_BigInt *cols, - HYPRE_Real *coefs, - HYPRE_IJMatrix *ij_A_ptr) -{ - HYPRE_IJMatrix ij_A; - HYPRE_Int i, chunk, chunk_size; - HYPRE_Int time_index; - HYPRE_Int *h_rowptr; - - HYPRE_IJMatrixCreate(comm, ilower, iupper, jlower, jupper, &ij_A); - HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); - HYPRE_IJMatrixSetOMPFlag(ij_A, 1); - - h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); - for (i = 1; i < nrows + 1; i++) - { - h_rowptr[i] = h_rowptr[i - 1] + h_nnzrow[i - 1]; - } - hypre_assert(h_rowptr[nrows] == num_nonzeros); - chunk_size = nrows / nchunks; - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStart(); -#endif -#endif - - // First Set - time_index = hypre_InitializeTiming("Test Set/Add/Set"); - hypre_BeginTiming(time_index); - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - // Then Add - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - if (1 == option) - { - HYPRE_IJMatrixAddToValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixAddToValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - // Then Set - for (chunk = 0; chunk < nrows; chunk += chunk_size) - { - chunk_size = hypre_min(chunk_size, nrows - chunk); - - if (1 == option) - { - HYPRE_IJMatrixSetValues(ij_A, chunk_size, &nnzrow[chunk], &rows[chunk], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - else - { - HYPRE_IJMatrixSetValues(ij_A, h_rowptr[chunk + chunk_size] - h_rowptr[chunk], - NULL, &rows[h_rowptr[chunk]], - &cols[h_rowptr[chunk]], &coefs[h_rowptr[chunk]]); - } - } - - // Assemble matrix - HYPRE_IJMatrixAssemble(ij_A); - -#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); -#if defined(CUDA_PROFILER) - cudaProfilerStop(); -#endif -#endif - - // Finalize timer - hypre_EndTiming(time_index); - hypre_PrintTiming("Test Set/Add/Set", hypre_MPI_COMM_WORLD); - hypre_FinalizeTiming(time_index); - hypre_ClearTiming(); - - // Free memory - hypre_TFree(h_rowptr, HYPRE_MEMORY_HOST); - - // Set pointer to matrix - *ij_A_ptr = ij_A; - - return hypre_error_flag; -} From e43e11eb3d0316279871bef71c0e13909f78cdcf Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:11:19 -0700 Subject: [PATCH 04/27] update headers --- src/IJ_mv/HYPRE_IJ_mv.h | 12 ++++++++++++ src/IJ_mv/aux_parcsr_matrix.h | 8 ++++++-- src/IJ_mv/protos.h | 10 ++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/IJ_mv/HYPRE_IJ_mv.h b/src/IJ_mv/HYPRE_IJ_mv.h index 2c9c1edf48..ddc947fe91 100644 --- a/src/IJ_mv/HYPRE_IJ_mv.h +++ b/src/IJ_mv/HYPRE_IJ_mv.h @@ -295,6 +295,18 @@ HYPRE_Int HYPRE_IJMatrixSetDiagOffdSizes(HYPRE_IJMatrix matrix, HYPRE_Int HYPRE_IJMatrixSetMaxOffProcElmts(HYPRE_IJMatrix matrix, HYPRE_Int max_off_proc_elmts); +HYPRE_Int HYPRE_IJMatrixSetInitAllocation(HYPRE_IJMatrix matrix, + HYPRE_Int factor); + +HYPRE_Int HYPRE_IJMatrixSetEarlyAssemble(HYPRE_IJMatrix matrix, + HYPRE_Int early_assemble); + +HYPRE_Int HYPRE_IJMatrixSetGrowFactor(HYPRE_IJMatrix matrix, + HYPRE_Real factor); + +HYPRE_Int HYPRE_IJMatrixSetShrinkThreshold(HYPRE_IJMatrix matrix, + HYPRE_Real threshold); + /** * (Optional) Sets the print level, if the user wants to print * error messages. The default is 0, i.e. no error messages are printed. diff --git a/src/IJ_mv/aux_parcsr_matrix.h b/src/IJ_mv/aux_parcsr_matrix.h index 05ab86b7e5..a86bfffc40 100644 --- a/src/IJ_mv/aux_parcsr_matrix.h +++ b/src/IJ_mv/aux_parcsr_matrix.h @@ -74,8 +74,10 @@ typedef struct char *stack_sora; /* Set (1) or Add (0) */ HYPRE_Int usr_on_proc_elmts; /* user given num elmt on-proc */ HYPRE_Int usr_off_proc_elmts; /* user given num elmt off-proc */ - HYPRE_BigInt init_alloc_factor; - HYPRE_BigInt grow_factor; + HYPRE_Int early_assemble; + HYPRE_Int init_alloc_factor; + HYPRE_Real grow_factor; + HYPRE_Real shrink_threshold; #endif } hypre_AuxParCSRMatrix; @@ -118,8 +120,10 @@ typedef struct #define hypre_AuxParCSRMatrixStackSorA(matrix) ((matrix) -> stack_sora) #define hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) ((matrix) -> usr_on_proc_elmts) #define hypre_AuxParCSRMatrixUsrOffProcElmts(matrix) ((matrix) -> usr_off_proc_elmts) +#define hypre_AuxParCSRMatrixEarlyAssemble(matrix) ((matrix) -> early_assemble) #define hypre_AuxParCSRMatrixInitAllocFactor(matrix) ((matrix) -> init_alloc_factor) #define hypre_AuxParCSRMatrixGrowFactor(matrix) ((matrix) -> grow_factor) +#define hypre_AuxParCSRMatrixShrinkThreshold(matrix) ((matrix) -> shrink_threshold) #endif #endif /* #ifndef hypre_AUX_PARCSR_MATRIX_HEADER */ diff --git a/src/IJ_mv/protos.h b/src/IJ_mv/protos.h index 524277c128..502b753ce7 100644 --- a/src/IJ_mv/protos.h +++ b/src/IJ_mv/protos.h @@ -67,6 +67,14 @@ HYPRE_Int hypre_IJMatrixSetDiagOffdSizesParCSR ( hypre_IJMatrix *matrix, const HYPRE_Int *diag_sizes, const HYPRE_Int *offdiag_sizes ); HYPRE_Int hypre_IJMatrixSetMaxOffProcElmtsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int max_off_proc_elmts ); +HYPRE_Int hypre_IJMatrixSetInitAllocationParCSR(hypre_IJMatrix *matrix, + HYPRE_Int factor); +HYPRE_Int hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, + HYPRE_Int early_assemble); +HYPRE_Int hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, + HYPRE_Real factor); +HYPRE_Int hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, + HYPRE_Real threshold); HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); @@ -108,6 +116,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_Int hypre_IJMatrixSetConstantValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Complex value ); +HYPRE_Int hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); + /* IJMatrix_petsc.c */ HYPRE_Int hypre_IJMatrixSetLocalSizePETSc ( hypre_IJMatrix *matrix, HYPRE_Int local_m, HYPRE_Int local_n ); From 78f58fb9fb35edc2f1e79d4a9568f63274571bbc Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:12:19 -0700 Subject: [PATCH 05/27] apis --- src/IJ_mv/HYPRE_IJMatrix.c | 112 ++++++++++++++++++++++++++++++++++++ src/IJ_mv/IJMatrix_parcsr.c | 112 ++++++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) diff --git a/src/IJ_mv/HYPRE_IJMatrix.c b/src/IJ_mv/HYPRE_IJMatrix.c index 3b7cc08b99..1dcd049d31 100644 --- a/src/IJ_mv/HYPRE_IJMatrix.c +++ b/src/IJ_mv/HYPRE_IJMatrix.c @@ -1009,6 +1009,118 @@ HYPRE_IJMatrixSetMaxOffProcElmts( HYPRE_IJMatrix matrix, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetInitAllocation(hypre_IJMatrix *matrix, + HYPRE_Int factor) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return ( hypre_IJMatrixSetInitAllocationParCSR(ijmatrix, + factor) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetEarlyAssemble(hypre_IJMatrix *matrix, + HYPRE_Int early_assemble) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return ( hypre_IJMatrixSetEarlyAssembleParCSR(ijmatrix, + early_assemble) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetGrowFactor(hypre_IJMatrix *matrix, + HYPRE_Real factor) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return ( hypre_IJMatrixSetGrowFactorParCSR(ijmatrix, + factor) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetShrinkThreshold(hypre_IJMatrix *matrix, + HYPRE_Real threshold) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return ( hypre_IJMatrixSetShrinkThresholdParCSR(ijmatrix, + threshold) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- * HYPRE_IJMatrixRead * diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 15abab4139..055f39aaa0 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -242,6 +242,118 @@ hypre_IJMatrixSetMaxOffProcElmtsParCSR(hypre_IJMatrix *matrix, return hypre_error_flag; } +/****************************************************************************** + * + * hypre_IJMatrixSetInitAllocationParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetInitAllocationParCSR(hypre_IJMatrix *matrix, + HYPRE_Int factor) +{ +#if defined(HYPRE_USING_GPU) + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + + if (!aux_matrix) + { + HYPRE_Int local_num_rows = (HYPRE_Int)(row_partitioning[1] - row_partitioning[0]); + HYPRE_Int local_num_cols = (HYPRE_Int)(col_partitioning[1] - col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } + hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix) = factor; +#endif + + return hypre_error_flag; +} + +/****************************************************************************** + * + * hypre_IJMatrixSetEarlyAssembleParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, + HYPRE_Int early_assemble) +{ +#if defined(HYPRE_USING_GPU) + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + + if (!aux_matrix) + { + HYPRE_Int local_num_rows = (HYPRE_Int)(row_partitioning[1] - row_partitioning[0]); + HYPRE_Int local_num_cols = (HYPRE_Int)(col_partitioning[1] - col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } + hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; +#endif + + return hypre_error_flag; +} + +/****************************************************************************** + * + * hypre_IJMatrixSetGrowFactorParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, + HYPRE_Real factor) +{ +#if defined(HYPRE_USING_GPU) + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + + if (!aux_matrix) + { + HYPRE_Int local_num_rows = (HYPRE_Int)(row_partitioning[1] - row_partitioning[0]); + HYPRE_Int local_num_cols = (HYPRE_Int)(col_partitioning[1] - col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } + hypre_AuxParCSRMatrixGrowFactor(aux_matrix) = factor; +#endif + + return hypre_error_flag; +} + +/****************************************************************************** + * + * hypre_IJMatrixSetShrinkThresholdParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, + HYPRE_Real threshold) +{ +#if defined(HYPRE_USING_GPU) + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + + if (!aux_matrix) + { + HYPRE_Int local_num_rows = (HYPRE_Int)(row_partitioning[1] - row_partitioning[0]); + HYPRE_Int local_num_cols = (HYPRE_Int)(col_partitioning[1] - col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } + hypre_AuxParCSRMatrixShrinkThreshold(aux_matrix) = threshold; +#endif + + return hypre_error_flag; +} + /****************************************************************************** * * hypre_IJMatrixInitializeParCSR From e8cf56c84442d9cb6578b2cf9254255add4f5c28 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:12:41 -0700 Subject: [PATCH 06/27] add defaults --- src/IJ_mv/aux_parcsr_matrix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/IJ_mv/aux_parcsr_matrix.c b/src/IJ_mv/aux_parcsr_matrix.c index 472108327c..cdb21f748f 100644 --- a/src/IJ_mv/aux_parcsr_matrix.c +++ b/src/IJ_mv/aux_parcsr_matrix.c @@ -61,8 +61,10 @@ hypre_AuxParCSRMatrixCreate( hypre_AuxParCSRMatrix **aux_matrix, hypre_AuxParCSRMatrixStackSorA(matrix) = NULL; hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) = -1; hypre_AuxParCSRMatrixUsrOffProcElmts(matrix) = -1; - hypre_AuxParCSRMatrixInitAllocFactor(matrix) = 5; - hypre_AuxParCSRMatrixGrowFactor(matrix) = 2; + hypre_AuxParCSRMatrixInitAllocFactor(matrix) = 0; + hypre_AuxParCSRMatrixEarlyAssemble(matrix) = 0; + hypre_AuxParCSRMatrixGrowFactor(matrix) = 2.0; + hypre_AuxParCSRMatrixShrinkThreshold(matrix) = 0.25; #endif *aux_matrix = matrix; From bf1344cf2bd7f6f250f6464569264de814dbb678 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:13:04 -0700 Subject: [PATCH 07/27] header --- src/IJ_mv/_hypre_IJ_mv.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/IJ_mv/_hypre_IJ_mv.h b/src/IJ_mv/_hypre_IJ_mv.h index f69c5b6ba5..9ac09e7bb6 100644 --- a/src/IJ_mv/_hypre_IJ_mv.h +++ b/src/IJ_mv/_hypre_IJ_mv.h @@ -89,8 +89,10 @@ typedef struct char *stack_sora; /* Set (1) or Add (0) */ HYPRE_Int usr_on_proc_elmts; /* user given num elmt on-proc */ HYPRE_Int usr_off_proc_elmts; /* user given num elmt off-proc */ - HYPRE_BigInt init_alloc_factor; - HYPRE_BigInt grow_factor; + HYPRE_Int early_assemble; + HYPRE_Int init_alloc_factor; + HYPRE_Real grow_factor; + HYPRE_Real shrink_threshold; #endif } hypre_AuxParCSRMatrix; @@ -133,8 +135,10 @@ typedef struct #define hypre_AuxParCSRMatrixStackSorA(matrix) ((matrix) -> stack_sora) #define hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) ((matrix) -> usr_on_proc_elmts) #define hypre_AuxParCSRMatrixUsrOffProcElmts(matrix) ((matrix) -> usr_off_proc_elmts) +#define hypre_AuxParCSRMatrixEarlyAssemble(matrix) ((matrix) -> early_assemble) #define hypre_AuxParCSRMatrixInitAllocFactor(matrix) ((matrix) -> init_alloc_factor) #define hypre_AuxParCSRMatrixGrowFactor(matrix) ((matrix) -> grow_factor) +#define hypre_AuxParCSRMatrixShrinkThreshold(matrix) ((matrix) -> shrink_threshold) #endif #endif /* #ifndef hypre_AUX_PARCSR_MATRIX_HEADER */ @@ -437,6 +441,14 @@ HYPRE_Int hypre_IJMatrixSetDiagOffdSizesParCSR ( hypre_IJMatrix *matrix, const HYPRE_Int *diag_sizes, const HYPRE_Int *offdiag_sizes ); HYPRE_Int hypre_IJMatrixSetMaxOffProcElmtsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int max_off_proc_elmts ); +HYPRE_Int hypre_IJMatrixSetInitAllocationParCSR(hypre_IJMatrix *matrix, + HYPRE_Int factor); +HYPRE_Int hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, + HYPRE_Int early_assemble); +HYPRE_Int hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, + HYPRE_Real factor); +HYPRE_Int hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, + HYPRE_Real threshold); HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); @@ -478,6 +490,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_Int hypre_IJMatrixSetConstantValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Complex value ); +HYPRE_Int hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); + /* IJMatrix_petsc.c */ HYPRE_Int hypre_IJMatrixSetLocalSizePETSc ( hypre_IJMatrix *matrix, HYPRE_Int local_m, HYPRE_Int local_n ); From 2d189f2b37de0568278bb7d42d90ae6513ee873d Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:13:31 -0700 Subject: [PATCH 08/27] main changes --- src/IJ_mv/IJMatrix_parcsr_device.c | 505 +++++++++++++++++------------ 1 file changed, 295 insertions(+), 210 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 199c6fd191..de211c0a59 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -38,6 +38,45 @@ hypreGPUKernel_IJMatrixValues_dev1(hypre_DeviceItem &item, HYPRE_Int n, HYPRE_In } } +HYPRE_Int +hypre_AuxParCSRMatrixStackReallocate(hypre_AuxParCSRMatrix *aux_matrix, + HYPRE_BigInt new_stack_max) +{ + HYPRE_BigInt stack_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); + + hypre_AuxParCSRMatrixStackI(aux_matrix) = hypre_TReAlloc_v2(hypre_AuxParCSRMatrixStackI(aux_matrix), HYPRE_BigInt, + stack_max, HYPRE_BigInt, new_stack_max, HYPRE_MEMORY_DEVICE); + + hypre_AuxParCSRMatrixStackJ(aux_matrix) = hypre_TReAlloc_v2(hypre_AuxParCSRMatrixStackJ(aux_matrix), HYPRE_BigInt, + stack_max, HYPRE_BigInt, new_stack_max, HYPRE_MEMORY_DEVICE); + + hypre_AuxParCSRMatrixStackData(aux_matrix) = hypre_TReAlloc_v2(hypre_AuxParCSRMatrixStackData(aux_matrix), HYPRE_Complex, + stack_max, HYPRE_Complex, new_stack_max, HYPRE_MEMORY_DEVICE); + + hypre_AuxParCSRMatrixStackSorA(aux_matrix) = hypre_TReAlloc_v2(hypre_AuxParCSRMatrixStackSorA(aux_matrix), char, + stack_max, char, new_stack_max, HYPRE_MEMORY_DEVICE); + + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix) = new_stack_max; + + return hypre_error_flag; +} + +inline void +hypre_AuxParCSRMatrixStackPrintInfo(hypre_IJMatrix *matrix) +{ + HYPRE_Int myid; + static HYPRE_Int counter = 0; + hypre_MPI_Comm_rank(hypre_IJMatrixComm(matrix), &myid ); + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + + counter ++; + //hypre_printf(" IJMatrixSetAddValues: PID %d: max %d, size %d\n", myid, + //hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix)), hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix)); + hypre_printf(" %d, %d, %d\n", counter, + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix), + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix)); +} + /* E.g. nrows = 3 * ncols = 2 3 4 * rows = 10 20 30 @@ -71,6 +110,8 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Int nelms; HYPRE_Int *row_ptr = NULL; + HYPRE_Int early_assemble = hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix); + HYPRE_Int early_assemble_flag = 0; /* expand rows into full expansion of rows based on ncols * if ncols == NULL, ncols is all ones, so rows are indeed full expansion */ @@ -78,6 +119,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, { row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); hypre_TMemcpy(row_ptr, ncols, HYPRE_Int, nrows, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + /* RL: have to init the last entry !!! */ hypre_Memset(row_ptr + nrows, 0, sizeof(HYPRE_Int), HYPRE_MEMORY_DEVICE); hypreDevice_IntegerExclusiveScan(nrows + 1, row_ptr); @@ -111,29 +153,40 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, if ( stack_elmts_max < stack_elmts_required ) { - HYPRE_BigInt stack_elmts_max_new = - hypre_max(hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix), 0) + - hypre_max(hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix), 0); + HYPRE_BigInt stack_elmts_max_new = 0; - if ( hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix) < 0 || - hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix) < 0 ) + if (stack_elmts_max == 0) { - stack_elmts_max_new = hypre_max(num_local_rows * hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix), - stack_elmts_max_new); - stack_elmts_max_new = hypre_max(stack_elmts_max * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), - stack_elmts_max_new); + /* intial allocation */ + if ( hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix) > 0 && hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix) > 0 ) + { + stack_elmts_max_new = hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix) + hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix); + } + else + { + stack_elmts_max_new = num_local_rows * hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix); + } + stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); } - stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); - - hypre_AuxParCSRMatrixStackI(aux_matrix) = stack_i = hypre_TReAlloc_v2(stack_i, - HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackJ(aux_matrix) = stack_j = hypre_TReAlloc_v2(stack_j, - HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackData(aux_matrix) = stack_data = hypre_TReAlloc_v2(stack_data, - HYPRE_Complex, stack_elmts_max, HYPRE_Complex, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackSorA(aux_matrix) = stack_sora = hypre_TReAlloc_v2(stack_sora, - char, stack_elmts_max, char, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix) = stack_elmts_max_new; + else + { + if (early_assemble) + { + stack_elmts_max_new = stack_elmts_required; + early_assemble_flag = 1; + } + else + { + stack_elmts_max_new = stack_elmts_required * hypre_AuxParCSRMatrixGrowFactor(aux_matrix); + } + } + + hypre_AuxParCSRMatrixStackReallocate(aux_matrix, stack_elmts_max_new); + stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + stack_elmts_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); } hypreDevice_CharFilln(stack_sora + stack_elmts_current, nelms, SorA); @@ -207,7 +260,32 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_MEMORY_DEVICE); } - hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) += (HYPRE_BigInt) nelms; + stack_elmts_current += (HYPRE_BigInt) nelms; + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = stack_elmts_current; + + hypre_AuxParCSRMatrixStackPrintInfo(matrix); + + if (early_assemble_flag) + { + hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 0); + + stack_elmts_current = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + hypre_assert(stack_elmts_max == hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix)); + + HYPRE_BigInt stack_elmts_max_new = hypre_max(stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), stack_elmts_max); + + if (stack_elmts_current < stack_elmts_max_new * hypre_AuxParCSRMatrixShrinkThreshold(aux_matrix)) + { + stack_elmts_max_new = stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix); + } + + if (stack_elmts_max_new != stack_elmts_max) + { + hypre_AuxParCSRMatrixStackReallocate(aux_matrix, stack_elmts_max_new); + } + + hypre_AuxParCSRMatrixStackPrintInfo(matrix); + } hypre_TFree(row_ptr, HYPRE_MEMORY_DEVICE); @@ -241,25 +319,41 @@ struct hypre_IJMatrixAssembleFunctor : public }; #endif -/* helper routine used in hypre_IJMatrixAssembleParCSRDevice: +/* This helper routine is used in hypre_IJMatrixAssembleParCSRDevice on on-proc entries: * 1. sort (X0, A0) with key (I0, J0) - * [put the diagonal first; see the comments in cuda_utils.c] + * put the diagonal first by hypreDevice_StableSortTupleByTupleKey(..., 2) + * see the comments in cuda_utils.c * 2. for each segment in (I0, J0), zero out in A0 all before the last `set' * 3. reduce A0 [with sum] and reduce X0 [with max] - * N0: input size; N1: size after reduction (<= N0) - * Note: (I1, J1, X1, A1) are not resized to N1 but have size N0 + * The reason of using max for X0 is that once an entry has a set (1), which may be + * combined with adds, should continue to be considered as a set. + * This is the correct behavior when combined with existing CSR + * On entry + * N: size of I, J, X, A + * On return + * N: the number of entries in I, J, X, A, after reduction + * Note: (I, J, X, A) have length "size" */ HYPRE_Int -hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_BigInt *J0, char *X0, - HYPRE_Complex *A0, - HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_BigInt **J1, char **X1, HYPRE_Complex **A1 ) +hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int *Nptr, + HYPRE_BigInt **Iptr, + HYPRE_BigInt **Jptr, + char **Xptr, + HYPRE_Complex **Aptr, + HYPRE_Int size) { + HYPRE_Int N0 = *Nptr; + HYPRE_BigInt *I0 = *Iptr; + HYPRE_BigInt *J0 = *Jptr; + char *X0 = *Xptr; + HYPRE_Complex *A0 = *Aptr; + hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, 2); - HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE); - HYPRE_BigInt *J = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE); - char *X = hypre_TAlloc(char, N0, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); + HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, size, HYPRE_MEMORY_DEVICE); + HYPRE_BigInt *J = hypre_TAlloc(HYPRE_BigInt, size, HYPRE_MEMORY_DEVICE); + char *X = hypre_TAlloc(char, size, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, size, HYPRE_MEMORY_DEVICE); /* dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); @@ -313,7 +407,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big std::equal_to< std::tuple >(), /* binary_pred */ hypre_IJMatrixAssembleFunctor() /* binary_op */); - *N1 = std::get<0>(new_end.first.base()) - I; + *Nptr = std::get<0>(new_end.first.base()) - I; #else HYPRE_THRUST_CALL( exclusive_scan_by_key, @@ -337,12 +431,18 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big thrust::equal_to< thrust::tuple >(), /* binary_pred */ hypre_IJMatrixAssembleFunctor() /* binary_op */); - *N1 = thrust::get<0>(new_end.first.get_iterator_tuple()) - I; + *Nptr = thrust::get<0>(new_end.first.get_iterator_tuple()) - I; #endif - *I1 = I; - *J1 = J; - *X1 = X; - *A1 = A; + + hypre_TFree(I0, HYPRE_MEMORY_DEVICE); + hypre_TFree(J0, HYPRE_MEMORY_DEVICE); + hypre_TFree(X0, HYPRE_MEMORY_DEVICE); + hypre_TFree(A0, HYPRE_MEMORY_DEVICE); + + *Iptr = I; + *Jptr = J; + *Xptr = X; + *Aptr = A; return hypre_error_flag; } @@ -382,16 +482,28 @@ struct hypre_IJMatrixAssembleFunctor2 : public }; #endif +/* This helper routine is for combining new entries with existing CSR. + * Opt = 2 for diag part to keep diagonal first + = 0 for offd part + */ HYPRE_Int -hypre_IJMatrixAssembleSortAndReduce2(HYPRE_Int N0, HYPRE_Int *I0, HYPRE_Int *J0, char *X0, - HYPRE_Complex *A0, - HYPRE_Int *N1, HYPRE_Int **I1, HYPRE_Int **J1, HYPRE_Complex **A1, - HYPRE_Int opt ) +hypre_IJMatrixAssembleSortAndReduce2(HYPRE_Int *Nptr, + HYPRE_Int **Iptr, + HYPRE_Int **Jptr, + char *X0, + HYPRE_Complex **Aptr, + HYPRE_Int opt) { + HYPRE_Int N0 = *Nptr; + HYPRE_Int *I0 = *Iptr; + HYPRE_Int *J0 = *Jptr; + HYPRE_Complex *A0 = *Aptr; + hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, opt); HYPRE_Int *I = hypre_TAlloc(HYPRE_Int, N0, HYPRE_MEMORY_DEVICE); HYPRE_Int *J = hypre_TAlloc(HYPRE_Int, N0, HYPRE_MEMORY_DEVICE); + /* RL: no need to have X. No use before the free at the end */ char *X = hypre_TAlloc(char, N0, HYPRE_MEMORY_DEVICE); HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); @@ -405,7 +517,7 @@ hypre_IJMatrixAssembleSortAndReduce2(HYPRE_Int N0, HYPRE_Int *I0, HYPRE_Int * std::equal_to< std::tuple >(), /* binary_pred */ hypre_IJMatrixAssembleFunctor2() /* binary_op */); - *N1 = std::get<0>(new_end.first.base()) - I; + HYPRE_Int N = std::get<0>(new_end.first.base()) - I; #else auto new_end = HYPRE_THRUST_CALL( reduce_by_key, @@ -417,21 +529,44 @@ hypre_IJMatrixAssembleSortAndReduce2(HYPRE_Int N0, HYPRE_Int *I0, HYPRE_Int * thrust::equal_to< thrust::tuple >(), /* binary_pred */ hypre_IJMatrixAssembleFunctor2() /* binary_op */); - *N1 = thrust::get<0>(new_end.first.get_iterator_tuple()) - I; + HYPRE_Int N = thrust::get<0>(new_end.first.get_iterator_tuple()) - I; #endif - *I1 = I; - *J1 = J; - *A1 = A; - hypre_TFree(X, HYPRE_MEMORY_DEVICE); + hypre_TFree(I0, HYPRE_MEMORY_DEVICE); + hypre_TFree(J0, HYPRE_MEMORY_DEVICE); + hypre_TFree(A0, HYPRE_MEMORY_DEVICE); + + J = hypre_TReAlloc_v2(J, HYPRE_Int, N0, HYPRE_Int, N, HYPRE_MEMORY_DEVICE); + A = hypre_TReAlloc_v2(A, HYPRE_Complex, N0, HYPRE_Complex, N, HYPRE_MEMORY_DEVICE); + + *Nptr = N; + *Iptr = I; + *Jptr = J; + *Aptr = A; + + hypre_TFree(X, HYPRE_MEMORY_DEVICE); return hypre_error_flag; } +/* This is used on off-proc entries before sending them to other procs + * 1. StableSort + * 2. Zero out all prior to the last set (including the set itself), since off-proc set is not allowed + * 3. Reduce A (sum) by key (I, J). + * 4. Remove numerical zeros + * On return: + * N1 is the new length of I, J and A + * Content of X0 is destroyed + * Note: + * No need to reduce X, since all should be add + */ HYPRE_Int -hypre_IJMatrixAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_BigInt *J0, char *X0, +hypre_IJMatrixAssembleSortAndReduce3(HYPRE_Int N0, + HYPRE_BigInt *I0, + HYPRE_BigInt *J0, + char *X0, HYPRE_Complex *A0, - HYPRE_Int *N1) + HYPRE_Int *N1) { hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, 0); @@ -537,70 +672,20 @@ hypre_IJMatrixAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big return hypre_error_flag; } -#if 0 HYPRE_Int -hypre_IJMatrixAssembleSortAndRemove(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_BigInt *J0, char *X0, - HYPRE_Complex *A0) +hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, + HYPRE_Int reduce_stack_size) { - hypreDevice_StableSortTupleByTupleKey(N0, I0, J0, X0, A0, 0); - - /* output in X0: 0: keep, 1: remove */ - HYPRE_THRUST_CALL( - inclusive_scan_by_key, - make_reverse_iterator(thrust::make_zip_iterator(thrust::make_tuple(I0 + N0, J0 + N0))), - make_reverse_iterator(thrust::make_zip_iterator(thrust::make_tuple(I0, J0))), - make_reverse_iterator(thrust::device_pointer_cast(X0) + N0), - make_reverse_iterator(thrust::device_pointer_cast(X0) + N0), - thrust::equal_to< thrust::tuple >(), - thrust::maximum() ); - - auto new_end = HYPRE_THRUST_CALL( - remove_if, - thrust::make_zip_iterator(thrust::make_tuple(I0, J0, A0)), - thrust::make_zip_iterator(thrust::make_tuple(I0 + N0, J0 + N0, A0 + N0)), - X0, - thrust::identity()); - - HYPRE_Int N1 = thrust::get<0>(new_end.get_iterator_tuple()) - I0; - - hypre_assert(N1 >= 0 && N1 <= N0); - - return N1; -} -#endif - -HYPRE_Int -hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) -{ - MPI_Comm comm = hypre_IJMatrixComm(matrix); - HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); - HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); - HYPRE_BigInt row_start = row_partitioning[0]; - HYPRE_BigInt row_end = row_partitioning[1]; - HYPRE_BigInt col_start = col_partitioning[0]; - HYPRE_BigInt col_end = col_partitioning[1]; - HYPRE_BigInt col_first = hypre_IJMatrixGlobalFirstCol(matrix); - HYPRE_Int nrows = row_end - row_start; - HYPRE_Int ncols = col_end - col_start; - - hypre_ParCSRMatrix *par_matrix = (hypre_ParCSRMatrix*) hypre_IJMatrixObject(matrix); - hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix*) hypre_IJMatrixTranslator(matrix); - - if (!aux_matrix) - { - return hypre_error_flag; - } - - if (!par_matrix) - { - return hypre_error_flag; - } - - HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); - HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); - HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); - char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + MPI_Comm comm = hypre_IJMatrixComm(matrix); + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt row_start = row_partitioning[0]; + HYPRE_BigInt row_end = row_partitioning[1]; + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix*) hypre_IJMatrixTranslator(matrix); + HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); in_range pred(row_start, row_end - 1); #if defined(HYPRE_USING_SYCL) @@ -657,7 +742,7 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) stack_sora )), /* first */ thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ + is_on_proc, /* stencil */ thrust::make_zip_iterator(thrust::make_tuple(off_proc_i, off_proc_j, off_proc_data, off_proc_sora)), /* result */ thrust::not1(thrust::identity()) ); @@ -671,7 +756,7 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) stack_sora )), /* first */ thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ + is_on_proc, /* stencil */ thrust::not1(thrust::identity()) ); hypre_assert(thrust::get<0>(new_end2.get_iterator_tuple()) - stack_i == nelms_on); @@ -681,10 +766,8 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypre_TFree(is_on_proc, HYPRE_MEMORY_DEVICE); - /* sort and reduce */ hypre_IJMatrixAssembleSortAndReduce3(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data, &new_nnz); - // new_nnz = hypre_IJMatrixAssembleSortAndRemove(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data); hypre_TFree(off_proc_sora, HYPRE_MEMORY_DEVICE); } @@ -706,8 +789,8 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); -#ifdef HYPRE_DEBUG /* the stack should only have on-proc elements now */ +#if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_SYCL) HYPRE_Int tmp = HYPRE_ONEDPL_CALL(std::count_if, stack_i, stack_i + nelms, pred); #else @@ -718,52 +801,96 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) if (nelms) { - HYPRE_Int new_nnz; - HYPRE_BigInt *new_i; - HYPRE_BigInt *new_j; - HYPRE_Complex *new_data; - char *new_sora; + if (reduce_stack_size) + { + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix) = nelms; + } + + hypre_IJMatrixAssembleSortAndReduce1(&nelms, &stack_i, &stack_j, &stack_sora, &stack_data, + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix)); + + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = nelms; + hypre_AuxParCSRMatrixStackI(aux_matrix) = stack_i; + hypre_AuxParCSRMatrixStackJ(aux_matrix) = stack_j; + hypre_AuxParCSRMatrixStackData(aux_matrix) = stack_data; + hypre_AuxParCSRMatrixStackSorA(aux_matrix) = stack_sora; + } + + return hypre_error_flag; +} + +HYPRE_Int +hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) +{ + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + HYPRE_BigInt row_start = row_partitioning[0]; + HYPRE_BigInt row_end = row_partitioning[1]; + HYPRE_BigInt col_start = col_partitioning[0]; + HYPRE_BigInt col_end = col_partitioning[1]; + HYPRE_BigInt col_first = hypre_IJMatrixGlobalFirstCol(matrix); + HYPRE_Int nrows = row_end - row_start; + HYPRE_Int ncols = col_end - col_start; + hypre_ParCSRMatrix *par_matrix = (hypre_ParCSRMatrix*) hypre_IJMatrixObject(matrix); + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix*) hypre_IJMatrixTranslator(matrix); + + if (!aux_matrix) + { + return hypre_error_flag; + } + + if (!par_matrix) + { + return hypre_error_flag; + } + + hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 1); + + hypre_AuxParCSRMatrixStackPrintInfo(matrix); - /* sort and reduce */ - hypre_IJMatrixAssembleSortAndReduce1(nelms, stack_i, stack_j, stack_sora, stack_data, - &new_nnz, &new_i, &new_j, &new_sora, &new_data); + HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + if (nelms) + { /* adjust row indices from global to local */ - HYPRE_Int *new_i_local = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *stack_i_local = hypre_TAlloc(HYPRE_Int, nelms, HYPRE_MEMORY_DEVICE); + #if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( std::transform, - new_i, - new_i + new_nnz, - new_i_local, + stack_i, + stack_i + nelms, + stack_i_local, [row_start = row_start] (const auto & x) {return x - row_start;} ); #else HYPRE_THRUST_CALL( transform, - new_i, - new_i + new_nnz, - new_i_local, + stack_i, + stack_i + nelms, + stack_i_local, _1 - row_start ); #endif - /* adjust col indices wrt the global first index */ + /* adjust column indices wrt the global first index */ if (col_first) { #if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( std::transform, - new_j, - new_j + new_nnz, - new_j, + stack_j, + stack_j + nelms, + stack_j, [col_first = col_first] (const auto & x) {return x - col_first;} ); #else HYPRE_THRUST_CALL( transform, - new_j, - new_j + new_nnz, - new_j, + stack_j, + stack_j + nelms, + stack_j, _1 - col_first ); #endif } - hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); - HYPRE_Int num_cols_offd_new; HYPRE_BigInt *col_map_offd_new; HYPRE_Int *col_map_offd_map; @@ -783,9 +910,9 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypre_CSRMatrixSplitDevice_core( 0, nrows, - new_nnz, + nelms, NULL, - new_j, + stack_j, NULL, NULL, col_start - col_first, @@ -831,11 +958,11 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) /* split IJ into diag and offd */ hypre_CSRMatrixSplitDevice_core( 1, nrows, - new_nnz, - new_i_local, - new_j, - new_data, - diag_nnz_existed || offd_nnz_existed ? new_sora : NULL, + nelms, + stack_i_local, + stack_j, + stack_data, + diag_nnz_existed || offd_nnz_existed ? stack_sora : NULL, col_start - col_first, col_end - col_first - 1, hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), @@ -854,19 +981,13 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) offd_a_new + offd_nnz_existed, offd_nnz_existed ? offd_sora_new + offd_nnz_existed : NULL ); - hypre_TFree(new_i_local, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_sora, HYPRE_MEMORY_DEVICE); - - HYPRE_Int nnz_new; - HYPRE_Int *tmp_i; - HYPRE_Int *tmp_j; - HYPRE_Complex *tmp_a; + hypre_TFree(stack_i_local, HYPRE_MEMORY_DEVICE); - /* expand the existing diag/offd and compress with the new one */ + /* expand the existing Parcsr's diag/offd and compress with the stack */ if (diag_nnz_new > 0) { + HYPRE_Int diag_nnz = diag_nnz_existed + diag_nnz_new; + if (diag_nnz_existed) { /* the existing parcsr should come first and the entries are "add" */ @@ -881,41 +1002,25 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypreDevice_CharFilln(diag_sora_new, diag_nnz_existed, 0); - hypre_IJMatrixAssembleSortAndReduce2(diag_nnz_existed + diag_nnz_new, diag_i_new, diag_j_new, - diag_sora_new, diag_a_new, - &nnz_new, &tmp_i, &tmp_j, &tmp_a, 2); - - hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_j_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_sora_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_a_new, HYPRE_MEMORY_DEVICE); - - tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_Int, - nnz_new, HYPRE_MEMORY_DEVICE); - tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, diag_nnz_existed + diag_nnz_new, HYPRE_Complex, - nnz_new, HYPRE_MEMORY_DEVICE); - - diag_nnz_new = nnz_new; - diag_i_new = tmp_i; - diag_j_new = tmp_j; - diag_a_new = tmp_a; + hypre_IJMatrixAssembleSortAndReduce2(&diag_nnz, &diag_i_new, &diag_j_new, diag_sora_new, &diag_a_new, 2); } - hypre_CSRMatrix *diag = hypre_CSRMatrixCreate(nrows, ncols, diag_nnz_new); - hypre_CSRMatrixI(diag) = hypreDevice_CsrRowIndicesToPtrs(nrows, diag_nnz_new, - diag_i_new); - hypre_CSRMatrixJ(diag) = diag_j_new; - hypre_CSRMatrixData(diag) = diag_a_new; + hypre_CSRMatrix *diag = hypre_CSRMatrixCreate(nrows, ncols, diag_nnz); + hypre_CSRMatrixI(diag) = hypreDevice_CsrRowIndicesToPtrs(nrows, diag_nnz, diag_i_new); + hypre_CSRMatrixJ(diag) = diag_j_new; + hypre_CSRMatrixData(diag) = diag_a_new; hypre_CSRMatrixMemoryLocation(diag) = HYPRE_MEMORY_DEVICE; hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); - + hypre_TFree(diag_sora_new, HYPRE_MEMORY_DEVICE); hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(par_matrix)); hypre_ParCSRMatrixDiag(par_matrix) = diag; } if (offd_nnz_new > 0) { + HYPRE_Int offd_nnz = offd_nnz_existed + offd_nnz_new; + if (offd_nnz_existed) { /* the existing parcsr should come first and the entries are "add" */ @@ -941,35 +1046,17 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypreDevice_CharFilln(offd_sora_new, offd_nnz_existed, 0); - hypre_IJMatrixAssembleSortAndReduce2(offd_nnz_existed + offd_nnz_new, offd_i_new, offd_j_new, - offd_sora_new, offd_a_new, - &nnz_new, &tmp_i, &tmp_j, &tmp_a, 0); - - hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_j_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_sora_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_a_new, HYPRE_MEMORY_DEVICE); - - tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_Int, - nnz_new, HYPRE_MEMORY_DEVICE); - tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, offd_nnz_existed + offd_nnz_new, HYPRE_Complex, - nnz_new, HYPRE_MEMORY_DEVICE); - - offd_nnz_new = nnz_new; - offd_i_new = tmp_i; - offd_j_new = tmp_j; - offd_a_new = tmp_a; + hypre_IJMatrixAssembleSortAndReduce2(&offd_nnz, &offd_i_new, &offd_j_new, offd_sora_new, &offd_a_new, 0); } - hypre_CSRMatrix *offd = hypre_CSRMatrixCreate(nrows, num_cols_offd_new, offd_nnz_new); - hypre_CSRMatrixI(offd) = hypreDevice_CsrRowIndicesToPtrs(nrows, offd_nnz_new, - offd_i_new); - hypre_CSRMatrixJ(offd) = offd_j_new; - hypre_CSRMatrixData(offd) = offd_a_new; + hypre_CSRMatrix *offd = hypre_CSRMatrixCreate(nrows, num_cols_offd_new, offd_nnz); + hypre_CSRMatrixI(offd) = hypreDevice_CsrRowIndicesToPtrs(nrows, offd_nnz, offd_i_new); + hypre_CSRMatrixJ(offd) = offd_j_new; + hypre_CSRMatrixData(offd) = offd_a_new; hypre_CSRMatrixMemoryLocation(offd) = HYPRE_MEMORY_DEVICE; hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); - + hypre_TFree(offd_sora_new, HYPRE_MEMORY_DEVICE); hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(par_matrix)); hypre_ParCSRMatrixOffd(par_matrix) = offd; @@ -977,11 +1064,9 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypre_ParCSRMatrixDeviceColMapOffd(par_matrix) = col_map_offd_new; hypre_TFree(hypre_ParCSRMatrixColMapOffd(par_matrix), HYPRE_MEMORY_HOST); - hypre_ParCSRMatrixColMapOffd(par_matrix) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_new, - HYPRE_MEMORY_HOST); + hypre_ParCSRMatrixColMapOffd(par_matrix) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_new, HYPRE_MEMORY_HOST); hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(par_matrix), col_map_offd_new, HYPRE_BigInt, - num_cols_offd_new, - HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + num_cols_offd_new, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); col_map_offd_new = NULL; } From 7706f5a259886d0b32e68ada21c2e4d0651141a2 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Thu, 2 May 2024 12:20:07 -0700 Subject: [PATCH 09/27] remove the shrink parameter --- src/IJ_mv/HYPRE_IJMatrix.c | 28 ---------------------------- src/IJ_mv/HYPRE_IJ_mv.h | 3 --- src/IJ_mv/IJMatrix_parcsr.c | 28 ---------------------------- src/IJ_mv/IJMatrix_parcsr_device.c | 5 ----- src/IJ_mv/_hypre_IJ_mv.h | 4 ---- src/IJ_mv/aux_parcsr_matrix.c | 1 - src/IJ_mv/aux_parcsr_matrix.h | 2 -- src/IJ_mv/protos.h | 2 -- src/test/ij_assembly.c | 27 ++++++++------------------- 9 files changed, 8 insertions(+), 92 deletions(-) diff --git a/src/IJ_mv/HYPRE_IJMatrix.c b/src/IJ_mv/HYPRE_IJMatrix.c index 1dcd049d31..375ba551fd 100644 --- a/src/IJ_mv/HYPRE_IJMatrix.c +++ b/src/IJ_mv/HYPRE_IJMatrix.c @@ -1093,34 +1093,6 @@ HYPRE_IJMatrixSetGrowFactor(hypre_IJMatrix *matrix, return hypre_error_flag; } -/*-------------------------------------------------------------------------- - *--------------------------------------------------------------------------*/ - -HYPRE_Int -HYPRE_IJMatrixSetShrinkThreshold(hypre_IJMatrix *matrix, - HYPRE_Real threshold) -{ - hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; - - if (!ijmatrix) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } - - if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) - { - return ( hypre_IJMatrixSetShrinkThresholdParCSR(ijmatrix, - threshold) ); - } - else - { - hypre_error_in_arg(1); - } - - return hypre_error_flag; -} - /*-------------------------------------------------------------------------- * HYPRE_IJMatrixRead * diff --git a/src/IJ_mv/HYPRE_IJ_mv.h b/src/IJ_mv/HYPRE_IJ_mv.h index ddc947fe91..9c80244294 100644 --- a/src/IJ_mv/HYPRE_IJ_mv.h +++ b/src/IJ_mv/HYPRE_IJ_mv.h @@ -304,9 +304,6 @@ HYPRE_Int HYPRE_IJMatrixSetEarlyAssemble(HYPRE_IJMatrix matrix, HYPRE_Int HYPRE_IJMatrixSetGrowFactor(HYPRE_IJMatrix matrix, HYPRE_Real factor); -HYPRE_Int HYPRE_IJMatrixSetShrinkThreshold(HYPRE_IJMatrix matrix, - HYPRE_Real threshold); - /** * (Optional) Sets the print level, if the user wants to print * error messages. The default is 0, i.e. no error messages are printed. diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 055f39aaa0..d50af0b4b6 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -326,34 +326,6 @@ hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, return hypre_error_flag; } -/****************************************************************************** - * - * hypre_IJMatrixSetShrinkThresholdParCSR - * - *****************************************************************************/ - -HYPRE_Int -hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, - HYPRE_Real threshold) -{ -#if defined(HYPRE_USING_GPU) - hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); - HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); - HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); - - if (!aux_matrix) - { - HYPRE_Int local_num_rows = (HYPRE_Int)(row_partitioning[1] - row_partitioning[0]); - HYPRE_Int local_num_cols = (HYPRE_Int)(col_partitioning[1] - col_partitioning[0]); - hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, local_num_cols, NULL); - hypre_IJMatrixTranslator(matrix) = aux_matrix; - } - hypre_AuxParCSRMatrixShrinkThreshold(aux_matrix) = threshold; -#endif - - return hypre_error_flag; -} - /****************************************************************************** * * hypre_IJMatrixInitializeParCSR diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index de211c0a59..2283b4d9bf 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -274,11 +274,6 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_BigInt stack_elmts_max_new = hypre_max(stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), stack_elmts_max); - if (stack_elmts_current < stack_elmts_max_new * hypre_AuxParCSRMatrixShrinkThreshold(aux_matrix)) - { - stack_elmts_max_new = stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix); - } - if (stack_elmts_max_new != stack_elmts_max) { hypre_AuxParCSRMatrixStackReallocate(aux_matrix, stack_elmts_max_new); diff --git a/src/IJ_mv/_hypre_IJ_mv.h b/src/IJ_mv/_hypre_IJ_mv.h index 9ac09e7bb6..e11b7c35ee 100644 --- a/src/IJ_mv/_hypre_IJ_mv.h +++ b/src/IJ_mv/_hypre_IJ_mv.h @@ -92,7 +92,6 @@ typedef struct HYPRE_Int early_assemble; HYPRE_Int init_alloc_factor; HYPRE_Real grow_factor; - HYPRE_Real shrink_threshold; #endif } hypre_AuxParCSRMatrix; @@ -138,7 +137,6 @@ typedef struct #define hypre_AuxParCSRMatrixEarlyAssemble(matrix) ((matrix) -> early_assemble) #define hypre_AuxParCSRMatrixInitAllocFactor(matrix) ((matrix) -> init_alloc_factor) #define hypre_AuxParCSRMatrixGrowFactor(matrix) ((matrix) -> grow_factor) -#define hypre_AuxParCSRMatrixShrinkThreshold(matrix) ((matrix) -> shrink_threshold) #endif #endif /* #ifndef hypre_AUX_PARCSR_MATRIX_HEADER */ @@ -447,8 +445,6 @@ HYPRE_Int hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, HYPRE_Int early_assemble); HYPRE_Int hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, HYPRE_Real factor); -HYPRE_Int hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, - HYPRE_Real threshold); HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); diff --git a/src/IJ_mv/aux_parcsr_matrix.c b/src/IJ_mv/aux_parcsr_matrix.c index cdb21f748f..76411ad708 100644 --- a/src/IJ_mv/aux_parcsr_matrix.c +++ b/src/IJ_mv/aux_parcsr_matrix.c @@ -64,7 +64,6 @@ hypre_AuxParCSRMatrixCreate( hypre_AuxParCSRMatrix **aux_matrix, hypre_AuxParCSRMatrixInitAllocFactor(matrix) = 0; hypre_AuxParCSRMatrixEarlyAssemble(matrix) = 0; hypre_AuxParCSRMatrixGrowFactor(matrix) = 2.0; - hypre_AuxParCSRMatrixShrinkThreshold(matrix) = 0.25; #endif *aux_matrix = matrix; diff --git a/src/IJ_mv/aux_parcsr_matrix.h b/src/IJ_mv/aux_parcsr_matrix.h index a86bfffc40..0460982887 100644 --- a/src/IJ_mv/aux_parcsr_matrix.h +++ b/src/IJ_mv/aux_parcsr_matrix.h @@ -77,7 +77,6 @@ typedef struct HYPRE_Int early_assemble; HYPRE_Int init_alloc_factor; HYPRE_Real grow_factor; - HYPRE_Real shrink_threshold; #endif } hypre_AuxParCSRMatrix; @@ -123,7 +122,6 @@ typedef struct #define hypre_AuxParCSRMatrixEarlyAssemble(matrix) ((matrix) -> early_assemble) #define hypre_AuxParCSRMatrixInitAllocFactor(matrix) ((matrix) -> init_alloc_factor) #define hypre_AuxParCSRMatrixGrowFactor(matrix) ((matrix) -> grow_factor) -#define hypre_AuxParCSRMatrixShrinkThreshold(matrix) ((matrix) -> shrink_threshold) #endif #endif /* #ifndef hypre_AUX_PARCSR_MATRIX_HEADER */ diff --git a/src/IJ_mv/protos.h b/src/IJ_mv/protos.h index 502b753ce7..106385bec1 100644 --- a/src/IJ_mv/protos.h +++ b/src/IJ_mv/protos.h @@ -73,8 +73,6 @@ HYPRE_Int hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, HYPRE_Int early_assemble); HYPRE_Int hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, HYPRE_Real factor); -HYPRE_Int hypre_IJMatrixSetShrinkThresholdParCSR(hypre_IJMatrix *matrix, - HYPRE_Real threshold); HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index f3e58f8725..cbee8783e5 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -36,7 +36,7 @@ HYPRE_Int getParCSRMatrixData(HYPRE_ParCSRMatrix A, HYPRE_Int base, HYPRE_Int * HYPRE_Real checkMatrix(HYPRE_ParCSRMatrix parcsr_ref, HYPRE_IJMatrix ij_A); -HYPRE_Int test_all(MPI_Comm comm, char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, HYPRE_Real shrink_threshold, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); +HYPRE_Int test_all(MPI_Comm comm, char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); hypre_int main( hypre_int argc, @@ -77,7 +77,6 @@ main( hypre_int argc, HYPRE_Int init_alloc = -1; HYPRE_Int early_assemble = 0; HYPRE_Real grow_factor = -1.0; - HYPRE_Real shrink_threshold = -1.0; /* Initialize MPI */ hypre_MPI_Init(&argc, &argv); @@ -205,11 +204,6 @@ main( hypre_int argc, arg_index++; grow_factor = (HYPRE_Real) atof(argv[arg_index++]); } - else if ( strcmp(argv[arg_index], "-shrink") == 0 ) - { - arg_index++; - shrink_threshold = (HYPRE_Real) atof(argv[arg_index++]); - } else if ( strcmp(argv[arg_index], "-print") == 0 ) { arg_index++; @@ -354,7 +348,7 @@ main( hypre_int argc, if (mode & 1) { test_all(comm, "set", memory_location, option, "sA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -371,7 +365,7 @@ main( hypre_int argc, if (mode & 2) { test_all(comm, "addtrans", memory_location, 2, "aA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, cols, rows_coo, coefs, &ij_AT); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, cols, rows_coo, coefs, &ij_AT); hypre_ParCSRMatrixTranspose(parcsr_ref, &parcsr_trans, 1); @@ -388,7 +382,7 @@ main( hypre_int argc, if (mode & 4) { test_all(comm, "set/set", memory_location, option, "ssA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -402,7 +396,7 @@ main( hypre_int argc, if (mode & 8) { test_all(comm, "add/set", memory_location, option, "asA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -416,7 +410,7 @@ main( hypre_int argc, if (mode & 16) { test_all(comm, "set/add", memory_location, option, "saA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1); hypre_ParCSRMatrixScale(parcsr_ref2, 2.0); @@ -434,7 +428,7 @@ main( hypre_int argc, if (mode & 32) { test_all(comm, "set/add/assemble/set", memory_location, option, "saAsA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); ierr += checkMatrix(parcsr_ref, ij_A) > tol; if (print_matrix) @@ -448,7 +442,7 @@ main( hypre_int argc, if (mode & 64) { test_all(comm, "5adds/set", memory_location, option, "aaaaasA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, - nchunks, init_alloc, early_assemble, grow_factor, shrink_threshold, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); + nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, option == 1 ? rows : rows_coo, cols, coefs, &ij_A); hypre_ParCSRMatrix *parcsr_ref2 = hypre_ParCSRMatrixClone(parcsr_ref, 1); hypre_ParCSRMatrixScale(parcsr_ref2, 1.); @@ -753,7 +747,6 @@ test_all(MPI_Comm comm, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, - HYPRE_Real shrink_threshold, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, @@ -786,10 +779,6 @@ test_all(MPI_Comm comm, { HYPRE_IJMatrixSetGrowFactor(ij_A, grow_factor); } - if (shrink_threshold >= 0) - { - HYPRE_IJMatrixSetShrinkThreshold(ij_A, shrink_threshold); - } chunk_size = nrows / nchunks; From a32df30b122f9ecfe1c54002ac9f9c8e2648dc1c Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Mon, 17 Jun 2024 08:47:23 -0700 Subject: [PATCH 10/27] update assembly driver --- src/test/ij_assembly.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index cbee8783e5..004e928c92 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -70,7 +70,7 @@ main( hypre_int argc, HYPRE_Real cx, cy, cz; HYPRE_Int nchunks; HYPRE_Int mode, ierr = 0; - HYPRE_Real tol = 0.; + HYPRE_Real tol = HYPRE_REAL_EPSILON; HYPRE_Int option, base; HYPRE_Int stencil; HYPRE_Int print_matrix; @@ -364,10 +364,11 @@ main( hypre_int argc, */ if (mode & 2) { - test_all(comm, "addtrans", memory_location, 2, "aA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, + test_all(comm, "addtrans", memory_location, 2, "aaaaaA", ilower, iupper, jlower, jupper, nrows, num_nonzeros, nchunks, init_alloc, early_assemble, grow_factor, h_nnzrow, nnzrow, cols, rows_coo, coefs, &ij_AT); hypre_ParCSRMatrixTranspose(parcsr_ref, &parcsr_trans, 1); + hypre_ParCSRMatrixScale(parcsr_trans, 5.0); ierr += checkMatrix(parcsr_trans, ij_AT) > tol; if (print_matrix) @@ -708,20 +709,22 @@ checkMatrix(HYPRE_ParCSRMatrix h_parcsr_ref, HYPRE_IJMatrix ij_A) HYPRE_ParCSRMatrix parcsr_A = (HYPRE_ParCSRMatrix) hypre_IJMatrixObject(ij_A); HYPRE_ParCSRMatrix h_parcsr_A; HYPRE_ParCSRMatrix parcsr_error; - HYPRE_Real fnorm; + HYPRE_Real fnorm_err, fnorm_ref, rel_err; h_parcsr_A = hypre_ParCSRMatrixClone_v2(parcsr_A, 1, HYPRE_MEMORY_HOST); // Check norm of (parcsr_ref - parcsr_A) hypre_ParCSRMatrixAdd(1.0, h_parcsr_ref, -1.0, h_parcsr_A, &parcsr_error); - fnorm = hypre_ParCSRMatrixFnorm(parcsr_error); + fnorm_err = hypre_ParCSRMatrixFnorm(parcsr_error); + fnorm_ref = hypre_ParCSRMatrixFnorm(h_parcsr_ref); + rel_err = fnorm_err / fnorm_ref; - hypre_ParPrintf(comm, "Frobenius norm of (A_ref - A): %e\n", fnorm); + hypre_ParPrintf(comm, "||A_ref - A||_F / ||A_ref||_F: %e\n", rel_err); HYPRE_ParCSRMatrixDestroy(h_parcsr_A); HYPRE_ParCSRMatrixDestroy(parcsr_error); - return fnorm; + return rel_err; } /* ---------------------------------- * From d5ce23121b8d2a89d565fb44205e3f2a844584c6 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Mon, 17 Jun 2024 16:22:26 -0700 Subject: [PATCH 11/27] update device add values --- src/IJ_mv/IJMatrix_parcsr_device.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 2283b4d9bf..c74bbb1758 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -64,17 +64,13 @@ hypre_AuxParCSRMatrixStackReallocate(hypre_AuxParCSRMatrix *aux_matrix, inline void hypre_AuxParCSRMatrixStackPrintInfo(hypre_IJMatrix *matrix) { - HYPRE_Int myid; static HYPRE_Int counter = 0; - hypre_MPI_Comm_rank(hypre_IJMatrixComm(matrix), &myid ); hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); - counter ++; - //hypre_printf(" IJMatrixSetAddValues: PID %d: max %d, size %d\n", myid, - //hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix)), hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix)); - hypre_printf(" %d, %d, %d\n", counter, - hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix), - hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix)); + hypre_ParPrintf(hypre_IJMatrixComm(matrix), + " %d, %d, %d\n", ++counter, + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix), + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix)); } /* E.g. nrows = 3 @@ -110,7 +106,6 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Int nelms; HYPRE_Int *row_ptr = NULL; - HYPRE_Int early_assemble = hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix); HYPRE_Int early_assemble_flag = 0; /* expand rows into full expansion of rows based on ncols @@ -150,6 +145,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + HYPRE_Int early_assemble = hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix); if ( stack_elmts_max < stack_elmts_required ) { @@ -173,7 +169,10 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, if (early_assemble) { stack_elmts_max_new = stack_elmts_required; - early_assemble_flag = 1; + if (early_assemble == 1) + { + early_assemble_flag = 1; + } } else { @@ -263,16 +262,21 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, stack_elmts_current += (HYPRE_BigInt) nelms; hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = stack_elmts_current; - hypre_AuxParCSRMatrixStackPrintInfo(matrix); + /* for debug */ + // hypre_AuxParCSRMatrixStackPrintInfo(matrix); if (early_assemble_flag) { + /* temporarily disable early assembly in the next line */ + hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = 2; hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 0); + /* resore early assembly */ + hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; stack_elmts_current = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - hypre_assert(stack_elmts_max == hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix)); + stack_elmts_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); - HYPRE_BigInt stack_elmts_max_new = hypre_max(stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), stack_elmts_max); + HYPRE_BigInt stack_elmts_max_new = stack_elmts_current * hypre_AuxParCSRMatrixGrowFactor(aux_matrix); if (stack_elmts_max_new != stack_elmts_max) { From e7344841f4f14445089e1bc786f85c999b82685c Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Tue, 18 Jun 2024 09:20:27 -0700 Subject: [PATCH 12/27] minor change --- src/test/ij_assembly.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index 004e928c92..8ba282eee4 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -16,6 +16,7 @@ #include "_hypre_IJ_mv.h" #include "_hypre_parcsr_mv.h" #include "HYPRE_parcsr_ls.h" +#include "_hypre_utilities.h" //#include "_hypre_utilities.hpp" HYPRE_Int buildMatrixEntries(MPI_Comm comm, @@ -258,7 +259,9 @@ main( hypre_int argc, return (0); } +#if defined(HYPRE_USING_MEMORY_TRACKER) hypre_MemoryTrackerSetPrint(1); +#endif /*----------------------------------------------------------- * Print driver parameters From 9bb95a09a80e9bf55293a3dcbedef785644bbcaf Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 7 Aug 2024 18:05:48 -0700 Subject: [PATCH 13/27] minor change --- src/IJ_mv/IJMatrix_parcsr_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index c74bbb1758..859c73c62b 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -270,7 +270,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, /* temporarily disable early assembly in the next line */ hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = 2; hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 0); - /* resore early assembly */ + /* restore early assembly */ hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; stack_elmts_current = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); @@ -283,7 +283,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, hypre_AuxParCSRMatrixStackReallocate(aux_matrix, stack_elmts_max_new); } - hypre_AuxParCSRMatrixStackPrintInfo(matrix); + //hypre_AuxParCSRMatrixStackPrintInfo(matrix); } hypre_TFree(row_ptr, HYPRE_MEMORY_DEVICE); From 9b89f6c7a092f4567509d24216d01e7632693116 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 7 Aug 2024 18:06:41 -0700 Subject: [PATCH 14/27] add sstruct interface --- src/sstruct_mv/HYPRE_sstruct_matrix.c | 14 ++++++++++++++ src/sstruct_mv/HYPRE_sstruct_mv.h | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/src/sstruct_mv/HYPRE_sstruct_matrix.c b/src/sstruct_mv/HYPRE_sstruct_matrix.c index ac3f8273d9..5a3c089e0f 100644 --- a/src/sstruct_mv/HYPRE_sstruct_matrix.c +++ b/src/sstruct_mv/HYPRE_sstruct_matrix.c @@ -323,6 +323,20 @@ HYPRE_SStructMatrixInitialize( HYPRE_SStructMatrix matrix ) return hypre_error_flag; } +HYPRE_Int +HYPRE_SStructMatrixSetEarlyAssembly( HYPRE_SStructMatrix matrix, + HYPRE_Int early_assemble ) +{ + HYPRE_IJMatrix ijmatrix = hypre_SStructMatrixIJMatrix(matrix); + + if (ijmatrix) + { + HYPRE_IJMatrixSetEarlyAssemble(ijmatrix, early_assemble); + } + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ diff --git a/src/sstruct_mv/HYPRE_sstruct_mv.h b/src/sstruct_mv/HYPRE_sstruct_mv.h index b973da198e..de2165e679 100644 --- a/src/sstruct_mv/HYPRE_sstruct_mv.h +++ b/src/sstruct_mv/HYPRE_sstruct_mv.h @@ -511,6 +511,10 @@ HYPRE_SStructMatrixDestroy(HYPRE_SStructMatrix matrix); HYPRE_Int HYPRE_SStructMatrixInitialize(HYPRE_SStructMatrix matrix); +HYPRE_Int +HYPRE_SStructMatrixSetEarlyAssembly( HYPRE_SStructMatrix matrix, + HYPRE_Int early_assemble ); + /** * Set matrix coefficients index by index. The \e values array is of length * \e nentries. From 90043a0e17ffd0bab09f8fbe96283efa4874c3cf Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 7 Aug 2024 18:09:55 -0700 Subject: [PATCH 15/27] comment print --- src/IJ_mv/IJMatrix_parcsr_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 859c73c62b..c4a369eb01 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -845,7 +845,7 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 1); - hypre_AuxParCSRMatrixStackPrintInfo(matrix); + // hypre_AuxParCSRMatrixStackPrintInfo(matrix); HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); From b41bab8fa383abb1a4022205ee3bb644d4fa8363 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Tue, 13 Aug 2024 09:05:41 -0700 Subject: [PATCH 16/27] add API call in test/sstruct.c --- src/test/sstruct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/sstruct.c b/src/test/sstruct.c index 840f837c4e..a8f4eb166a 100644 --- a/src/test/sstruct.c +++ b/src/test/sstruct.c @@ -3220,6 +3220,8 @@ main( hypre_int argc, HYPRE_SStructMatrixInitialize(A); + HYPRE_SStructMatrixSetEarlyAssembly(A, 1); + if (data.nstencils > 0) { /* StencilSetEntry: set stencil values */ From 808b068e739ead32636ea0e9ac94c6f148e4cb39 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 25 Sep 2024 15:14:14 -0700 Subject: [PATCH 17/27] tmp fix --- src/IJ_mv/IJMatrix_parcsr_device.c | 48 ++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index c4a369eb01..c4332205cc 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -17,6 +17,8 @@ #if defined(HYPRE_USING_GPU) +HYPRE_Int counter = 0; + __global__ void hypreGPUKernel_IJMatrixValues_dev1(hypre_DeviceItem &item, HYPRE_Int n, HYPRE_Int *rowind, HYPRE_Int *row_ptr, @@ -267,9 +269,12 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, if (early_assemble_flag) { + HYPRE_Int myid; + hypre_MPI_Comm_rank(hypre_IJMatrixComm(matrix), &myid ); + printf("Proc %d, %d, Early Assemble\n", myid, counter++); /* temporarily disable early assembly in the next line */ hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = 2; - hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 0); + hypre_IJMatrixAssembleCompressDevice(matrix, 0); /* restore early assembly */ hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; @@ -672,8 +677,7 @@ hypre_IJMatrixAssembleSortAndReduce3(HYPRE_Int N0, } HYPRE_Int -hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, - HYPRE_Int reduce_stack_size) +hypre_IJMatrixAssembleCommunicate(hypre_IJMatrix *matrix) { MPI_Comm comm = hypre_IJMatrixComm(matrix); HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); @@ -779,23 +783,36 @@ hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, hypre_TFree(off_proc_j, HYPRE_MEMORY_DEVICE); hypre_TFree(off_proc_data, HYPRE_MEMORY_DEVICE); } + return hypre_error_flag; +} - /* Note: the stack might have been changed in hypre_IJMatrixAssembleOffProcValsParCSR, - * so must get the size and the pointers again */ - nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); - stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); - stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); - stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); +HYPRE_Int +hypre_IJMatrixAssembleCompressDevice(hypre_IJMatrix *matrix, + HYPRE_Int reduce_stack_size) +{ + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt row_start = row_partitioning[0]; + HYPRE_BigInt row_end = row_partitioning[1]; + hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix*) hypre_IJMatrixTranslator(matrix); + + HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); - /* the stack should only have on-proc elements now */ #if defined(HYPRE_DEBUG) + /* in the final assembly, at this stage, the stack should only have on-proc elements */ + if (reduce_stack_size) + { + in_range pred(row_start, row_end - 1); #if defined(HYPRE_USING_SYCL) - HYPRE_Int tmp = HYPRE_ONEDPL_CALL(std::count_if, stack_i, stack_i + nelms, pred); + HYPRE_Int tmp = HYPRE_ONEDPL_CALL(std::count_if, stack_i, stack_i + nelms, pred); #else - HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i + nelms, pred); + HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i + nelms, pred); #endif - hypre_assert(nelms == tmp); + hypre_assert(nelms == tmp); + } #endif if (nelms) @@ -843,7 +860,8 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) return hypre_error_flag; } - hypre_IJMatrixAssembleCommunicateAndCompressDevice(matrix, 1); + hypre_IJMatrixAssembleCommunicate(matrix); + hypre_IJMatrixAssembleCompressDevice(matrix, 1); // hypre_AuxParCSRMatrixStackPrintInfo(matrix); From d007b83102ad49cb9a625138ff1adffa3879f3a7 Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Wed, 9 Oct 2024 15:29:38 -0700 Subject: [PATCH 18/27] minor changes --- src/IJ_mv/IJMatrix_parcsr_device.c | 7 ------- src/IJ_mv/_hypre_IJ_mv.h | 3 ++- src/IJ_mv/protos.h | 3 ++- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index c4332205cc..be26647d7b 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -269,14 +269,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, if (early_assemble_flag) { - HYPRE_Int myid; - hypre_MPI_Comm_rank(hypre_IJMatrixComm(matrix), &myid ); - printf("Proc %d, %d, Early Assemble\n", myid, counter++); - /* temporarily disable early assembly in the next line */ - hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = 2; hypre_IJMatrixAssembleCompressDevice(matrix, 0); - /* restore early assembly */ - hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; stack_elmts_current = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); stack_elmts_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); diff --git a/src/IJ_mv/_hypre_IJ_mv.h b/src/IJ_mv/_hypre_IJ_mv.h index e11b7c35ee..e37d9438f1 100644 --- a/src/IJ_mv/_hypre_IJ_mv.h +++ b/src/IJ_mv/_hypre_IJ_mv.h @@ -486,7 +486,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_Int hypre_IJMatrixSetConstantValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Complex value ); -HYPRE_Int hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); +HYPRE_Int hypre_IJMatrixAssembleCommunicate(hypre_IJMatrix *matrix); +HYPRE_Int hypre_IJMatrixAssembleCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); /* IJMatrix_petsc.c */ HYPRE_Int hypre_IJMatrixSetLocalSizePETSc ( hypre_IJMatrix *matrix, HYPRE_Int local_m, diff --git a/src/IJ_mv/protos.h b/src/IJ_mv/protos.h index 106385bec1..bd41f13681 100644 --- a/src/IJ_mv/protos.h +++ b/src/IJ_mv/protos.h @@ -114,7 +114,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_Int hypre_IJMatrixSetConstantValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Complex value ); -HYPRE_Int hypre_IJMatrixAssembleCommunicateAndCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); +HYPRE_Int hypre_IJMatrixAssembleCommunicate(hypre_IJMatrix *matrix); +HYPRE_Int hypre_IJMatrixAssembleCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size); /* IJMatrix_petsc.c */ HYPRE_Int hypre_IJMatrixSetLocalSizePETSc ( hypre_IJMatrix *matrix, HYPRE_Int local_m, From f5ff87c1cfae1a285e8059cd69a629671441bc42 Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Wed, 9 Oct 2024 15:30:10 -0700 Subject: [PATCH 19/27] update the driver --- src/test/ij_assembly.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index 8ba282eee4..d6c1efc98b 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -765,6 +765,9 @@ test_all(MPI_Comm comm, HYPRE_Int time_index; HYPRE_Int *h_rowptr = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); HYPRE_Int cmd_len = strlen(cmd_sequence); + HYPRE_Int myid; + + hypre_MPI_Comm_rank(comm, &myid); for (i = 1; i < nrows + 1; i++) { @@ -776,6 +779,7 @@ test_all(MPI_Comm comm, HYPRE_IJMatrixSetObjectType(ij_A, HYPRE_PARCSR); HYPRE_IJMatrixInitialize_v2(ij_A, memory_location); HYPRE_IJMatrixSetOMPFlag(ij_A, 1); + grow_factor= myid ? grow_factor : 2 * grow_factor; if (init_alloc >= 0) { HYPRE_IJMatrixSetInitAllocation(ij_A, init_alloc); From adc831e8df145cbdce6eedd2c2c8e2f22ac89e62 Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Sat, 25 Jan 2025 16:37:31 -0800 Subject: [PATCH 20/27] fix format --- src/test/sstruct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/sstruct.c b/src/test/sstruct.c index f9639f482a..2c5cde7dc9 100644 --- a/src/test/sstruct.c +++ b/src/test/sstruct.c @@ -3234,7 +3234,7 @@ main( hypre_int argc, HYPRE_SStructMatrixInitialize(A); - HYPRE_SStructMatrixSetEarlyAssembly(A, 0); + HYPRE_SStructMatrixSetEarlyAssembly(A, 0); if (data.nstencils > 0) { From d3daa4433927d367ec651827225081dd4c605080 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Mon, 27 Jan 2025 21:51:31 -0800 Subject: [PATCH 21/27] fix lassen tests --- src/IJ_mv/IJMatrix_parcsr_device.c | 17 ++++++++--------- src/test/ij_assembly.c | 8 ++++---- src/test/sstruct.c | 2 -- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 1aa0a0d731..9a2c6a8637 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -750,21 +750,20 @@ HYPRE_Int hypre_IJMatrixAssembleCompressDevice(hypre_IJMatrix *matrix, HYPRE_Int reduce_stack_size) { - HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); - HYPRE_BigInt row_start = row_partitioning[0]; - HYPRE_BigInt row_end = row_partitioning[1]; hypre_AuxParCSRMatrix *aux_matrix = (hypre_AuxParCSRMatrix*) hypre_IJMatrixTranslator(matrix); - - HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); - HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); - HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); - char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + HYPRE_Int nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); #if defined(HYPRE_DEBUG) /* in the final assembly, at this stage, the stack should only have on-proc elements */ if (reduce_stack_size) { + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt row_start = row_partitioning[0]; + HYPRE_BigInt row_end = row_partitioning[1]; in_range pred(row_start, row_end - 1); #if defined(HYPRE_USING_SYCL) HYPRE_Int tmp = HYPRE_ONEDPL_CALL(std::count_if, stack_i, stack_i + nelms, pred); diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index d4e8cbc7aa..4ba9230a78 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -37,7 +37,7 @@ HYPRE_Int getParCSRMatrixData(HYPRE_ParCSRMatrix A, HYPRE_Int base, HYPRE_Int * HYPRE_Real checkMatrix(HYPRE_ParCSRMatrix parcsr_ref, HYPRE_IJMatrix ij_A); -HYPRE_Int test_all(MPI_Comm comm, char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); +HYPRE_Int test_all(MPI_Comm comm, const char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, const char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_Int nrows, HYPRE_BigInt num_nonzeros, HYPRE_Int nchunks, HYPRE_Int init_alloc, HYPRE_Int early_assemble, HYPRE_Real grow_factor, HYPRE_Int *h_nnzrow, HYPRE_Int *nnzrow, HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Real *coefs, HYPRE_IJMatrix *ij_A_ptr); hypre_int main( hypre_int argc, @@ -739,10 +739,10 @@ checkMatrix(HYPRE_ParCSRMatrix h_parcsr_ref, HYPRE_IJMatrix ij_A) /* set values */ HYPRE_Int test_all(MPI_Comm comm, - char *test_name, + const char *test_name, HYPRE_MemoryLocation memory_location, HYPRE_Int option, - char *cmd_sequence, + const char *cmd_sequence, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, @@ -846,7 +846,7 @@ test_all(MPI_Comm comm, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif diff --git a/src/test/sstruct.c b/src/test/sstruct.c index 2c5cde7dc9..475684624d 100644 --- a/src/test/sstruct.c +++ b/src/test/sstruct.c @@ -3234,8 +3234,6 @@ main( hypre_int argc, HYPRE_SStructMatrixInitialize(A); - HYPRE_SStructMatrixSetEarlyAssembly(A, 0); - if (data.nstencils > 0) { /* StencilSetEntry: set stencil values */ From ea43c33e2d10329ee97b8229cc127a73db2550e6 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Mon, 3 Feb 2025 11:39:41 -0800 Subject: [PATCH 22/27] update .jobs/.sh --- src/test/TEST_ij/assembly.jobs | 2 ++ src/test/TEST_ij/assembly.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/src/test/TEST_ij/assembly.jobs b/src/test/TEST_ij/assembly.jobs index 0f2d901794..851fe71db4 100755 --- a/src/test/TEST_ij/assembly.jobs +++ b/src/test/TEST_ij/assembly.jobs @@ -16,3 +16,5 @@ mpirun -np 3 ./ij_assembly > assembly.out.2 mpirun -np 7 ./ij_assembly > assembly.out.3 +mpirun -np 7 ./ij_assembly -early 1 > assembly.out.4 + diff --git a/src/test/TEST_ij/assembly.sh b/src/test/TEST_ij/assembly.sh index 61e8875c6b..ceb39a5485 100755 --- a/src/test/TEST_ij/assembly.sh +++ b/src/test/TEST_ij/assembly.sh @@ -17,6 +17,7 @@ FILES="\ ${TNAME}.out.1\ ${TNAME}.out.2\ ${TNAME}.out.3\ + ${TNAME}.out.4\ " for i in $FILES From 2724033c98746ce11ad7951f289a8d7473914ba5 Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Thu, 20 Feb 2025 16:45:54 -0800 Subject: [PATCH 23/27] Update src/IJ_mv/IJMatrix_parcsr.c Co-authored-by: Victor A. P. Magri <50467563+victorapm@users.noreply.github.com> --- src/IJ_mv/IJMatrix_parcsr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 412b0c11fd..2e1e9a45e3 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -265,6 +265,9 @@ hypre_IJMatrixSetInitAllocationParCSR(hypre_IJMatrix *matrix, hypre_IJMatrixTranslator(matrix) = aux_matrix; } hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix) = factor; +#else + HYPRE_UNUSED_VAR(matrix); + HYPRE_UNUSED_VAR(factor); #endif return hypre_error_flag; From 9fdc40639ceae430881c951aa6bc12eefa1e96bc Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Thu, 20 Feb 2025 16:46:03 -0800 Subject: [PATCH 24/27] Update src/IJ_mv/IJMatrix_parcsr.c Co-authored-by: Victor A. P. Magri <50467563+victorapm@users.noreply.github.com> --- src/IJ_mv/IJMatrix_parcsr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 2e1e9a45e3..8f1e906393 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -296,6 +296,9 @@ hypre_IJMatrixSetEarlyAssembleParCSR(hypre_IJMatrix *matrix, hypre_IJMatrixTranslator(matrix) = aux_matrix; } hypre_AuxParCSRMatrixEarlyAssemble(aux_matrix) = early_assemble; +#else + HYPRE_UNUSED_VAR(matrix); + HYPRE_UNUSED_VAR(early_assemble); #endif return hypre_error_flag; From 71caefe6c9e441a30b885e83513636fe3f85514f Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Thu, 20 Feb 2025 16:46:16 -0800 Subject: [PATCH 25/27] Update src/IJ_mv/IJMatrix_parcsr.c Co-authored-by: Victor A. P. Magri <50467563+victorapm@users.noreply.github.com> --- src/IJ_mv/IJMatrix_parcsr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index 8f1e906393..24edb706c9 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -327,6 +327,9 @@ hypre_IJMatrixSetGrowFactorParCSR(hypre_IJMatrix *matrix, hypre_IJMatrixTranslator(matrix) = aux_matrix; } hypre_AuxParCSRMatrixGrowFactor(aux_matrix) = factor; +#else + HYPRE_UNUSED_VAR(matrix); + HYPRE_UNUSED_VAR(factor); #endif return hypre_error_flag; From 9e60beed1204ceb8f6ca63a65db4b727df0407fb Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Thu, 20 Feb 2025 16:47:36 -0800 Subject: [PATCH 26/27] change a function name; add comments --- src/IJ_mv/HYPRE_IJ_mv.h | 17 +++++++++++++++++ src/sstruct_mv/HYPRE_sstruct_matrix.c | 2 +- src/sstruct_mv/HYPRE_sstruct_mv.h | 9 ++++++++- src/test/sstruct.c | 2 -- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/IJ_mv/HYPRE_IJ_mv.h b/src/IJ_mv/HYPRE_IJ_mv.h index c8e36e7bc1..7466d3c2c4 100644 --- a/src/IJ_mv/HYPRE_IJ_mv.h +++ b/src/IJ_mv/HYPRE_IJ_mv.h @@ -344,12 +344,29 @@ HYPRE_Int HYPRE_IJMatrixSetDiagOffdSizes(HYPRE_IJMatrix matrix, HYPRE_Int HYPRE_IJMatrixSetMaxOffProcElmts(HYPRE_IJMatrix matrix, HYPRE_Int max_off_proc_elmts); +/** + * (Optional, GPU only) Sets the initial memory allocation for matrix + * assemble, which factor * local number of rows + * Not collective. + **/ HYPRE_Int HYPRE_IJMatrixSetInitAllocation(HYPRE_IJMatrix matrix, HYPRE_Int factor); +/** + * (Optional, GPU only) Sets if matrix assemble routine does reductions + * during the accumulation of the entries before calling HYPRE_IJMatrixAssemble. + * This early assemble feature may save the peak memory usage but requires + * extra work. + * Not collective. + **/ HYPRE_Int HYPRE_IJMatrixSetEarlyAssemble(HYPRE_IJMatrix matrix, HYPRE_Int early_assemble); +/** + * (Optional, GPU only) Sets the grow factor of memory in matrix assemble when + * running out of memory. + * Not collective. + **/ HYPRE_Int HYPRE_IJMatrixSetGrowFactor(HYPRE_IJMatrix matrix, HYPRE_Real factor); diff --git a/src/sstruct_mv/HYPRE_sstruct_matrix.c b/src/sstruct_mv/HYPRE_sstruct_matrix.c index 5a3c089e0f..9533367455 100644 --- a/src/sstruct_mv/HYPRE_sstruct_matrix.c +++ b/src/sstruct_mv/HYPRE_sstruct_matrix.c @@ -324,7 +324,7 @@ HYPRE_SStructMatrixInitialize( HYPRE_SStructMatrix matrix ) } HYPRE_Int -HYPRE_SStructMatrixSetEarlyAssembly( HYPRE_SStructMatrix matrix, +HYPRE_SStructMatrixSetEarlyAssemble( HYPRE_SStructMatrix matrix, HYPRE_Int early_assemble ) { HYPRE_IJMatrix ijmatrix = hypre_SStructMatrixIJMatrix(matrix); diff --git a/src/sstruct_mv/HYPRE_sstruct_mv.h b/src/sstruct_mv/HYPRE_sstruct_mv.h index de2165e679..79222f81a9 100644 --- a/src/sstruct_mv/HYPRE_sstruct_mv.h +++ b/src/sstruct_mv/HYPRE_sstruct_mv.h @@ -511,8 +511,15 @@ HYPRE_SStructMatrixDestroy(HYPRE_SStructMatrix matrix); HYPRE_Int HYPRE_SStructMatrixInitialize(HYPRE_SStructMatrix matrix); +/** + * (Optional, GPU only) Sets if the matrix assemble routine does reductions + * of the IJ part before calling HYPRE_SStructMatrixAssemble. + * See also the comments of HYPRE_IJMatrixSetEarlyAssemble. + * This early assemble feature may save the peak memory usage but requires + * extra work. + **/ HYPRE_Int -HYPRE_SStructMatrixSetEarlyAssembly( HYPRE_SStructMatrix matrix, +HYPRE_SStructMatrixSetEarlyAssemble( HYPRE_SStructMatrix matrix, HYPRE_Int early_assemble ); /** diff --git a/src/test/sstruct.c b/src/test/sstruct.c index 2c5cde7dc9..475684624d 100644 --- a/src/test/sstruct.c +++ b/src/test/sstruct.c @@ -3234,8 +3234,6 @@ main( hypre_int argc, HYPRE_SStructMatrixInitialize(A); - HYPRE_SStructMatrixSetEarlyAssembly(A, 0); - if (data.nstencils > 0) { /* StencilSetEntry: set stencil values */ From a7c7882f5efbf34498f5e584091eed2caedbcc93 Mon Sep 17 00:00:00 2001 From: Rui Peng Li Date: Thu, 20 Feb 2025 17:38:12 -0800 Subject: [PATCH 27/27] a minor fix after reviewer comments --- src/IJ_mv/IJMatrix_parcsr_device.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 9a2c6a8637..c6776c7d58 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -169,10 +169,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, if (early_assemble) { stack_elmts_max_new = stack_elmts_required; - if (early_assemble == 1) - { - early_assemble_flag = 1; - } + early_assemble_flag = 1; } else {