From b7d832b37fbe5602917a7325919021f3e1c2dd39 Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 12 Apr 2022 06:56:23 -0600 Subject: [PATCH 1/7] adding test to demonstrate parallel I/O numrecs problem --- nc_test4/Makefile.am | 3 +- nc_test4/run_par_test.sh.in | 4 +++ nc_test4/tst_parallel6.c | 62 +++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 nc_test4/tst_parallel6.c diff --git a/nc_test4/Makefile.am b/nc_test4/Makefile.am index 23328202ec..8ce7e97ded 100644 --- a/nc_test4/Makefile.am +++ b/nc_test4/Makefile.am @@ -93,7 +93,8 @@ endif # BUILD_UTILITIES if TEST_PARALLEL4 check_PROGRAMS += tst_mpi_parallel tst_parallel tst_parallel3 \ tst_parallel4 tst_parallel5 tst_nc4perf tst_mode tst_simplerw_coll_r \ -tst_mode tst_parallel_zlib tst_parallel_compress tst_quantize_par +tst_mode tst_parallel_zlib tst_parallel_compress tst_quantize_par \ +tst_parallel6 TESTS += run_par_test.sh endif # TEST_PARALLEL4 diff --git a/nc_test4/run_par_test.sh.in b/nc_test4/run_par_test.sh.in index fbeded914f..00e4db3631 100644 --- a/nc_test4/run_par_test.sh.in +++ b/nc_test4/run_par_test.sh.in @@ -65,3 +65,7 @@ echo echo "Parallel I/O test for quantize feature." @MPIEXEC@ -n 4 ./tst_quantize_par +echo +echo "Parallel I/O test contributed by wkliao from pnetcdf." +@MPIEXEC@ -n 4 ./tst_parallel6 + diff --git a/nc_test4/tst_parallel6.c b/nc_test4/tst_parallel6.c new file mode 100644 index 0000000000..f6f07350c7 --- /dev/null +++ b/nc_test4/tst_parallel6.c @@ -0,0 +1,62 @@ +/* Copyright 2022, UCAR/Unidata See COPYRIGHT file for copying and + * redistribution conditions. + * + * This parallel I/O test checks the behavior of nc_inq_dimlen() after + * parallel I/O writes. + * + * This program taken from a PNetCDF issue: + * https://github.com/Parallel-NetCDF/PnetCDF/issues/72, thanks + * wkliao! 
+ * + * wkliao, Ed Hartnett, 4/11/22 + */ + +#include +#include "err_macros.h" +#include +#include +#include +#include +#include + +#define FILENAME "tst_parallel6.nc" + +int main(int argc, char** argv) +{ + int err = NC_NOERR, rank, nprocs; + int ncid, cmode, varid, dimid; + size_t start[1], count[1], nrecs; + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (nc_create_par(FILENAME, NC_CLOBBER | NC_NETCDF4, MPI_COMM_WORLD, + MPI_INFO_NULL, &ncid)) ERR; + + if (nc_def_dim(ncid, "time", NC_UNLIMITED, &dimid)) ERR; + if (nc_def_var(ncid, "var", NC_INT, 1, &dimid, &varid)) ERR; + if (nc_var_par_access(ncid, varid, NC_COLLECTIVE)) ERR; + if (nc_enddef(ncid)) ERR; + + start[0] = rank; + count[0] = 1; + if (nc_put_vara_int(ncid, varid, start, count, &rank)) ERR; + MPI_Barrier(MPI_COMM_WORLD); + nc_redef(ncid); + nc_enddef(ncid); + if (nc_inq_dimlen(ncid, dimid, &nrecs)) ERR; + + if (nrecs != nprocs) + { + printf("Rank %d error at line %d of file %s:\n",rank,__LINE__,__FILE__); + printf("\tafter writing start=%zd count=%zd\n", start[0], count[0]); + printf("\texpecting number of records = %d but got %ld\n", + nprocs, nrecs); + ERR; + } + if (nc_close(ncid)) ERR; + + MPI_Finalize(); + return 0; +} From 60a0f07ccc46a8c6719c78573d2e14532ee728e7 Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 12 Apr 2022 07:52:16 -0600 Subject: [PATCH 2/7] attempts to get working --- include/netcdf_par.h | 17 +++++++++++++++++ libhdf5/hdf5internal.c | 28 ++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/include/netcdf_par.h b/include/netcdf_par.h index 17303c7b4f..5577f69812 100644 --- a/include/netcdf_par.h +++ b/include/netcdf_par.h @@ -21,12 +21,29 @@ #define NETCDF_PAR_H 1 #include +#include +#include /** Use with nc_var_par_access() to set parallel access to independent. */ #define NC_INDEPENDENT 0 /** Use with nc_var_par_access() to set parallel access to collective. 
*/ #define NC_COLLECTIVE 1 +/** Find the MPI type that cooresponds to the C size_t type. */ +#if SIZE_MAX == UCHAR_MAX +#define NC_MPI_SIZE_T MPI_UNSIGNED_CHAR +#elif SIZE_MAX == USHRT_MAX +#define NC_MPI_SIZE_T MPI_UNSIGNED_SHORT +#elif SIZE_MAX == UINT_MAX +#define NC_MPI_SIZE_T MPI_UNSIGNED +#elif SIZE_MAX == ULONG_MAX +#define NC_MPI_SIZE_T MPI_UNSIGNED_LONG +#elif SIZE_MAX == ULLONG_MAX +#define NC_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG +#else +#error "cannot determine MPI type for size_t" +#endif + #if defined(__cplusplus) extern "C" { #endif diff --git a/libhdf5/hdf5internal.c b/libhdf5/hdf5internal.c index 972508dcd0..6fdaf98da2 100644 --- a/libhdf5/hdf5internal.c +++ b/libhdf5/hdf5internal.c @@ -157,11 +157,31 @@ find_var_dim_max_length(NC_GRP_INFO_T *grp, int varid, int dimid, BAIL(NC_EHDFERR); LOG((5, "find_var_dim_max_length: varid %d len %d max: %d", varid, (int)h5dimlen[0], (int)h5dimlenmax[0])); - for (d=0; d<var->ndims; d++) { - if (var->dimids[d] == dimid) { + for (d=0; d<var->ndims; d++) + if (var->dimids[d] == dimid) *maxlen = *maxlen > h5dimlen[d] ? *maxlen : h5dimlen[d]; - } - } + +#ifdef USE_PARALLEL + /* If we are doing parallel I/O in collective mode, then + * communicate with all other tasks in the collective and + * find out which has the max value for the dimension + * size. */ + { + assert(grp->nc4_info); + size_t real_maxlen; + + /* If parallel is in use, and var is collective, + * reduce to largest value of maxlen, putting result + * into real_maxlen. 
*/ + if (grp->nc4_info->parallel && var->parallel_access == NC_COLLECTIVE) + { + if (MPI_Allreduce(maxlen, &real_maxlen, 1, NC_MPI_SIZE_T, MPI_MAX, + grp->nc4_info->comm)) + BAIL(NC_EMPI); + } + *maxlen = real_maxlen; + } +#endif /* USE_PARALLEL */ } } From e9ac9bda1cfad3066ea21c9f005b4766a191fd39 Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Mon, 18 Apr 2022 08:22:50 -0600 Subject: [PATCH 3/7] working on parallel numrec problem --- libhdf5/hdf5internal.c | 2 +- nc_test/tst_small.c | 3 ++- nc_test4/tst_parallel6.c | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/libhdf5/hdf5internal.c b/libhdf5/hdf5internal.c index 6fdaf98da2..f3b9871f7d 100644 --- a/libhdf5/hdf5internal.c +++ b/libhdf5/hdf5internal.c @@ -178,8 +178,8 @@ find_var_dim_max_length(NC_GRP_INFO_T *grp, int varid, int dimid, if (MPI_Allreduce(maxlen, &real_maxlen, 1, NC_MPI_SIZE_T, MPI_MAX, grp->nc4_info->comm)) BAIL(NC_EMPI); + *maxlen = real_maxlen; } - *maxlen = real_maxlen; } #endif /* USE_PARALLEL */ } diff --git a/nc_test/tst_small.c b/nc_test/tst_small.c index d4172cec49..f44068271d 100644 --- a/nc_test/tst_small.c +++ b/nc_test/tst_small.c @@ -509,7 +509,8 @@ main(int argc, char **argv) /* Go thru formats and run all tests for each of two (for netCDF-3 * only builds), or 4 (for netCDF-4 builds) different formats. 
*/ - for (i = NUM_FORMATS; i >= 1; i--) + /* for (i = NUM_FORMATS; i >= 1; i--) */ + i = NC_FORMAT_NETCDF4; { switch (i) { diff --git a/nc_test4/tst_parallel6.c b/nc_test4/tst_parallel6.c index f6f07350c7..20e7c5cd06 100644 --- a/nc_test4/tst_parallel6.c +++ b/nc_test4/tst_parallel6.c @@ -31,6 +31,12 @@ int main(int argc, char** argv) MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) + printf("\n*** Testing parallel I/O.\n"); + + if (!rank) + printf("*** testing record lenth with multiple processes writing records..."); + if (nc_create_par(FILENAME, NC_CLOBBER | NC_NETCDF4, MPI_COMM_WORLD, MPI_INFO_NULL, &ncid)) ERR; @@ -42,9 +48,6 @@ int main(int argc, char** argv) start[0] = rank; count[0] = 1; if (nc_put_vara_int(ncid, varid, start, count, &rank)) ERR; - MPI_Barrier(MPI_COMM_WORLD); - nc_redef(ncid); - nc_enddef(ncid); if (nc_inq_dimlen(ncid, dimid, &nrecs)) ERR; if (nrecs != nprocs) @@ -57,6 +60,13 @@ int main(int argc, char** argv) } if (nc_close(ncid)) ERR; + if (!rank) + SUMMARIZE_ERR; + MPI_Finalize(); + + if (!rank) + FINAL_RESULTS; + return 0; } From 08b6ea65a40b511caf47d38f2883d721c3236b6b Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 26 Apr 2022 07:54:47 -0600 Subject: [PATCH 4/7] fixing parallel I/O bug with dim length for unlimited dimensions --- libhdf5/hdf5dim.c | 18 +++++++++++------- libhdf5/hdf5internal.c | 32 +++++++++++++++----------------- nc_test4/tst_parallel6.c | 6 ++++-- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/libhdf5/hdf5dim.c b/libhdf5/hdf5dim.c index 45fd8c5549..6ea81048ef 100644 --- a/libhdf5/hdf5dim.c +++ b/libhdf5/hdf5dim.c @@ -161,20 +161,24 @@ HDF5_inq_dim(int ncid, int dimid, char *name, size_t *lenp) { if (dim->unlimited) { + *lenp = 0; + +#ifndef USE_PARALLEL + /* Shortcut for non-parallel operation: if the dim->len is + * non-zero, it will be set to the correct size. 
*/ + if (dim->len) + *lenp = dim->len; +#endif + /* Since this is an unlimited dimension, go to the file and see how many records there are. Take the max number of records from all the vars that share this dimension. */ - *lenp = 0; - if (dim->len == 0) { + if (*lenp == 0) + { if ((ret = nc4_find_dim_len(dim_grp, dimid, &lenp))) return ret; - if (h5->no_write == NC_TRUE) { dim->len = *lenp; - } - } - else { - *lenp = dim->len; } } else diff --git a/libhdf5/hdf5internal.c b/libhdf5/hdf5internal.c index f3b9871f7d..83add362dc 100644 --- a/libhdf5/hdf5internal.c +++ b/libhdf5/hdf5internal.c @@ -117,6 +117,8 @@ find_var_dim_max_length(NC_GRP_INFO_T *grp, int varid, int dimid, *maxlen = 0; + LOG((3, "find_var_dim_max_length varid %d dimid %d", varid, dimid)); + /* Find this var. */ var = (NC_VAR_INFO_T*)ncindexith(grp->vars,varid); if (!var) return NC_ENOTVAR; @@ -162,24 +164,20 @@ find_var_dim_max_length(NC_GRP_INFO_T *grp, int varid, int dimid, *maxlen = *maxlen > h5dimlen[d] ? *maxlen : h5dimlen[d]; #ifdef USE_PARALLEL - /* If we are doing parallel I/O in collective mode, then - * communicate with all other tasks in the collective and - * find out which has the max value for the dimension - * size. */ + /* If we are doing parallel I/O in collective mode (with + * either pnetcdf or HDF5), then communicate with all + * other tasks in the collective and find out which has + * the max value for the dimension size. */ + assert(grp->nc4_info); + LOG((3, "before Allreduce *maxlen %ld grp->nc4_info->parallel %d var->parallel_access %d", + *maxlen, grp->nc4_info->parallel, var->parallel_access)); + if (grp->nc4_info->parallel && var->parallel_access == NC_COLLECTIVE) { - assert(grp->nc4_info); - size_t real_maxlen; - - /* If parallel is in use, and var is collective, - * reduce to largest value of maxlen, putting result - * into real_maxlen. 
*/ - if (grp->nc4_info->parallel && var->parallel_access == NC_COLLECTIVE) - { - if (MPI_Allreduce(maxlen, &real_maxlen, 1, NC_MPI_SIZE_T, MPI_MAX, - grp->nc4_info->comm)) - BAIL(NC_EMPI); - *maxlen = real_maxlen; - } + if ((MPI_SUCCESS != MPI_Allreduce(MPI_IN_PLACE, maxlen, 1, + MPI_UNSIGNED_LONG_LONG, MPI_MAX, + grp->nc4_info->comm))) + BAIL(NC_EMPI); + LOG((3, "after Allreduce *maxlen %ld", *maxlen)); } #endif /* USE_PARALLEL */ } diff --git a/nc_test4/tst_parallel6.c b/nc_test4/tst_parallel6.c index 20e7c5cd06..c6976c4b2b 100644 --- a/nc_test4/tst_parallel6.c +++ b/nc_test4/tst_parallel6.c @@ -24,7 +24,7 @@ int main(int argc, char** argv) { int err = NC_NOERR, rank, nprocs; - int ncid, cmode, varid, dimid; + int ncid, varid, dimid; size_t start[1], count[1], nrecs; MPI_Init(&argc, &argv); @@ -37,6 +37,7 @@ int main(int argc, char** argv) if (!rank) printf("*** testing record lenth with multiple processes writing records..."); + nc_set_log_level(4); if (nc_create_par(FILENAME, NC_CLOBBER | NC_NETCDF4, MPI_COMM_WORLD, MPI_INFO_NULL, &ncid)) ERR; @@ -49,6 +50,8 @@ int main(int argc, char** argv) count[0] = 1; if (nc_put_vara_int(ncid, varid, start, count, &rank)) ERR; if (nc_inq_dimlen(ncid, dimid, &nrecs)) ERR; + if (nc_close(ncid)) ERR; + nc_set_log_level(-1); if (nrecs != nprocs) { @@ -58,7 +61,6 @@ int main(int argc, char** argv) nprocs, nrecs); ERR; } - if (nc_close(ncid)) ERR; if (!rank) SUMMARIZE_ERR; From 1bcb87fb0c904d48bf207bde340b9f93393a4f5f Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 26 Apr 2022 07:58:05 -0600 Subject: [PATCH 5/7] fixing parallel I/O bug with dim length for unlimited dimensions --- include/netcdf_par.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/netcdf_par.h b/include/netcdf_par.h index 5577f69812..17303c7b4f 100644 --- a/include/netcdf_par.h +++ b/include/netcdf_par.h @@ -21,29 +21,12 @@ #define NETCDF_PAR_H 1 #include -#include -#include /** Use with nc_var_par_access() to set parallel 
access to independent. */ #define NC_INDEPENDENT 0 /** Use with nc_var_par_access() to set parallel access to collective. */ #define NC_COLLECTIVE 1 -/** Find the MPI type that cooresponds to the C size_t type. */ -#if SIZE_MAX == UCHAR_MAX -#define NC_MPI_SIZE_T MPI_UNSIGNED_CHAR -#elif SIZE_MAX == USHRT_MAX -#define NC_MPI_SIZE_T MPI_UNSIGNED_SHORT -#elif SIZE_MAX == UINT_MAX -#define NC_MPI_SIZE_T MPI_UNSIGNED -#elif SIZE_MAX == ULONG_MAX -#define NC_MPI_SIZE_T MPI_UNSIGNED_LONG -#elif SIZE_MAX == ULLONG_MAX -#define NC_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG -#else -#error "cannot determine MPI type for size_t" -#endif - #if defined(__cplusplus) extern "C" { #endif From d7efa7da82fcc551e24b447643d6a8a0f1aac0fc Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 26 Apr 2022 07:59:24 -0600 Subject: [PATCH 6/7] fixing parallel I/O bug with dim length for unlimited dimensions --- nc_test/tst_small.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nc_test/tst_small.c b/nc_test/tst_small.c index f44068271d..d4172cec49 100644 --- a/nc_test/tst_small.c +++ b/nc_test/tst_small.c @@ -509,8 +509,7 @@ main(int argc, char **argv) /* Go thru formats and run all tests for each of two (for netCDF-3 * only builds), or 4 (for netCDF-4 builds) different formats. 
*/ - /* for (i = NUM_FORMATS; i >= 1; i--) */ - i = NC_FORMAT_NETCDF4; + for (i = NUM_FORMATS; i >= 1; i--) { switch (i) { From 540001859366e6a4422e0eb52c5ea357e42003dd Mon Sep 17 00:00:00 2001 From: Edward Hartnett Date: Tue, 26 Apr 2022 08:00:20 -0600 Subject: [PATCH 7/7] fixing parallel I/O bug with dim length for unlimited dimensions --- nc_test4/tst_parallel6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nc_test4/tst_parallel6.c b/nc_test4/tst_parallel6.c index c6976c4b2b..96eccb5bb7 100644 --- a/nc_test4/tst_parallel6.c +++ b/nc_test4/tst_parallel6.c @@ -37,7 +37,7 @@ int main(int argc, char** argv) if (!rank) printf("*** testing record lenth with multiple processes writing records..."); - nc_set_log_level(4); + /* nc_set_log_level(4); */ if (nc_create_par(FILENAME, NC_CLOBBER | NC_NETCDF4, MPI_COMM_WORLD, MPI_INFO_NULL, &ncid)) ERR; @@ -51,7 +51,7 @@ int main(int argc, char** argv) if (nc_put_vara_int(ncid, varid, start, count, &rank)) ERR; if (nc_inq_dimlen(ncid, dimid, &nrecs)) ERR; if (nc_close(ncid)) ERR; - nc_set_log_level(-1); + /* nc_set_log_level(-1); */ if (nrecs != nprocs) {