Collect performance metrics by using one MPI_Gather instead of multiple MPI_Reduce
wkliao committed Nov 2, 2024
1 parent 30ee2ca commit 1fb7c5d
Showing 1 changed file with 179 additions and 93 deletions.
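The idea, as a minimal standalone sketch: each rank packs its counters into a plain struct, one MPI_Gather delivers every rank's struct to rank 0, and rank 0 computes the sums, minima, and maxima locally. The struct members and names below are illustrative rather than those used in report_timing.cpp, and sending the struct as MPI_BYTE assumes the same struct layout on every rank, which is also what the commit relies on.

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {          /* per-rank metrics (illustrative) */
        long long nreqs;      /* number of I/O requests */
        double    time;       /* one timer; the commit gathers seven */
    } metrics_t;

    int main(int argc, char **argv) {
        int rank, np;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &np);

        metrics_t m = { rank + 1, 0.1 * (rank + 1) };  /* stand-in values */
        metrics_t *all = NULL;
        if (rank == 0)  /* receive buffer is significant only at the root */
            all = (metrics_t*) malloc(sizeof(metrics_t) * np);

        /* one collective replaces separate MPI_Reduce calls for SUM/MAX/MIN */
        MPI_Gather(&m, sizeof(metrics_t), MPI_BYTE,
                   all, sizeof(metrics_t), MPI_BYTE, 0, MPI_COMM_WORLD);

        if (rank == 0) {
            long long sum = 0, max_n = all[0].nreqs, min_n = all[0].nreqs;
            double max_t = all[0].time, min_t = all[0].time;
            for (int i = 0; i < np; i++) {
                sum += all[i].nreqs;
                if (all[i].nreqs > max_n) max_n = all[i].nreqs;
                if (all[i].nreqs < min_n) min_n = all[i].nreqs;
                if (all[i].time  > max_t) max_t = all[i].time;
                if (all[i].time  < min_t) min_t = all[i].time;
            }
            printf("nreqs sum/min/max = %lld / %lld / %lld\n", sum, min_n, max_n);
            printf("time min/max      = %.2f / %.2f\n", min_t, max_t);
            free(all);
        }
        MPI_Finalize();
        return 0;
    }

Beyond collapsing several reduction calls into a single collective, gathering the raw per-rank values keeps them available at the root for per-rank reporting later, at the cost of an O(np) receive buffer on rank 0.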
272 changes: 179 additions & 93 deletions src/cases/report_timing.cpp
@@ -14,11 +14,19 @@
#include <stdlib.h>
#include <string.h>
#include <libgen.h> /* basename() */
#include <limits.h> /* LONG_MAX */
#include <assert.h>

#include <mpi.h>
#include <e3sm_io.h>

typedef struct {
MPI_Offset nreqs;
MPI_Offset amount;
MPI_Offset sum_decomp_varlen;
double timing[7];
} perf_metrics;

#ifdef ENABLE_PNC
/*----< pnetcdf_check_mem_usage() >------------------------------------------*/
/* check PnetCDF library internal memory usage */
@@ -62,12 +70,11 @@ int print_timing_WR(e3sm_io_config *cfg,
e3sm_io_decom *decom,
case_meta *cmeta)
{
int i, ndecomp;
MPI_Offset off_msg[3], sum_off[3];
MPI_Offset sum_nreqs, sum_amount_WR, max_nreqs, min_nreqs, max_wr, min_wr;
int i, j, ndecomp;
MPI_Offset vlen, sum_decomp_varlen;
double wTime;
MPI_Comm comm=cfg->io_comm;
double ioTime;
perf_metrics metrics, *all_metrics;

ndecomp = decom->num_decomp;

@@ -90,31 +97,54 @@ int print_timing_WR(e3sm_io_config *cfg,
* total size.
*/

off_msg[0] = cmeta->my_nreqs;
off_msg[1] = cmeta->amount_WR;
off_msg[2] = sum_decomp_varlen;
MPI_Reduce(off_msg, sum_off, 3, MPI_OFFSET, MPI_SUM, 0, comm);
sum_nreqs = sum_off[0];
sum_amount_WR = sum_off[1];
sum_decomp_varlen = sum_off[2];

MPI_Reduce(&cmeta->my_nreqs, &max_nreqs, 1, MPI_OFFSET, MPI_MAX, 0, comm);
MPI_Reduce(&cmeta->my_nreqs, &min_nreqs, 1, MPI_OFFSET, MPI_MIN, 0, comm);
MPI_Reduce(&cmeta->amount_WR, &max_wr, 1, MPI_OFFSET, MPI_MAX, 0, comm);
MPI_Reduce(&cmeta->amount_WR, &min_wr, 1, MPI_OFFSET, MPI_MIN, 0, comm);

double dbl_tmp[7], max_dbl[7], min_dbl[7];
dbl_tmp[0] = cmeta->pre_time;
dbl_tmp[1] = cmeta->open_time;
dbl_tmp[2] = cmeta->def_time;
dbl_tmp[3] = cmeta->post_time;
dbl_tmp[4] = cmeta->flush_time;
dbl_tmp[5] = cmeta->close_time;
dbl_tmp[6] = cmeta->end2end_time;
MPI_Reduce(dbl_tmp, max_dbl, 7, MPI_DOUBLE, MPI_MAX, 0, comm);
MPI_Reduce(dbl_tmp, min_dbl, 7, MPI_DOUBLE, MPI_MIN, 0, comm);
metrics.nreqs = cmeta->my_nreqs;
metrics.amount = cmeta->amount_WR;
metrics.sum_decomp_varlen = sum_decomp_varlen;
metrics.timing[0] = cmeta->pre_time;
metrics.timing[1] = cmeta->open_time;
metrics.timing[2] = cmeta->def_time;
metrics.timing[3] = cmeta->post_time;
metrics.timing[4] = cmeta->flush_time;
metrics.timing[5] = cmeta->close_time;
metrics.timing[6] = cmeta->end2end_time;

if (cfg->rank == 0)
all_metrics = (perf_metrics*) malloc(sizeof(perf_metrics) * cfg->np);

MPI_Gather(&metrics, sizeof(perf_metrics), MPI_BYTE,
all_metrics, sizeof(perf_metrics), MPI_BYTE, 0, comm);

if (cfg->rank == 0) {
MPI_Offset sum_nreqs, sum_amount, max_nreqs, min_nreqs, max_amnt, min_amnt;
double max_t[7], min_t[7];

sum_nreqs = 0;
sum_amount = 0;
sum_decomp_varlen = 0;
max_nreqs = 0;
max_amnt = 0;
min_nreqs = LONG_MAX;
min_amnt = LONG_MAX;
for (j=0; j<7; j++) {
max_t[j] = 0;
min_t[j] = (double)LONG_MAX;
}

for (i=0; i<cfg->np; i++) {
sum_nreqs += all_metrics[i].nreqs;
sum_amount += all_metrics[i].amount;
sum_decomp_varlen += all_metrics[i].sum_decomp_varlen;
max_nreqs = MAX(max_nreqs, all_metrics[i].nreqs);
max_amnt = MAX(max_amnt, all_metrics[i].amount);
min_nreqs = MIN(min_nreqs, all_metrics[i].nreqs);
min_amnt = MIN(min_amnt, all_metrics[i].amount);
for (j=0; j<7; j++) {
max_t[j] = MAX(max_t[j], all_metrics[i].timing[j]);
min_t[j] = MIN(min_t[j], all_metrics[i].timing[j]);
}
}
free(all_metrics);

int nvars_noD = cmeta->nvars;
for (i=0; i<ndecomp; i++) nvars_noD -= cmeta->nvars_D[i];

@@ -187,13 +217,13 @@ int print_timing_WR(e3sm_io_config *cfg,
}

if (cfg->strategy == canonical && (cfg->api == netcdf4 || cfg->api == hdf5))
wTime = max_dbl[3];
ioTime = max_t[3];
else if (cfg->strategy == log && cfg->api == netcdf4)
wTime = max_dbl[5];
ioTime = max_t[5];
else if (cfg->api == pnetcdf || cfg->strategy == log || cfg->api == hdf5_md)
wTime = max_dbl[4];
ioTime = max_t[4];
else /* write happens at file close for hdf5 blob and adios blob */
wTime = max_dbl[5];
ioTime = max_t[5];

if (cfg->strategy == log && (cfg->api == hdf5_log || cfg->api == netcdf4)) {
if (cfg->num_subfiles != 0) {
@@ -247,24 +277,24 @@ int print_timing_WR(e3sm_io_config *cfg,
printf("No. I/O flush calls = %6d\n", cmeta->num_flushes);
printf("-----------------------------------------------------------\n");
printf("Indiv write amount MAX = %.2f MiB = %.2f GiB\n",
(double)max_wr / 1048576, (double)max_wr / 1073741824);
(double)max_amnt / 1048576, (double)max_amnt / 1073741824);
printf("Indiv write amount MIN = %.2f MiB = %.2f GiB\n",
(double)min_wr / 1048576, (double)min_wr / 1073741824);
(double)min_amnt / 1048576, (double)min_amnt / 1073741824);
printf("Total write amount = %.2f MiB = %.2f GiB\n",
(double)sum_amount_WR / 1048576, (double)sum_amount_WR / 1073741824);
(double)sum_amount / 1048576, (double)sum_amount / 1073741824);
printf("-----------------------------------------------------------\n");
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_dbl[0], max_dbl[0]);
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_dbl[1], max_dbl[1]);
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_dbl[2], max_dbl[2]);
printf("Time of posting write requests min/max = %8.4f / %8.4f\n", min_dbl[3], max_dbl[3]);
printf("Time of write flushing min/max = %8.4f / %8.4f\n", min_dbl[4], max_dbl[4]);
printf("Time of close min/max = %8.4f / %8.4f\n", min_dbl[5], max_dbl[5]);
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_dbl[6], max_dbl[6]);
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_t[0], max_t[0]);
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_t[1], max_t[1]);
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_t[2], max_t[2]);
printf("Time of posting write requests min/max = %8.4f / %8.4f\n", min_t[3], max_t[3]);
printf("Time of write flushing min/max = %8.4f / %8.4f\n", min_t[4], max_t[4]);
printf("Time of close min/max = %8.4f / %8.4f\n", min_t[5], max_t[5]);
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_t[6], max_t[6]);
printf("Emulate computation time (sleep) min/max = %8.4f / %8.4f\n", (double)(cfg->comp_time), (double)(cfg->comp_time));
printf("I/O bandwidth in MiB/sec (write-only) = %.4f\n",
(double)sum_amount_WR / 1048576.0 / wTime);
(double)sum_amount / 1048576.0 / ioTime);
printf("I/O bandwidth in MiB/sec (open-to-close) = %.4f\n",
(double)sum_amount_WR / 1048576.0 / max_dbl[6]);
(double)sum_amount / 1048576.0 / max_t[6]);
printf("-----------------------------------------------------------\n");

/* print MPI-IO hints actually used */
@@ -308,12 +338,11 @@ int print_timing_RD(e3sm_io_config *cfg,
e3sm_io_decom *decom,
case_meta *cmeta)
{
int i, ndecomp;
MPI_Offset off_msg[3], sum_off[3];
MPI_Offset sum_nreqs, sum_amount_RD, max_nreqs, min_nreqs;
int i, j, ndecomp;
MPI_Offset vlen, sum_decomp_varlen;
double wTime;
MPI_Comm comm=cfg->io_comm;
double ioTime;
perf_metrics metrics, *all_metrics;

ndecomp = decom->num_decomp;

@@ -336,29 +365,54 @@ int print_timing_RD(e3sm_io_config *cfg,
* total size.
*/

off_msg[0] = cmeta->my_nreqs;
off_msg[1] = cmeta->amount_RD;
off_msg[2] = sum_decomp_varlen;
MPI_Reduce(off_msg, sum_off, 3, MPI_OFFSET, MPI_SUM, 0, comm);
sum_nreqs = sum_off[0];
sum_amount_RD = sum_off[1];
sum_decomp_varlen = sum_off[2];

MPI_Reduce(&cmeta->my_nreqs, &max_nreqs, 1, MPI_OFFSET, MPI_MAX, 0, comm);
MPI_Reduce(&cmeta->my_nreqs, &min_nreqs, 1, MPI_OFFSET, MPI_MIN, 0, comm);

double dbl_tmp[7], max_dbl[7], min_dbl[7];
dbl_tmp[0] = cmeta->pre_time;
dbl_tmp[1] = cmeta->open_time;
dbl_tmp[2] = cmeta->def_time;
dbl_tmp[3] = cmeta->post_time;
dbl_tmp[4] = cmeta->flush_time;
dbl_tmp[5] = cmeta->close_time;
dbl_tmp[6] = cmeta->end2end_time;
MPI_Reduce(dbl_tmp, max_dbl, 7, MPI_DOUBLE, MPI_MAX, 0, comm);
MPI_Reduce(dbl_tmp, min_dbl, 7, MPI_DOUBLE, MPI_MIN, 0, comm);
metrics.nreqs = cmeta->my_nreqs;
metrics.amount = cmeta->amount_RD;
metrics.sum_decomp_varlen = sum_decomp_varlen;
metrics.timing[0] = cmeta->pre_time;
metrics.timing[1] = cmeta->open_time;
metrics.timing[2] = cmeta->def_time;
metrics.timing[3] = cmeta->post_time;
metrics.timing[4] = cmeta->flush_time;
metrics.timing[5] = cmeta->close_time;
metrics.timing[6] = cmeta->end2end_time;

if (cfg->rank == 0)
all_metrics = (perf_metrics*) malloc(sizeof(perf_metrics) * cfg->np);

MPI_Gather(&metrics, sizeof(perf_metrics), MPI_BYTE,
all_metrics, sizeof(perf_metrics), MPI_BYTE, 0, comm);

if (cfg->rank == 0) {
MPI_Offset sum_nreqs, sum_amount, max_nreqs, min_nreqs, max_amnt, min_amnt;
double max_t[7], min_t[7];

sum_nreqs = 0;
sum_amount = 0;
sum_decomp_varlen = 0;
max_nreqs = 0;
max_amnt = 0;
min_nreqs = LONG_MAX;
min_amnt = LONG_MAX;
for (j=0; j<7; j++) {
max_t[j] = 0;
min_t[j] = (double)LONG_MAX;
}

for (i=0; i<cfg->np; i++) {
sum_nreqs += all_metrics[i].nreqs;
sum_amount += all_metrics[i].amount;
sum_decomp_varlen += all_metrics[i].sum_decomp_varlen;
max_nreqs = MAX(max_nreqs, all_metrics[i].nreqs);
max_amnt = MAX(max_amnt, all_metrics[i].amount);
min_nreqs = MIN(min_nreqs, all_metrics[i].nreqs);
min_amnt = MIN(min_amnt, all_metrics[i].amount);
for (j=0; j<7; j++) {
max_t[j] = MAX(max_t[j], all_metrics[i].timing[j]);
min_t[j] = MIN(min_t[j], all_metrics[i].timing[j]);
}
}
free(all_metrics);

int nvars_noD = cmeta->nvars;
for (i=0; i<ndecomp; i++) nvars_noD -= cmeta->nvars_D[i];

@@ -379,40 +433,64 @@ int print_timing_RD(e3sm_io_config *cfg,
printf("Variable read order: same as variables are defined\n");

if (cfg->strategy == canonical) {
if (cfg->api == pnetcdf)
if (cfg->api == pnetcdf) {
printf("==== PnetCDF canonical I/O using varn API ============\n");
else if (cfg->api == hdf5)
printf("PnetCDF library version = %s\n",cfg->pnetcdf_ver);
}
else if (cfg->api == hdf5) {
printf("==== HDF5 canonical I/O ==============================\n");
else if (cfg->api == hdf5_md)
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
}
else if (cfg->api == hdf5_md) {
printf("==== HDF5 canonical I/O using multi-dataset API ======\n");
else if (cfg->api == netcdf4)
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
}
else if (cfg->api == netcdf4) {
printf("==== NetCDF-4 canonical I/O ==========================\n");
printf("NetCDF4 library version = %s\n",cfg->netcdf4_ver);
}
}
else if (cfg->strategy == log) {
if (cfg->api == hdf5)
if (cfg->api == hdf5) {
printf("==== HDF5 using log-based VOL through native APIs=====\n");
else if (cfg->api == hdf5_log)
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
}
else if (cfg->api == hdf5_log) {
printf("==== HDF5 using log-based VOL APIs ===================\n");
else if (cfg->api == netcdf4)
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
}
else if (cfg->api == netcdf4) {
printf("==== NetCDF-4 using HDF5 log-based VOL ===============\n");
printf("NetCDF4 library version = %s\n",cfg->netcdf4_ver);
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
}
}
else if (cfg->strategy == blob) {
if (cfg->api == pnetcdf)
if (cfg->api == pnetcdf) {
printf("==== PnetCDF blob I/O ================================\n");
else if (cfg->api == hdf5)
printf("PnetCDF library version = %s\n",cfg->pnetcdf_ver);
}
else if (cfg->api == hdf5) {
printf("==== HDF5 blob I/O ===================================\n");
else if (cfg->api == adios)
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
}
else if (cfg->api == adios) {
printf("==== ADIOS blob I/O ==================================\n");
printf("ADIOS library version = %s\n",cfg->adios_ver);
}
}

if (cfg->strategy == canonical && (cfg->api == netcdf4 || cfg->api == hdf5))
wTime = max_dbl[3];
ioTime = max_t[3];
else if (cfg->strategy == log && cfg->api == netcdf4)
wTime = max_dbl[5];
ioTime = max_t[5];
else if (cfg->api == pnetcdf || cfg->strategy == log || cfg->api == hdf5_md)
wTime = max_dbl[4];
ioTime = max_t[4];
else /* read happens at file close for hdf5 blob and adios blob */
wTime = max_dbl[5];
ioTime = max_t[5];

if (cfg->strategy == log && (cfg->api == hdf5_log || cfg->api == netcdf4)) {
if (cfg->num_subfiles != 0) {
@@ -465,27 +543,35 @@ int print_timing_RD(e3sm_io_config *cfg,
printf("I/O flush frequency = %6d\n", cmeta->ffreq);
printf("No. I/O flush calls = %6d\n", cmeta->num_flushes);
printf("-----------------------------------------------------------\n");
printf("Total read amount = %.2f MiB = %.2f GiB\n",
(double)sum_amount_RD / 1048576, (double)sum_amount_RD / 1073741824);
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_dbl[0], max_dbl[0]);
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_dbl[1], max_dbl[1]);
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_dbl[2], max_dbl[2]);
printf("Time of posting read requests min/max = %8.4f / %8.4f\n", min_dbl[3], max_dbl[3]);
printf("Time of read flushing min/max = %8.4f / %8.4f\n", min_dbl[4], max_dbl[4]);
printf("Time of close min/max = %8.4f / %8.4f\n", min_dbl[5], max_dbl[5]);
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_dbl[6], max_dbl[6]);
printf("Indiv write amount MAX = %.2f MiB = %.2f GiB\n",
(double)max_amnt / 1048576, (double)max_amnt / 1073741824);
printf("Indiv write amount MIN = %.2f MiB = %.2f GiB\n",
(double)min_amnt / 1048576, (double)min_amnt / 1073741824);
printf("Total read amount = %.2f MiB = %.2f GiB\n",
(double)sum_amount / 1048576, (double)sum_amount / 1073741824);
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_t[0], max_t[0]);
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_t[1], max_t[1]);
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_t[2], max_t[2]);
printf("Time of posting read requests min/max = %8.4f / %8.4f\n", min_t[3], max_t[3]);
printf("Time of read flushing min/max = %8.4f / %8.4f\n", min_t[4], max_t[4]);
printf("Time of close min/max = %8.4f / %8.4f\n", min_t[5], max_t[5]);
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_t[6], max_t[6]);
printf("Emulate computation time (sleep) min/max = %8.4f / %8.4f\n", (double)(cfg->comp_time), (double)(cfg->comp_time));
printf("I/O bandwidth in MiB/sec (read-only) = %.4f\n",
(double)sum_amount_RD / 1048576.0 / wTime);
(double)sum_amount / 1048576.0 / ioTime);
printf("I/O bandwidth in MiB/sec (open-to-close) = %.4f\n",
(double)sum_amount_RD / 1048576.0 / max_dbl[6]);
(double)sum_amount / 1048576.0 / max_t[6]);
printf("-----------------------------------------------------------\n");

/* print MPI-IO hints actually used */
if (cfg->verbose) print_info(&cmeta->info_used);
}
fflush(stdout);

#ifdef ENABLE_PNC
pnetcdf_check_mem_usage(comm);
#endif

if (cmeta->info_used != MPI_INFO_NULL) MPI_Info_free(&cmeta->info_used);

return 0;
