Optimize worker behavior for object store writes #650
@@ -212,4 +212,95 @@ run_secondary_mpi_ssh_worker_node()
    while [ -e ${RUN_SENTINEL} ] && kill -s 0 "${_SSH_D_PID}" 2>/dev/null ; do
        sleep 5
    done
}

tar_and_copy()
{
    # If $1 is "--dry-run" then just do sanity checks without tarring or copying, then shift args
    # If $1 is "--compress", then indicate that tar should be gzipped, then shift args

    # $1 is source directory containing contents to archive
    # $2 is the name of/path to the produced tar archive
    # $3 is the location to copy to

    if [ "${1:?No args given to tar_and_copy}" == "--dry-run" ]; then
        local _DRY_RUN="true"
        shift
    fi

    if [ "${1:?No contents directory given to tar_and_copy}" == "--compress" ]; then
        local _TAR_EXTRA_ARGS="-z"
        shift
    fi

    local _CONTENTS_DIR="${1:?No contents directory given to tar_and_copy}"
    local _TAR_FILE="${2:?No archive file given to tar_and_copy}"
    local _DEST_DIR="${3:?No copy destination directory given to tar_and_copy}"

    if [ ! -e "${_CONTENTS_DIR}" ]; then
        >&2 echo "$(print_date) Error: tar_and_copy contents directory '${_CONTENTS_DIR}' does not exist!"
        exit 1
    elif [ ! -d "${_CONTENTS_DIR}" ]; then
        >&2 echo "$(print_date) Error: tar_and_copy contents directory '${_CONTENTS_DIR}' exists but is not a directory!"
        exit 1
    elif [ ! -e "${_DEST_DIR}" ]; then
        >&2 echo "$(print_date) Error: tar_and_copy copy destination directory '${_DEST_DIR}' does not exist!"
        exit 1
    elif [ ! -d "${_DEST_DIR}" ]; then
        >&2 echo "$(print_date) Error: tar_and_copy copy destination directory '${_DEST_DIR}' exists but is not a directory!"
        exit 1
    elif [ -e "${_TAR_FILE}" ]; then
        >&2 echo "$(print_date) Error: tar_and_copy archive file '${_TAR_FILE}' already exists!"

Review comment: This sort of error makes rerunning commands impossible. Dumping data to the same location is a reasonable use case, even if it's not something we want in a prod-like setting.

Reply: In a vacuum, yes, but this is specifically intended to be run inside a fresh container. So there really should never be anything there already. Chances are that if there is, something is going wrong.

        exit 1
    fi

    if [ "${_DRY_RUN:-}" == "true" ]; then
        return 0
    fi

    tar -c ${_TAR_EXTRA_ARGS:-} -f "${_DEST_DIR}/${_TAR_FILE}" -C "${_CONTENTS_DIR}" .
    #cp -a "${_TAR_FILE}" "${_DEST_DIR}/."
    #rm "${_TAR_FILE}"
}

gather_output() {
    echo "$(print_date) Gather from remote worker host ${JOB_OUTPUT_WRITE_DIR:?Job temp output dir not defined} dirs"
    for i in $(echo "${MPI_HOST_STRING}" | sed 's/,/ /g'); do
        _HOST_NAME=$(echo "${i}" | awk -F: '{print $1}')
        if [ "$(hostname)" == "${_HOST_NAME}" ]; then
            continue
        fi
        scp -q -r ${_HOST_NAME}:${JOB_OUTPUT_WRITE_DIR}/ ${JOB_OUTPUT_WRITE_DIR}/. &
    done
    for p in $(jobs -p); do
        wait ${p}
        _R=$?
        if [ ${_R} -ne 0 ]; then
            echo "$(print_date) Error: remote copying of output exited with error ${_R}"
            exit ${_R}
        fi
    done
}

move_output_to_dataset()
{
    # $1 output directory
    # $2 dataset directory

    if [ ! -d ${1:?No output directory given for copying to dataset} ]; then
        >&2 echo "$(print_date) Error: cannot move output from non-directory path '${1}' to output dataset!"
        exit 1
    elif [ ! -d ${2:?No output dataset directory given for copying} ]; then
        >&2 echo "$(print_date) Error: cannot move output to non-directory path '${2}' for output dataset!"
        exit 1
    fi

    if [ $(ls ${1} | grep '.csv' | wc -l) -gt 0 ]; then
        echo "$(print_date) Archiving and copying output CSVs to output dataset"
        tar_and_copy ${1} job-${JOB_ID:?}-output.tar ${2}

Review comment: Any reason not to use compression here?

Reply: I think because the goal was to reduce practical running time by reducing transferred file count specifically. The size of the data was not of tremendous importance, and compression would slow things, both now and later. In contrast, since the configs were effectively duplicated here, and not the primary content of the output dataset, it made more sense to compress that data.

    else
        echo "$(print_date) Copying output file(s) to output dataset"
        cp -a ${1}/. ${2}/.
    fi
    rm -rf ${1}
}

Review comment: Similar to @christophertubbs's above comment about using Python, for example. While out of scope for this PR, we should really consider rewriting this entry point and related helper functions in something other than POSIX-compliant shell.
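
As a rough illustration of that suggestion, here is a hypothetical sketch of the tar_and_copy helper above in Python, using only the standard library; the signature, names, and error handling are invented for the example rather than taken from this PR:

    # Hypothetical Python counterpart to tar_and_copy(), sketched only to illustrate
    # the suggested rewrite; the names and interface are invented for the example.
    import tarfile
    from pathlib import Path

    def tar_and_copy(contents_dir: str, tar_file: str, dest_dir: str,
                     compress: bool = False, dry_run: bool = False) -> None:
        contents, dest = Path(contents_dir), Path(dest_dir)
        archive = dest / tar_file
        # Same sanity checks as the shell version, but raising exceptions instead of exiting
        if not contents.is_dir():
            raise ValueError(f"tar_and_copy contents directory '{contents}' is not an existing directory")
        if not dest.is_dir():
            raise ValueError(f"tar_and_copy destination directory '{dest}' is not an existing directory")
        if archive.exists():
            raise FileExistsError(f"tar_and_copy archive file '{archive}' already exists")
        if dry_run:
            return
        # Write the archive directly into the destination, gzipped when requested
        with tarfile.open(archive, mode="w:gz" if compress else "w") as tar:
            tar.add(contents, arcname=".")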
@@ -4,54 +4,65 @@
while [ ${#} -gt 0 ]; do
    case "${1}" in
        --config-dataset)
            CONFIG_DATASET_NAME="${2:?}"
            declare -x CONFIG_DATASET_NAME="${2:?}"
            shift
            ;;
        --host-string)
            MPI_HOST_STRING="${2:?}"
            declare -x MPI_HOST_STRING="${2:?}"
            shift
            ;;
        --hydrofabric-dataset)
            HYDROFABRIC_DATASET_NAME="${2:?}"
            declare -x HYDROFABRIC_DATASET_NAME="${2:?}"
            shift
            ;;
        --job-id)
            JOB_ID="${2:?}"
            declare -x JOB_ID="${2:?}"
            shift
            ;;
        --node-count)
            MPI_NODE_COUNT="${2:?}"
            declare -x MPI_NODE_COUNT="${2:?}"
            shift
            ;;
        --output-dataset)
            OUTPUT_DATASET_NAME="${2:?}"
            declare -x OUTPUT_DATASET_NAME="${2:?}"
            shift
            ;;
        --partition-dataset)
            PARTITION_DATASET_NAME="${2:?}"
            declare -x PARTITION_DATASET_NAME="${2:?}"
            shift
            ;;
        --worker-index)
            WORKER_INDEX="${2:?}"
            declare -x WORKER_INDEX="${2:?}"
            shift
            ;;
        --calibration-config-file)
            CALIBRATION_CONFIG_BASENAME="${2:?}"
            declare -x CALIBRATION_CONFIG_BASENAME="${2:?}"
            shift
            ;;
    esac
    shift
done

# TODO: (later) in both ngen and ngen-cal entrypoints, add controls for whether this is temp dir or output dataset dir
declare -x JOB_OUTPUT_WRITE_DIR="/tmp/job_output"

# Get some universally applicable functions and constants
source ./funcs.sh

ngen_sanity_checks_and_derived_init
init_script_mpi_vars
init_ngen_executable_paths

# Move to the output dataset mounted directory
cd ${OUTPUT_DATASET_DIR:?Output dataset directory not defined}
# Move to the output write directory
# TODO: (later) in both ngen and ngen-cal entrypoints, control whether this is needed, based on if write dir is output dataset dir
#cd ${OUTPUT_DATASET_DIR:?Output dataset directory not defined}
mkdir ${JOB_OUTPUT_WRITE_DIR:?}
chown ${MPI_USER}:${MPI_USER} ${JOB_OUTPUT_WRITE_DIR}
cd ${JOB_OUTPUT_WRITE_DIR}
# Needed for routing
if [ ! -e /dmod/datasets/linked_job_output ]; then
    ln -s ${JOB_OUTPUT_WRITE_DIR} /dmod/datasets/linked_job_output
fi

start_calibration() {
    # Start ngen calibration

@@ -61,24 +72,31 @@ start_calibration() {
        echo "$(print_date) Starting ngen calibration with serial ngen execution"
    fi

    # Find and use copy of config in output dataset
    # Find calibration config, then copy to output dataset and use that
    if [ -n "${CALIBRATION_CONFIG_BASENAME:-}" ]; then
        CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -type f -name "${CALIBRATION_CONFIG_BASENAME}" -maxdepth 1 | head -1)
        #CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -type f -name "${CALIBRATION_CONFIG_BASENAME}" -maxdepth 1 | head -1)
        _ORIG_CAL_CONFIG_FILE=$(find ${CONFIG_DATASET_DIR:?} -type f -name "${CALIBRATION_CONFIG_BASENAME}" -maxdepth 1 | head -1)
    else
        CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -type f -iname "*.yaml" -o -iname "*.yml" -maxdepth 1 | head -1)
        #CALIBRATION_CONFIG_FILE=$(find ${OUTPUT_DATASET_DIR:?} -type f -iname "*.yaml" -o -iname "*.yml" -maxdepth 1 | head -1)
        _ORIG_CAL_CONFIG_FILE=$(find ${CONFIG_DATASET_DIR:?} -type f -iname "*.yaml" -o -iname "*.yml" -maxdepth 1 | head -1)
    fi

    if [ -z "${CALIBRATION_CONFIG_FILE}" ]; then
    if [ -z "${_ORIG_CAL_CONFIG_FILE}" ]; then
        echo "Error: NGEN calibration yaml file not found" 2>&1
        exit 1
    fi
    cp -a ${_ORIG_CAL_CONFIG_FILE:?} ${OUTPUT_DATASET_DIR:?}/.

Review comment: Marking this as related to #407. I am not sure where we should document the expectations/spec for a worker image's data requirements, but this feels like a prime candidate for that sort of documentation. Just want to track this because it will affect how we need to normalize paths.

    CALIBRATION_CONFIG_FILE="${OUTPUT_DATASET_DIR:?}/$(basename ${_ORIG_CAL_CONFIG_FILE})"

    python3 -m ngen.cal "${CALIBRATION_CONFIG_FILE}"

    # Capture the return value to use as service exit code
    NGEN_RETURN=$?

    echo "$(print_date) ngen calibration finished with return value: ${NGEN_RETURN}"

    # TODO: (later) make sure outputs are handled properly, and that eventually we support toggling whether written to
    # TODO: output dataset dir directly or somewhere else

    # Exit with the model's exit code
    return ${NGEN_RETURN}
}

@@ -89,11 +107,16 @@ if [ "${WORKER_INDEX:-0}" = "0" ]; then
    # This will only have an effect when running with multiple MPI nodes, so its safe to have even in serial exec
    trap close_remote_workers EXIT
    # Have "main" (potentially only) worker copy config files to output dataset for record keeping
    # TODO: perform copy of configs to output dataset outside of image (in service) for better performance
    cp -a ${CONFIG_DATASET_DIR:?Config dataset directory not defined}/. ${OUTPUT_DATASET_DIR:?}
    # TODO: (later) in ngen and ngen-cal entrypoints, consider adding controls for whether this is done or a simpler
    # TODO: 'cp' call, based on whether we write directly to output dataset dir or some other output write dir
    # Do a dry run first to sanity check directory and fail if needed before backgrounding process
    tar_and_copy --dry-run --compress ${CONFIG_DATASET_DIR:?Config dataset directory not defined} config_dataset.tgz ${OUTPUT_DATASET_DIR:?}
    # Then actually run the archive and copy function in the background
    tar_and_copy --compress ${CONFIG_DATASET_DIR:?} config_dataset.tgz ${OUTPUT_DATASET_DIR:?} &
    _CONFIG_COPY_PROC=$!

Review comment: Shouldn't we wait on this background process at some point?

Reply: I am mostly kicking that can down the road, but practically I don't think we will have to. This image is going to need extensive testing regardless, and we can sort out any issues related to this then.

    # If there is partitioning, which implies multi-processing job ...
    if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
        # Include partition config dataset too if appropriate
        # TODO: perform copy of configs to output dataset outside of image (in service) for better performance
        cp -a ${PARTITION_DATASET_DIR}/. ${OUTPUT_DATASET_DIR:?}
    fi
@@ -311,7 +311,7 @@ def output_formats(self) -> List[DataFormat]:
        List[DataFormat]
            List of the formats of each required output dataset for the requested job.
        """
        return [DataFormat.NGEN_OUTPUT]
        return [DataFormat.ARCHIVED_NGEN_CSV_OUTPUT]

Review comment: Thinking about the output from NextGen, let's say that ...

Reply: An interesting question. Probably something that deserves its own issue, as it isn't well defined at the moment.

Reply: This may be simpler than I thought initially (although we can still consider giving it its own issue if more conversation is needed). First, it is not possible to completely, explicitly define an ngen output data format that covers all possible output variables. I'm not going to work through it entirely right now, but my back-of-the-envelope math says ngen may actually support uncountably infinitely many possible combinations of outputs, in theory at least. Regardless, too many to cover. And so, we don't try to. While some DataFormat values do define ... A format does have to enable domains to elaborate on certain things, though, via the standard indices. So it is useful (perhaps necessary) to distinguish particulars of how the indices data are available within the dataset, and this is done implicitly by the format and its name. E.g., for ... So, it is still useful to have ngen-related output formats that imply only part of how data are structured within such datasets, even if it doesn't (because it can't) tell us everything about what's in there.


    @property
    def partition_cfg_data_id(self) -> Optional[str]:
@@ -118,11 +118,11 @@ class DataFormat(PydanticEnum):
        True
    )
    """ The default format for "raw" AORC forcing data. """
    NGEN_OUTPUT = (3,
                   {StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: None, StandardDatasetIndex.DATA_ID: None},
                   None,
                   True)
    """ Representation of the format for Nextgen output, with unknown/unspecified configuration of output fields. """
    NGEN_CSV_OUTPUT = (3,
                       {StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: None, StandardDatasetIndex.DATA_ID: None},
                       None,
                       True)
    """ Format for output of ngen when written as CSV, with unknown/unspecified configuration of output fields. """
    NGEN_REALIZATION_CONFIG = (
        4, {StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: None, StandardDatasetIndex.DATA_ID: None}, None, True)
    """ Representation of the format of realization configs, which covers catchments (id) and has a time period (time). """
@@ -221,15 +221,28 @@ class DataFormat(PydanticEnum):
    is removed).
    """

    ARCHIVED_NGEN_CSV_OUTPUT = (17,
                                {StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: None, StandardDatasetIndex.DATA_ID: None},
                                None,
                                True)
    """ Format for output of ngen, similar to ``NGEN_CSV_OUTPUT``, but with all output archived to single tar file. """

    NGEN_NETCDF_OUTPUT = (18,
                          {StandardDatasetIndex.CATCHMENT_ID: None, StandardDatasetIndex.TIME: None,
                           StandardDatasetIndex.DATA_ID: None},
                          None,
                          True)
    """ Format for output of ngen when written to single NetCDF file, with dynamically configured output fields. """

    @classmethod
    def can_format_fulfill(cls, needed: DataFormat, alternate: DataFormat) -> bool:
        """
        Test whether data in an alternate format is capable of satisfying requirements of some other format.
        Test whether a dataset and contained data in some format can satisfy requirements of a different format.

Review comment: Props for the attention to documentation.

        This function indicates whether data in one format (the alternate format) is compatible with requirements
        specified using a different format (the needed format). It is an indication of whether data is **potentially**
        capable of satisfying a requirement - even if the data formats of the two are not the same - due to the two
        formats being sufficiently similar.
        This function indicates whether a hypothetical dataset and its data, having some particular format (the
        alternate format), is compatible with hypothetical requirements specified using a different format (the needed
        format). It is an indication of whether a dataset and its data are **potentially** capable of satisfying a
        requirement, even with a different format, due to the two formats being sufficiently similar.

        For example, the NextGen framework can support forcings in either CSV or NetCDF formats, represented as
        ``AORC_CSV`` and ``NETCDF_FORCING_CANONICAL`` respectively. A job to execute NextGen would include a forcing
@@ -264,7 +277,12 @@ def can_format_fulfill(cls, needed: DataFormat, alternate: DataFormat) -> bool:
        compatible_forcing_formats = {cls.AORC_CSV, cls.NETCDF_FORCING_CANONICAL, cls.NETCDF_AORC_DEFAULT}
        if needed in compatible_forcing_formats and alternate in compatible_forcing_formats:
            return True
        # Anything else, they are compatible

        ngen_csv_output_formats = {cls.ARCHIVED_NGEN_CSV_OUTPUT, cls.NGEN_CSV_OUTPUT}
        if needed in ngen_csv_output_formats and alternate in ngen_csv_output_formats:
            return True

        # Anything else, they are not compatible
        return False

    @classmethod
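
To illustrate how the compatibility sets in this change behave, here is a short hypothetical usage sketch; the import path is an assumption and may not match the repository layout, and the assertions reflect only the logic shown in this hunk:

    # Hypothetical usage of the compatibility rules shown above; the module path
    # below is assumed for illustration and may differ in the actual codebase.
    from dmod.core.meta_data import DataFormat

    # A requirement expressed against plain CSV output can be fulfilled by the
    # archived CSV variant, and vice versa, per the new ngen_csv_output_formats set.
    assert DataFormat.can_format_fulfill(needed=DataFormat.NGEN_CSV_OUTPUT,
                                         alternate=DataFormat.ARCHIVED_NGEN_CSV_OUTPUT)
    assert DataFormat.can_format_fulfill(needed=DataFormat.ARCHIVED_NGEN_CSV_OUTPUT,
                                         alternate=DataFormat.NGEN_CSV_OUTPUT)

    # The NetCDF output format is not part of that set, so per the logic shown here
    # it cannot stand in for a CSV output requirement.
    assert not DataFormat.can_format_fulfill(needed=DataFormat.NGEN_CSV_OUTPUT,
                                             alternate=DataFormat.NGEN_NETCDF_OUTPUT)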
Review comment: This sort of logic needs to be implemented via Python rather than bash to reduce the size of large and confusing bash scripts and to make the logic easy to debug, modify, and extend. As it stands, if something goes awry, it's incredibly difficult to debug and fix.

Reply: So my initial thought was that it didn't make sense to try to do new things as separate pieces in Python. That it probably is better overall for these things to be done in Python, rather than shell scripts, but that really all the shell scripts (or at least groups that run together in any particular setting, like those involved in container entrypoints) should be ported all at once.

I'll rethink this a bit. I don't want to introduce something that's on the whole even more complicated to follow than this, and that was my worry. But maybe it won't be quite that bad.

Reply: This might do exactly what you want: https://docs.python.org/3.8/library/shutil.html#shutil.make_archive

Converting the scripts a bit at a time would make things easier. That way we can ensure that each bit works with as small a scope as possible, it'll be less of a burden, it won't take as long, and we'll be less likely to run into errors.
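
For reference, the call from those docs would roughly replace the gzipped tar invocation in tar_and_copy; a minimal sketch, with placeholder paths:

    import shutil

    # Roughly equivalent in spirit to:
    #   tar_and_copy --compress ${CONFIG_DATASET_DIR} config_dataset.tgz ${OUTPUT_DATASET_DIR}
    # Note that make_archive appends the format's own extension, producing .../config_dataset.tar.gz;
    # the paths below are placeholders, not values from this PR.
    shutil.make_archive("/path/to/output_dataset/config_dataset", "gztar",
                        root_dir="/path/to/config_dataset")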