From 265528012d592c618e7259ce80caba2dc382f748 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 2 May 2023 22:21:27 -0500 Subject: [PATCH 01/44] Add MPI back to daemon in order to support parallel spawning easier --- src/api/hermes_daemon.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/api/hermes_daemon.cc b/src/api/hermes_daemon.cc index 2ea8d7ff6..4c6d199ec 100644 --- a/src/api/hermes_daemon.cc +++ b/src/api/hermes_daemon.cc @@ -15,9 +15,15 @@ #include "hermes_shm/util/logging.h" #include "hermes.h" +#include + namespace hapi = hermes::api; +/** + * */ + int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); HILOG(kDebug, "Hermes start daemon begins") std::string hermes_config = ""; if (argc == 2) { @@ -30,5 +36,6 @@ int main(int argc, char* argv[]) { hermes->RunDaemon(); hermes->Finalize(); + MPI_Finalize(); return 0; } From f22a9d721325928b643933c033d45f16a66b4612 Mon Sep 17 00:00:00 2001 From: tang584 Date: Tue, 2 May 2023 20:41:39 -0700 Subject: [PATCH 02/44] modify VFD adaptor to depend on HDF5 1.14.0 --- CMakeLists.txt | 2 +- README.md | 6 ++++++ adapter/vfd/H5FDhermes.cc | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 78dcc1c0a..5de5b27d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -362,7 +362,7 @@ endif() # HDF5 if(HERMES_ENABLE_VFD) - set(HERMES_REQUIRED_HDF5_VERSION 1.13.0) + set(HERMES_REQUIRED_HDF5_VERSION 1.14.0) set(HERMES_REQUIRED_HDF5_COMPONENTS C) find_package(HDF5 ${HERMES_REQUIRED_HDF5_VERSION} CONFIG NAMES hdf5 COMPONENTS ${HERMES_REQUIRED_HDF5_COMPONENTS} shared) diff --git a/README.md b/README.md index 4d45a5e6c..026cb4f1c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,9 @@ +# Fork Branch Description +This branch is for developing a customized Hermes VFD adaptor. 
+## Dependencies +* HDF5 1.14.0 + +--- # Hermes Hermes is a heterogeneous-aware, multi-tiered, dynamic, and distributed I/O buffering system that aims to significantly accelerate I/O performance. See the [official site](http://www.cs.iit.edu/~scs/assets/projects/Hermes/Hermes.html) for more information. For design documents, architecture description, performance data, and individual component design, see the [wiki](https://github.com/HDFGroup/hermes/wiki). diff --git a/adapter/vfd/H5FDhermes.cc b/adapter/vfd/H5FDhermes.cc index 517d94170..a9f2ce0f5 100644 --- a/adapter/vfd/H5FDhermes.cc +++ b/adapter/vfd/H5FDhermes.cc @@ -114,6 +114,7 @@ static herr_t H5FD__hermes_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, static const H5FD_class_t H5FD_hermes_g = { + H5FD_CLASS_VERSION, /* struct version */ H5FD_HERMES_VALUE, /* value */ H5FD_HERMES_NAME, /* name */ MAXADDR, /* maxaddr */ @@ -142,6 +143,10 @@ static const H5FD_class_t H5FD_hermes_g = { NULL, /* get_handle */ H5FD__hermes_read, /* read */ H5FD__hermes_write, /* write */ + NULL, /* read_vector */ + NULL, /* write_vector */ + NULL, /* read_selection */ + NULL, /* write_selection */ NULL, /* flush */ NULL, /* truncate */ NULL, /* lock */ From df0299495f570483ea731addae85afd11bc2e72a Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 15 May 2023 10:27:30 -0500 Subject: [PATCH 03/44] Use dev branch --- ci/hermes/packages/hermes/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/hermes/packages/hermes/package.py b/ci/hermes/packages/hermes/package.py index b5b62cb01..0df86ac43 100644 --- a/ci/hermes/packages/hermes/package.py +++ b/ci/hermes/packages/hermes/package.py @@ -6,7 +6,7 @@ class Hermes(CMakePackage): git = "https://github.com/HDFGroup/hermes.git" version('master', branch='master') version('1.0.0', git='https://github.com/lukemartinlogan/hermes.git', branch='new-borg') - version('dev-priv', git='https://github.com/lukemartinlogan/hermes.git', branch='new-borg') + 
version('dev-priv', git='https://github.com/lukemartinlogan/hermes.git', branch='dev') version('pnnl', git='https://github.com/lukemartinlogan/hermes.git', branch='pnnl') variant('vfd', default=False, description='Enable HDF5 VFD') From 6f85f14ddc69ecbaa2913197a73d41d268b1ff37 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Thu, 18 May 2023 16:41:40 -0500 Subject: [PATCH 04/44] Allow empty files to be created in scratch mode for MPI adapter --- adapter/mpiio/mpiio_io_client.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/adapter/mpiio/mpiio_io_client.cc b/adapter/mpiio/mpiio_io_client.cc index 8e429d6be..a26b68cdc 100644 --- a/adapter/mpiio/mpiio_io_client.cc +++ b/adapter/mpiio/mpiio_io_client.cc @@ -26,7 +26,15 @@ void MpiioIoClient::RealOpen(File &f, stat.hflags_.SetBits(HERMES_FS_APPEND); } - if (stat.hflags_.Any(HERMES_FS_CREATE)) { + // NOTE(llogan): Allowing scratch mode to create empty files for MPI to + // satisfy IOR. + f.mpi_status_ = real_api->MPI_File_open( + stat.comm_, path.c_str(), stat.amode_, stat.info_, &stat.mpi_fh_); + if (f.mpi_status_ != MPI_SUCCESS) { + f.status_ = false; + } + + /*if (stat.hflags_.Any(HERMES_FS_CREATE)) { if (stat.adapter_mode_ != AdapterMode::kScratch) { f.mpi_status_ = real_api->MPI_File_open( stat.comm_, path.c_str(), stat.amode_, stat.info_, &stat.mpi_fh_); @@ -42,7 +50,7 @@ void MpiioIoClient::RealOpen(File &f, if (f.mpi_status_ != MPI_SUCCESS && stat.adapter_mode_ != AdapterMode::kScratch) { f.status_ = false; - } + }*/ } /** From c6d80935097783373b532463d8855524a763bb34 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Thu, 18 May 2023 17:19:32 -0500 Subject: [PATCH 05/44] Support 10M metadata objects --- src/metadata_manager.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index 4ef90de3f..c7d4e72b3 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -34,10 +34,10 @@ void 
MetadataManager::shm_init(hipc::ShmArchive &header, header_->node_id_ = rpc_->node_id_; // Create the metadata maps - HSHM_MAKE_AR(header_->blob_id_map_, alloc, 128000) - HSHM_MAKE_AR(header_->blob_map_, alloc, 128000) - HSHM_MAKE_AR(header_->tag_id_map_, alloc, 128000) - HSHM_MAKE_AR(header_->tag_map_, alloc, 128000) + HSHM_MAKE_AR(header_->blob_id_map_, alloc, 10000000) + HSHM_MAKE_AR(header_->blob_map_, alloc, 10000000) + HSHM_MAKE_AR(header_->tag_id_map_, alloc, 10000000) + HSHM_MAKE_AR(header_->tag_map_, alloc, 10000000) HSHM_MAKE_AR(header_->trait_id_map_, alloc, 256) HSHM_MAKE_AR(header_->trait_map_, alloc, 256) From 76c83b7bca0e3e2160331cb44da31ed860931b8c Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Thu, 18 May 2023 17:20:11 -0500 Subject: [PATCH 06/44] Support 32M metadata objects --- src/metadata_manager.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index c7d4e72b3..001a07997 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -34,10 +34,10 @@ void MetadataManager::shm_init(hipc::ShmArchive &header, header_->node_id_ = rpc_->node_id_; // Create the metadata maps - HSHM_MAKE_AR(header_->blob_id_map_, alloc, 10000000) - HSHM_MAKE_AR(header_->blob_map_, alloc, 10000000) - HSHM_MAKE_AR(header_->tag_id_map_, alloc, 10000000) - HSHM_MAKE_AR(header_->tag_map_, alloc, 10000000) + HSHM_MAKE_AR(header_->blob_id_map_, alloc, 32000000) + HSHM_MAKE_AR(header_->blob_map_, alloc, 32000000) + HSHM_MAKE_AR(header_->tag_id_map_, alloc, 32000000) + HSHM_MAKE_AR(header_->tag_map_, alloc, 32000000) HSHM_MAKE_AR(header_->trait_id_map_, alloc, 256) HSHM_MAKE_AR(header_->trait_map_, alloc, 256) From 1613cf0086710c3f62ba4361179e8e3b50699bdf Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 22 May 2023 20:52:04 -0500 Subject: [PATCH 07/44] Begin updating prefetcher log --- CMakeLists.txt | 6 + ci/hermes/packages/hermes/package.py | 4 +- src/CMakeLists.txt | 2 +- 
src/binlog.h | 210 +++++++++++++++++++++++++++ src/hermes_types.h | 106 +++++++++++++- src/metadata_manager.cc | 12 +- src/metadata_manager.h | 5 +- src/prefetcher.cc | 82 +---------- src/rpc.cc | 3 +- src/utils.cc | 30 ---- src/utils.h | 2 - test/CMakeLists.txt | 3 +- test/test_binlog.cc | 87 +++++++++++ test/tests.py | 4 + 14 files changed, 423 insertions(+), 133 deletions(-) create mode 100644 src/binlog.h delete mode 100644 src/utils.cc create mode 100644 test/test_binlog.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 78dcc1c0a..a0bf62774 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -323,6 +323,12 @@ if(yaml-cpp_FOUND) message(STATUS "found yaml-cpp at ${yaml-cpp_DIR}") endif() +#Cereal +find_package(cereal REQUIRED) +if(cereal) + message(STATUS "found cereal") +endif() + # GOTCHA if(HERMES_INTERCEPT_IO) find_package(gotcha REQUIRED) diff --git a/ci/hermes/packages/hermes/package.py b/ci/hermes/packages/hermes/package.py index 0df86ac43..20ead9f67 100644 --- a/ci/hermes/packages/hermes/package.py +++ b/ci/hermes/packages/hermes/package.py @@ -12,13 +12,13 @@ class Hermes(CMakePackage): variant('vfd', default=False, description='Enable HDF5 VFD') variant('ares', default=False, description='Enable full libfabric install') depends_on('mochi-thallium~cereal@0.8.3') + depends_on('cereal') depends_on('catch2@3.0.1') depends_on('mpich@3.3.2:') depends_on('yaml-cpp') depends_on('boost@1.7:') # TODO(): we need to figure out how to add a python library as a dependency - # depends_on('python3') - # depends_on('jarvis-util') + # depends_on('py-jarvis-util') depends_on('libfabric@1.14.1 fabrics=mlx,rxd,rxm,shm,sockets,tcp,udp,verbs,xpmem', when='+ares') depends_on('hdf5@1.13.0:', when='+vfd') diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45a8959ae..9ca5a82e9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,7 +23,6 @@ set(HERMES_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/config_client.cc ${CMAKE_CURRENT_SOURCE_DIR}/config_server.cc 
${CMAKE_CURRENT_SOURCE_DIR}/hermes_types.cc - ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/rpc.cc ${CMAKE_CURRENT_SOURCE_DIR}/rpc_thallium.cc ${CMAKE_CURRENT_SOURCE_DIR}/rpc_thallium_defs.cc @@ -67,6 +66,7 @@ target_link_libraries(hermes PUBLIC ${CMAKE_HERMES_COMMUNICATION_TYPE_LIB} PUBLIC ${CMAKE_HERMES_RPC_TYPE_LIB} PUBLIC yaml-cpp + PUBLIC cereal::cereal PUBLIC "$<$:${GOTCHA_MODULE_LIBS}>" ) diff --git a/src/binlog.h b/src/binlog.h new file mode 100644 index 000000000..66062ce77 --- /dev/null +++ b/src/binlog.h @@ -0,0 +1,210 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef HERMES_SRC_BINLOG_H_ +#define HERMES_SRC_BINLOG_H_ + +#include +#include "data_structures.h" +#include +#include +#include +#include +#include + +namespace hermes { + +template +struct BinaryLogRank { + std::vector cache_; /**< Cached log entries */ + size_t off_; /**< Prefetcher's offset in the cache */ + size_t num_cached_; /**< The number of entries cached int the log */ + + /** Constructor */ + BinaryLogRank() : off_(0), num_cached_(0) {} + + /** Number of elements in the cache */ + size_t size() const { + return cache_.size(); + } + + /** Number of touched elements / index of first untouched element */ + size_t touched() const { + return off_; + } + + /** Number of untouched elements */ + size_t untouched() const { + return size() - off_; + } + + /** Number of uncached elements */ + size_t uncached() { + return size() - num_cached_; + } + + /** Increment the number of cached elements */ + void increment_cached() { + num_cached_ += 1; + } + + /** Get the next untouched cached entry */ + bool next(T &next) { + if (off_ >= cache_.size()) { return false; } + next = cache_[off_]; + off_ += 1; + return true; + } + + /** Reserve more space */ + void reserve(size_t size) { + cache_.reserve(size); + } + + /** Emplace an entry to the back of the cache log */ + void emplace_back(const T &entry) { + cache_.emplace_back(entry); + } + + /** Remove touched elements from the cache log */ + size_t clear_touched() { + size_t num_touched = touched(); + cache_.erase(cache_.begin(), cache_.begin() + num_touched); + if (touched() <= num_cached_) { + num_cached_ -= num_touched; + } else { + num_cached_ = 0; + } + off_ = 0; + return num_touched; + } +}; + +/** + * A simple file-per-process log format for storing + * execution traces. + * + * This assumes only a single thread modifies or reads + * from the log. This is intded to be used internally + * by the prefetcher. 
+ * */ +template +class BinaryLog { + public: + std::vector> cache_; /**< The cached log entries */ + size_t max_ingest_; /**< Max number of elements to cache before flush */ + size_t cur_entry_count_; /**< Total number of cached entries */ + std::string path_; /**< Path to the backing log file */ + + public: + /** Constructor. */ + BinaryLog(const std::string &path, + size_t max_ingest_bytes) : + max_ingest_(max_ingest_bytes / sizeof(T)), + cur_entry_count_(0), + path_(path) { + // Create + truncate the file + // This is ok because the Hermes daemons are assumed to be spawned before + // applications start running. + std::ofstream output_file(path_); + } + + /** + * Appends all entries in the queue to the cache. + * */ + void Ingest(const hipc::mpsc_queue &queue) { + T entry; + while(!queue.pop(entry).IsNull()) { + AppendEntry(entry); + } + } + + /** + * Appends all entries in the vector to the cache. + * */ + void Ingest(const std::vector &queue) { + for(auto &entry : queue) { + AppendEntry(entry); + } + } + + /** + * Get the next entry corresponding to the rank + * */ + bool GetNextEntry(int rank, T &entry) { + while (cache_[rank].untouched() == 0 && Load(max_ingest_)) {} + return cache_[rank].next(entry); + } + + /** + * Flush all entries to the backing log + * */ + void Flush(bool force = false) { + if (!force && cur_entry_count_ < max_ingest_) { + return; + } + + // Serialize all contents into the log file + if (path_.size()) { + std::ofstream output_file(path_, std::ios::out | std::ios::app); + cereal::BinaryOutputArchive oarch(output_file); + for (auto &rank_cache : cache_) { + for (size_t i = rank_cache.uncached(); i < rank_cache.size(); ++i) { + auto &entry = rank_cache.cache_[i]; + oarch(entry); + rank_cache.increment_cached(); + } + } + } + + // Remove all touched entries from the cache + for (auto &rank_cache : cache_) { + cur_entry_count_ -= rank_cache.clear_touched(); + } + } + + private: + /** Appends an entry to the cache */ + void 
AppendEntry(const T &entry) { + if (entry.rank_ >= (int)cache_.size()) { + cache_.resize(entry.rank_ + 1); + } + if (cache_[entry.rank_].size() == 0) { + cache_[entry.rank_].reserve(8192); + } + cache_[entry.rank_].emplace_back(entry); + cur_entry_count_ += 1; + } + + /** + * Load data from the log into memory + * + * @return true when there is still data to load from the file, false + * otherwise + * */ + bool Load(size_t num_entries) { + std::vector buffer; + buffer.reserve(num_entries); + std::ifstream input_file(path_, std::ios::in); + cereal::BinaryInputArchive iarch(input_file); + while(!input_file.eof()) { + buffer.emplace_back(); + iarch(buffer.back()); + } + Ingest(buffer); + return !input_file.eof(); + } +}; + +} // namespace hermes + +#endif // HERMES_SRC_BINLOG_H_ diff --git a/src/hermes_types.h b/src/hermes_types.h index 3a849545a..3757d26e9 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -117,26 +117,75 @@ struct UniqueId { u64 unique_; /**< A unique id for the blob */ i32 node_id_; /**< The node the content is on */ - bool IsNull() const { return unique_ == 0; } - + /** Default constructor */ UniqueId() = default; + /** Emplace constructor */ UniqueId(u64 unique, i32 node_id) : unique_(unique), node_id_(node_id) {} + /** Copy constructor */ + UniqueId(const UniqueId &other) { + unique_ = other.unique_; + node_id_ = other.node_id_; + } + + /** Copy assignment */ + UniqueId& operator=(const UniqueId &other) { + if (this != &other) { + unique_ = other.unique_; + node_id_ = other.node_id_; + } + return *this; + } + + /** Move constructor */ + UniqueId(UniqueId &&other) { + unique_ = other.unique_; + node_id_ = other.node_id_; + } + + /** Move assignment */ + UniqueId& operator=(UniqueId &&other) { + if (this != &other) { + unique_ = other.unique_; + node_id_ = other.node_id_; + } + return *this; + } + + /** Check if null */ + bool IsNull() const { return unique_ == 0; } + + /** Get null id */ static inline UniqueId GetNull() { static const 
UniqueId id(0, 0); return id; } + /** Set to null id */ + void SetNull() { + node_id_ = 0; + unique_ = 0; + } + + /** Get id of node from this id */ i32 GetNodeId() const { return node_id_; } + /** Compare two ids for equality */ bool operator==(const UniqueId &other) const { return unique_ == other.unique_ && node_id_ == other.node_id_; } + /** Compare two ids for inequality */ bool operator!=(const UniqueId &other) const { return unique_ != other.unique_ || node_id_ != other.node_id_; } + + /** Serialize a UniqueId */ + template + void serialize(Archive &ar) { + ar(unique_, node_id_); + } }; typedef UniqueId<1> BlobId; typedef UniqueId<2> TagId; @@ -156,6 +205,59 @@ struct IoStat { TagId tag_id_; size_t blob_size_; int rank_; + + /** Default constructor */ + IoStat() = default; + + /** Copy constructor */ + IoStat(const IoStat &other) { + Copy(other); + } + + /** Copy assignment */ + IoStat& operator=(const IoStat &other) { + if (this != &other) { + Copy(other); + } + return *this; + } + + /** Move constructor */ + IoStat(IoStat &&other) { + Copy(other); + } + + /** Move assignment */ + IoStat& operator=(IoStat &&other) { + if (this != &other) { + Copy(other); + } + return *this; + } + + /** Generic copy / move */ + HSHM_ALWAYS_INLINE void Copy(const IoStat &other) { + type_ = other.type_; + blob_id_ = other.blob_id_; + tag_id_ = other.tag_id_; + blob_size_ = other.blob_size_; + rank_ = other.rank_; + } + + /** Serialize */ + template + void save(Archive &ar) const { + int type = static_cast(type_); + ar(type, blob_id_, tag_id_, blob_size_, rank_); + } + + /** Deserialize */ + template + void load(Archive &ar) { + int type; + ar(type, blob_id_, tag_id_, blob_size_, rank_); + type_ = static_cast(type); + } }; /** Used as hints to the prefetcher */ diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index 001a07997..42d132ddf 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -68,7 +68,7 @@ void 
MetadataManager::shm_init(hipc::ShmArchive &header, } // Create the log used to track I/O pattern - HSHM_MAKE_AR0(header_->io_pattern_log_, alloc); + HSHM_MAKE_AR(header_->io_pattern_log_, alloc, 8192); } /**==================================== @@ -945,16 +945,8 @@ void MetadataManager::AddIoStat(TagId tag_id, /*if (is_mpi_) { MPI_Comm_rank(MPI_COMM_WORLD, &stat.rank_); }*/ - io_pattern_log_->emplace_back(stat); + io_pattern_log_->emplace(stat); } -/** Add an I/O statistic to the internal log */ -void MetadataManager::ClearIoStats(size_t count) { - ScopedRwWriteLock io_pattern_lock(header_->lock_[kIoPatternLogLock], - kMDM_ClearIoStats); - auto first = io_pattern_log_->begin(); - auto end = io_pattern_log_->begin() + count; - io_pattern_log_->erase(first, end); -} } // namespace hermes diff --git a/src/metadata_manager.h b/src/metadata_manager.h index 1eca6c856..b158da807 100644 --- a/src/metadata_manager.h +++ b/src/metadata_manager.h @@ -36,7 +36,7 @@ typedef hipc::unordered_map TRAIT_ID_MAP_T; typedef hipc::unordered_map BLOB_MAP_T; typedef hipc::unordered_map TAG_MAP_T; typedef hipc::unordered_map TRAIT_MAP_T; -typedef hipc::slist IO_PATTERN_LOG_T; +typedef hipc::mpsc_queue IO_PATTERN_LOG_T; enum MdmLock { kBlobMapLock, @@ -461,9 +461,6 @@ class MetadataManager { /** Add an I/O statistic to the internal log */ void AddIoStat(TagId tag_id, BlobId blob_id, size_t blob_size, IoType type); - /** Add an I/O statistic to the internal log */ - void ClearIoStats(size_t count); - /**==================================== * Private Operations * ===================================*/ diff --git a/src/prefetcher.cc b/src/prefetcher.cc index 72edf1329..aeff0f05d 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -39,42 +39,6 @@ void Prefetcher::Init() { // Set the epoch epoch_ms_ = (double)conf.prefetcher_.epoch_ms_; - // Parse the I/O trace YAML log - try { - if (conf.prefetcher_.trace_path_.size() == 0) { - return; - } - YAML::Node io_trace = 
YAML::LoadFile(conf.prefetcher_.trace_path_); - HILOG(kDebug, "Parsing the I/O trace at: {}", - conf.prefetcher_.trace_path_) - int nprocs = 1; - // TODO(llogan): make MPI-awareness configurable - // MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - trace_.resize(nprocs); - for (YAML::Node log_entry : io_trace) { - IoTrace trace; - trace.node_id_ = log_entry[0].as(); - if (trace.node_id_ != rpc_->node_id_) { - continue; - } - trace.type_ = static_cast(log_entry[1].as()); - trace.blob_name_ = log_entry[2].as(); - trace.tag_name_ = log_entry[3].as(); - trace.blob_size_ = log_entry[4].as(); - trace.organize_next_n_ = log_entry[5].as(); - trace.score_ = log_entry[6].as(); - trace.rank_ = log_entry[7].as(); - trace_[trace.rank_].emplace_back(trace); - } - - trace_off_.resize(nprocs); - for (int i = 0; i < nprocs; ++i) { - trace_off_[i] = trace_[i].begin(); - } - } catch (std::exception &e) { - HELOG(kFatal, e.what()) - } - // Spawn the prefetcher thread auto prefetcher = [](void *args) { HILOG(kDebug, "Prefetcher has started") @@ -96,51 +60,9 @@ void Prefetcher::Finalize() { /** Parse the MDM's I/O pattern log */ void Prefetcher::Run() { - size_t log_size = mdm_->io_pattern_log_->size(); - // auto trace_iter = trace_.begin(); - auto client_iter = mdm_->io_pattern_log_->begin(); - if (log_size == 0) { - return; - } - - // Group I/O pattern log by rank - int nprocs = 1; - // TODO(llogan): make MPI-awareness configurable - // MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - std::vector> patterns; - patterns.resize(nprocs); - for (size_t i = 0; i < log_size; ++i) { - IoStat &stat = (*client_iter); - int rank = stat.rank_; - patterns[rank].emplace_back(stat); - ++client_iter; - } - - // Analyze the per-rank prefetching decisions - for (int i = 0; i < nprocs; ++i) { - for (IoStat &stat : patterns[i]) { - (void) stat; - // We assume rank I/O is exactly the same as it was in the trace - IoTrace &trace = *trace_off_[i]; - if (trace.organize_next_n_ == 0) { - ++trace_off_[i]; - continue; - } - 
- for (int j = 0; j < trace.organize_next_n_; ++j) { - ++trace_off_[i]; - trace = *trace_off_[i]; - /*borg_->GlobalOrganizeBlob(trace.tag_name_, - trace.blob_name_, - trace.score_);*/ - } - ++trace_off_[i]; - break; - } - } + // Ingest the current I/O statistics - // Clear the log - mdm_->ClearIoStats(log_size); + // Get the set of buckets } } // namespace hermes diff --git a/src/rpc.cc b/src/rpc.cc index 14014aea7..7403a9a14 100644 --- a/src/rpc.cc +++ b/src/rpc.cc @@ -201,7 +201,8 @@ std::string RpcContext::_GetIpAddress(const std::string &host_name) { const char *inet_result = inet_ntop(AF_INET, addr_list[0], ip_address, INET_ADDRSTRLEN); if (!inet_result) { - FailedLibraryCall("inet_ntop"); + perror("inet_ntop"); + HELOG(kFatal, "inet_ntop failed"); } return ip_address; } diff --git a/src/utils.cc b/src/utils.cc deleted file mode 100644 index 26829e29f..000000000 --- a/src/utils.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Distributed under BSD 3-Clause license. * - * Copyright by The HDF Group. * - * Copyright by the Illinois Institute of Technology. * - * All rights reserved. * - * * - * This file is part of Hermes. The full Hermes copyright notice, including * - * terms governing use, modification, and redistribution, is contained in * - * the COPYING file, which can be found at the top directory. If you do not * - * have access to the file, you may request a copy from help@hdfgroup.org. 
* - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -#include "utils.h" - -#include -#include -#include -#include - -namespace hermes { - -/** - print an error message for \a func function that failed - */ -void FailedLibraryCall(std::string func) { - int saved_errno = errno; - HELOG(kFatal, strerror(saved_errno)); -} - -} // namespace hermes diff --git a/src/utils.h b/src/utils.h index ff04ad5ab..bbb416b3c 100644 --- a/src/utils.h +++ b/src/utils.h @@ -28,8 +28,6 @@ static inline std::string GetEnvSafe(const char *env_name) { return val; } -void FailedLibraryCall(std::string func); - } // namespace hermes #endif // HERMES_UTILS_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bb1c85d02..507adf32c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,8 @@ set(API_TESTS test_bucket test_buffer_pool test_trait - test_tag) + test_tag + test_binlog) find_program(BASH_PROGRAM bash) diff --git a/test/test_binlog.cc b/test/test_binlog.cc new file mode 100644 index 000000000..2e7e05a27 --- /dev/null +++ b/test/test_binlog.cc @@ -0,0 +1,87 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include + +#include "binlog.h" +#include "hermes_types.h" +#include "basic_test.h" +#include + +namespace hapi = hermes::api; +namespace stdfs = std::filesystem; + +void MainPretest() { +} + +void MainPosttest() { +} + +std::vector create_stats( + size_t bytes, int num_ranks, size_t &entries_per_rank) { + std::vector stats; + size_t max_entries = bytes / sizeof(hermes::IoStat); + entries_per_rank = max_entries / num_ranks; + stats.reserve(num_ranks * entries_per_rank); + for (int rank = 0; rank < num_ranks; ++rank) { + for (size_t i = 0; i < entries_per_rank; ++i) { + stats.emplace_back(); + auto &stat = stats.back(); + stat.rank_ = rank; + stat.type_ = (i % 2) ? hermes::IoType::kRead : hermes::IoType::kWrite; + stat.blob_id_ = hermes::BlobId(i, rank); + stat.blob_size_ = 8 * (i + 1); + } + } + return stats; +} + +void verify_log(hermes::BinaryLog log, + int num_ranks, size_t entries_per_rank) { + for (int rank = 0; rank < num_ranks; ++rank) { + hermes::IoStat stat; + size_t i = 0; + while (log.GetNextEntry(rank, stat)) { + REQUIRE(stat.blob_id_.node_id_ == rank); + REQUIRE(stat.blob_id_.unique_ == i); + i += 1; + } + REQUIRE(i == entries_per_rank); + } +} + +TEST_CASE("TestBinlog") { + int num_ranks = 16; + size_t log_bytes = MEGABYTES(1); + size_t chunk_bytes = log_bytes / 4; + size_t entries_per_rank; + std::string path = "/tmp/log.bin"; + + // Create chunk + std::vector stats = create_stats( + chunk_bytes, num_ranks, entries_per_rank); + + // Attempt flushing the log + hermes::BinaryLog log(path, log_bytes); + log.Ingest(stats); + verify_log(log, num_ranks, entries_per_rank); + log.Flush(); + REQUIRE(stdfs::file_size(path) == 0); + + // Actually flush the log when capacity reached + log.Ingest(stats); + log.Ingest(stats); + log.Ingest(stats); + log.Flush(); + REQUIRE(stdfs::file_size(path) > 0); +} diff --git a/test/tests.py b/test/tests.py index 
ba841ac7a..0ac387de5 100644 --- a/test/tests.py +++ b/test/tests.py @@ -12,6 +12,7 @@ def set_paths(self): self.TEST_BUFFER_POOL_CMD = f"{self.CMAKE_BINARY_DIR}/bin/test_buffer_pool" self.TEST_TRAIT_CMD = f"{self.CMAKE_BINARY_DIR}/bin/test_trait" self.TEST_TAG_CMD = f"{self.CMAKE_BINARY_DIR}/bin/test_tag" + self.TEST_BINLOG_CMD = f"{self.CMAKE_BINARY_DIR}/bin/test_binlog" self.TEST_MULTINODE_PUT_GET_CMD = f"{self.CMAKE_BINARY_DIR}/bin/test_multinode_put_get" def test_bucket(self): @@ -46,6 +47,9 @@ def test_tag(self): self.stop_daemon(spawn_info) return node.exit_code + def test_binlog(self): + node = Exec(self.TEST_BINLOG_CMD) + def test_multinode_put_get(self): spawn_info = self.spawn_info(nprocs=2, ppn=1, From 7bd1a8e4385ef1c943501dec4b395e4137d4969e Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 22 May 2023 20:53:08 -0500 Subject: [PATCH 08/44] Remove 1.0.0 --- ci/hermes/packages/hermes/package.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/hermes/packages/hermes/package.py b/ci/hermes/packages/hermes/package.py index b5b62cb01..0acf1d587 100644 --- a/ci/hermes/packages/hermes/package.py +++ b/ci/hermes/packages/hermes/package.py @@ -5,10 +5,9 @@ class Hermes(CMakePackage): url = "https://github.com/HDFGroup/hermes/tarball/master" git = "https://github.com/HDFGroup/hermes.git" version('master', branch='master') - version('1.0.0', git='https://github.com/lukemartinlogan/hermes.git', branch='new-borg') + version('pnnl', branch='pnnl') version('dev-priv', git='https://github.com/lukemartinlogan/hermes.git', branch='new-borg') - version('pnnl', git='https://github.com/lukemartinlogan/hermes.git', - branch='pnnl') + variant('vfd', default=False, description='Enable HDF5 VFD') variant('ares', default=False, description='Enable full libfabric install') depends_on('mochi-thallium~cereal@0.8.3') From 48d87f03e483b995b7665ca2f3c2f22262fb456e Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Sun, 28 May 2023 17:37:22 -0500 
Subject: [PATCH 09/44] Use hdf5@1.14.0 --- ci/hermes/packages/hermes/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/hermes/packages/hermes/package.py b/ci/hermes/packages/hermes/package.py index eb883ab5c..a7e636b32 100644 --- a/ci/hermes/packages/hermes/package.py +++ b/ci/hermes/packages/hermes/package.py @@ -19,7 +19,7 @@ class Hermes(CMakePackage): # depends_on('py-jarvis-util') depends_on('libfabric@1.14.1 fabrics=mlx,rxd,rxm,shm,sockets,tcp,udp,verbs,xpmem', when='+ares') - depends_on('hdf5@1.13.0:', when='+vfd') + depends_on('hdf5@1.14.0:', when='+vfd') def cmake_args(self): args = ['-DCMAKE_INSTALL_PREFIX={}'.format(self.prefix), From 163b5298913f85b36074f048f171124c4852d404 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 00:33:01 -0500 Subject: [PATCH 10/44] Fix race condition with bucket size and backend size in fs adapter --- CMakeLists.txt | 2 +- adapter/filesystem/filesystem.cc | 14 +++++++------- adapter/filesystem/filesystem_io_client.h | 2 ++ ci/hermes/packages/hermes/package.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c095e55f1..5a282c9cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(HERMES_LOG_VERBOSITY=10) message("This is NOT a release build: ${CMAKE_BUILD_TYPE} ${CMAKE_CXX_FLAGS}") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fPIC") # Ensure that debug logging is enabled # This will keep logs which have beneath priority 10 add_compile_definitions(HERMES_LOG_VERBOSITY=10) diff --git a/adapter/filesystem/filesystem.cc b/adapter/filesystem/filesystem.cc index 09f472fcf..a62d7f04c 100644 --- a/adapter/filesystem/filesystem.cc +++ b/adapter/filesystem/filesystem.cc @@ -62,8 +62,8 @@ void Filesystem::Open(AdapterStat &stat, File &f, const 
std::string &path) { stat.bkt_id_.Clear(); } else { // The file was opened regularly - size_t file_size = io_client_->GetSize(*path_shm); - stat.bkt_id_ = HERMES->GetBucket(stat.path_, ctx, file_size); + stat.file_size_ = io_client_->GetSize(*path_shm); + stat.bkt_id_ = HERMES->GetBucket(stat.path_, ctx, stat.file_size_); } HILOG(kDebug, "File has size: {}", stat.bkt_id_.GetSize()); // Attach trait to bucket (if not scratch mode) @@ -154,10 +154,11 @@ Status Filesystem::PartialPutOrCreate(hapi::Bucket &bkt, io_client_->ReadBlob(bkt.GetName(), full_blob, opts, status); if (!status.success_) { - HILOG(kDebug, "Failed to read blob from backend (PartialPut)." + HELOG(kFatal, "Failed to read blob from {} (PartialPut)." " cur_size: {}" + " backend_off: {}" " backend_size: {}", - full_blob.size(), opts.backend_size_) + bkt.GetName(), full_blob.size(), opts.backend_off_, opts.backend_size_) // return PARTIAL_PUT_OR_CREATE_OVERFLOW; } } @@ -233,7 +234,7 @@ size_t Filesystem::Write(File &f, AdapterStat &stat, const void *ptr, BlobId blob_id; opts.backend_off_ = p.page_ * kPageSize; opts.backend_size_ = GetBackendSize(opts.backend_off_, - backend_size, + stat.file_size_, kPageSize); opts.adapter_mode_ = stat.adapter_mode_; bkt.TryCreateBlob(blob_name.str(), blob_id, ctx); @@ -377,7 +378,6 @@ size_t Filesystem::Read(File &f, AdapterStat &stat, void *ptr, size_t data_offset = 0; auto mapper = MapperFactory().Get(MapperType::kBalancedMapper); mapper->map(off, total_size, kPageSize, mapping); - size_t backend_size = stat.bkt_id_.GetSize(); for (const auto &p : mapping) { Blob blob_wrap((const char*)ptr + data_offset, p.blob_size_); @@ -385,7 +385,7 @@ size_t Filesystem::Read(File &f, AdapterStat &stat, void *ptr, BlobId blob_id; opts.backend_off_ = p.page_ * kPageSize; opts.backend_size_ = GetBackendSize(opts.backend_off_, - backend_size, + stat.file_size_, kPageSize); opts.adapter_mode_ = stat.adapter_mode_; bkt.TryCreateBlob(blob_name.str(), blob_id, ctx); diff --git 
a/adapter/filesystem/filesystem_io_client.h b/adapter/filesystem/filesystem_io_client.h index 875a9034c..f5cc32720 100644 --- a/adapter/filesystem/filesystem_io_client.h +++ b/adapter/filesystem/filesystem_io_client.h @@ -192,6 +192,7 @@ struct AdapterStat { uid_t st_uid_; /**< user ID of owner */ gid_t st_gid_; /**< group ID of owner */ size_t st_ptr_; /**< current ptr of FILE */ + size_t file_size_; /**< Size of file at backend at time of open */ timespec st_atim_; /**< time of last access */ timespec st_mtim_; /**< time of last modification */ timespec st_ctim_; /**< time of last status change */ @@ -217,6 +218,7 @@ struct AdapterStat { hflags_(), st_mode_(), st_ptr_(0), + file_size_(0), st_atim_(), st_mtim_(), st_ctim_(), diff --git a/ci/hermes/packages/hermes/package.py b/ci/hermes/packages/hermes/package.py index a7e636b32..be6acd414 100644 --- a/ci/hermes/packages/hermes/package.py +++ b/ci/hermes/packages/hermes/package.py @@ -6,7 +6,7 @@ class Hermes(CMakePackage): git = "https://github.com/HDFGroup/hermes.git" version('master', branch='master') version('pnnl', branch='pnnl') - version('dev-priv', git='https://github.com/lukemartinlogan/hermes.git', branch='new-borg') + version('dev-priv', git='https://github.com/lukemartinlogan/hermes.git', branch='dev') variant('vfd', default=False, description='Enable HDF5 VFD') variant('ares', default=False, description='Enable full libfabric install') depends_on('mochi-thallium~cereal@0.8.3') From 47453a4d3262c24c40ce63040e444e673bfe01c8 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 13:08:42 -0500 Subject: [PATCH 11/44] Fix small I/O issue --- adapter/filesystem/filesystem.cc | 13 +- adapter/filesystem/filesystem.h | 4 + hermes_shm/benchmark/allocator/allocator.cc | 6 +- .../data_structures/data_structure.h | 1 + .../ipc/internal/shm_container.h | 6 - .../serialization/shm_serialize.h | 99 +++++++++++++++ .../smart_ptr/smart_ptr_base.h | 8 ++ .../memory/allocator/allocator_factory.h | 17 --- 
.../memory/allocator/fixed_page_allocator.h | 113 ----------------- .../hermes_shm/memory/allocator/mp_page.h | 1 + .../allocator/scalable_page_allocator.h | 12 +- hermes_shm/scripts/ci/install_deps.sh | 3 +- hermes_shm/scripts/ci/install_docs.sh | 2 +- hermes_shm/scripts/ci/install_hshm.sh | 2 +- hermes_shm/src/CMakeLists.txt | 1 - hermes_shm/src/memory/fixed_page_allocator.cc | 120 ------------------ .../src/memory/scalable_page_allocator.cc | 20 +-- hermes_shm/test/unit/allocators/allocator.cc | 13 -- .../data_structures/serialize/CMakeLists.txt | 1 + .../serialize/shm/CMakeLists.txt | 33 +++++ .../serialize/shm/test_init.cc | 30 +++++ .../data_structures/serialize/shm/test_init.h | 35 +++++ .../data_structures/serialize/shm/test_shm.cc | 69 ++++++++++ src/binlog.h | 6 +- src/metadata_manager.cc | 4 +- src/rpc_thallium.h | 6 +- 26 files changed, 320 insertions(+), 305 deletions(-) create mode 100644 hermes_shm/include/hermes_shm/data_structures/serialization/shm_serialize.h delete mode 100644 hermes_shm/include/hermes_shm/memory/allocator/fixed_page_allocator.h delete mode 100644 hermes_shm/src/memory/fixed_page_allocator.cc create mode 100644 hermes_shm/test/unit/data_structures/serialize/shm/CMakeLists.txt create mode 100644 hermes_shm/test/unit/data_structures/serialize/shm/test_init.cc create mode 100644 hermes_shm/test/unit/data_structures/serialize/shm/test_init.h create mode 100644 hermes_shm/test/unit/data_structures/serialize/shm/test_shm.cc diff --git a/adapter/filesystem/filesystem.cc b/adapter/filesystem/filesystem.cc index a62d7f04c..24fb41c58 100644 --- a/adapter/filesystem/filesystem.cc +++ b/adapter/filesystem/filesystem.cc @@ -116,6 +116,7 @@ static inline size_t GetBackendSize(size_t file_off, * @param blob_name the semantic name of the blob * @param blob the buffer to put final data in * @param blob_off the offset within the blob to begin the Put + * @param page_size the page size of the adapter * @param io_ctx which adapter to route I/O 
request if blob DNE * @param opts which adapter to route I/O request if blob DNE * @param ctx any additional information @@ -124,11 +125,12 @@ Status Filesystem::PartialPutOrCreate(hapi::Bucket &bkt, const std::string &blob_name, const Blob &blob, size_t blob_off, + size_t page_size, BlobId &blob_id, IoStatus &status, const FsIoOptions &opts, Context &ctx) { - Blob full_blob; + Blob full_blob(page_size); if (bkt.ContainsBlob(blob_name, blob_id)) { // Case 1: The blob already exists (read from hermes) // Read blob from Hermes @@ -158,7 +160,8 @@ Status Filesystem::PartialPutOrCreate(hapi::Bucket &bkt, " cur_size: {}" " backend_off: {}" " backend_size: {}", - bkt.GetName(), full_blob.size(), opts.backend_off_, opts.backend_size_) + bkt.GetName(), full_blob.size(), + opts.backend_off_, opts.backend_size_) // return PARTIAL_PUT_OR_CREATE_OVERFLOW; } } @@ -243,6 +246,7 @@ size_t Filesystem::Write(File &f, AdapterStat &stat, const void *ptr, blob_name.str(), blob_wrap, p.blob_off_, + kPageSize, blob_id, io_status, opts, @@ -274,6 +278,7 @@ size_t Filesystem::Write(File &f, AdapterStat &stat, const void *ptr, * @param blob the buffer to put final data in * @param blob_off the offset within the blob to begin the Put * @param blob_size the total amount of data to read + * @param page_size the page size of the adapter * @param blob_id [out] the blob id corresponding to blob_name * @param io_ctx information required to perform I/O to the backend * @param opts specific configuration of the I/O to perform @@ -284,11 +289,12 @@ Status Filesystem::PartialGetOrCreate(hapi::Bucket &bkt, Blob &blob, size_t blob_off, size_t blob_size, + size_t page_size, BlobId &blob_id, IoStatus &status, const FsIoOptions &opts, Context &ctx) { - Blob full_blob; + Blob full_blob(page_size); if (bkt.ContainsBlob(blob_name, blob_id)) { // Case 1: The blob already exists (read from hermes) // Read blob from Hermes @@ -395,6 +401,7 @@ size_t Filesystem::Read(File &f, AdapterStat &stat, void *ptr, 
blob_wrap, p.blob_off_, p.blob_size_, + kPageSize, blob_id, io_status, opts, diff --git a/adapter/filesystem/filesystem.h b/adapter/filesystem/filesystem.h index fe2045fe2..28062bf48 100644 --- a/adapter/filesystem/filesystem.h +++ b/adapter/filesystem/filesystem.h @@ -69,6 +69,7 @@ class Filesystem { * @param blob_name the semantic name of the blob * @param blob the buffer to put final data in * @param blob_off the offset within the blob to begin the Put + * @param page_size the page size of the adapter * @param blob_id [out] the blob id corresponding to blob_name * @param io_ctx information required to perform I/O to the backend * @param opts specific configuration of the I/O to perform @@ -78,6 +79,7 @@ class Filesystem { const std::string &blob_name, const Blob &blob, size_t blob_off, + size_t page_size, BlobId &blob_id, IoStatus &status, const FsIoOptions &opts, @@ -96,6 +98,7 @@ class Filesystem { * @param blob_name the semantic name of the blob * @param blob the buffer to put final data in * @param blob_off the offset within the blob to begin the Put + * @param page_size the page size of the adapter * @param blob_id [out] the blob id corresponding to blob_name * @param io_ctx information required to perform I/O to the backend * @param opts specific configuration of the I/O to perform @@ -106,6 +109,7 @@ class Filesystem { Blob &blob, size_t blob_off, size_t blob_size, + size_t page_size, BlobId &blob_id, IoStatus &status, const FsIoOptions &opts, diff --git a/hermes_shm/benchmark/allocator/allocator.cc b/hermes_shm/benchmark/allocator/allocator.cc index 5390cf48f..f2ee35ade 100644 --- a/hermes_shm/benchmark/allocator/allocator.cc +++ b/hermes_shm/benchmark/allocator/allocator.cc @@ -205,11 +205,7 @@ void FullAllocatorTestPerThread() { AllocatorTest( AllocatorType::kStackAllocator, MemoryBackendType::kPosixShmMmap); - // Fixed page allocator - AllocatorTest( - AllocatorType::kFixedPageAllocator, - MemoryBackendType::kPosixShmMmap); - // Fixed page allocator + 
// Scalable page allocator AllocatorTest( AllocatorType::kScalablePageAllocator, MemoryBackendType::kPosixShmMmap); diff --git a/hermes_shm/include/hermes_shm/data_structures/data_structure.h b/hermes_shm/include/hermes_shm/data_structures/data_structure.h index 2f86bdb33..8880c8ec0 100644 --- a/hermes_shm/include/hermes_shm/data_structures/data_structure.h +++ b/hermes_shm/include/hermes_shm/data_structures/data_structure.h @@ -21,6 +21,7 @@ #include "ipc/list.h" #include "ipc/vector.h" #include "ipc/unordered_map.h" +#include "hermes_shm/memory/memory_manager.h" namespace hipc = hshm::ipc; diff --git a/hermes_shm/include/hermes_shm/data_structures/ipc/internal/shm_container.h b/hermes_shm/include/hermes_shm/data_structures/ipc/internal/shm_container.h index 806303552..f561855f3 100644 --- a/hermes_shm/include/hermes_shm/data_structures/ipc/internal/shm_container.h +++ b/hermes_shm/include/hermes_shm/data_structures/ipc/internal/shm_container.h @@ -24,12 +24,6 @@ namespace hipc = hshm::ipc; namespace hshm::ipc { -/** Bits used for determining how to destroy an object */ -/// The container should free all data when destroyed -#define SHM_PRIVATE_IS_DESTRUCTABLE BIT_OPT(uint32_t, 0) -/// The container owns the header -#define SHM_PRIVATE_OWNS_HEADER BIT_OPT(uint32_t, 1) - /** The shared-memory header used for data structures */ template struct ShmHeader; diff --git a/hermes_shm/include/hermes_shm/data_structures/serialization/shm_serialize.h b/hermes_shm/include/hermes_shm/data_structures/serialization/shm_serialize.h new file mode 100644 index 000000000..fb20fcbbc --- /dev/null +++ b/hermes_shm/include/hermes_shm/data_structures/serialization/shm_serialize.h @@ -0,0 +1,99 @@ +// +// Created by llogan on 5/9/23. 
+// + +#ifndef HERMES_SHM_INCLUDE_HERMES_SHM_DATA_STRUCTURES_SERIALIZATION_SHM_SERIALIZE_H_ +#define HERMES_SHM_INCLUDE_HERMES_SHM_DATA_STRUCTURES_SERIALIZATION_SHM_SERIALIZE_H_ + +#define NOREF typename std::remove_reference::type + +namespace hshm::ipc { + +class ShmSerializer { + public: + size_t off_; + + /** Default constructor */ + ShmSerializer() : off_(0) {} + + /** Get the SHM serialized size of an argument pack */ + template + HSHM_ALWAYS_INLINE static size_t shm_buf_size(Args&& ...args) { + size_t size = 0; + auto lambda = [&size](auto i, auto &&arg) { + if constexpr(IS_SHM_ARCHIVEABLE(NOREF)) { + size += sizeof(hipc::OffsetPointer); + } else if constexpr(std::is_pod()) { + size += sizeof(arg); + } else { + throw IPC_ARGS_NOT_SHM_COMPATIBLE.format(); + } + }; + ForwardIterateArgpack::Apply(make_argpack(std::forward(args)...), lambda); + return size; + } + + /** Serialize a set of arguments into shared memory */ + template + HSHM_ALWAYS_INLINE char* serialize(Allocator *alloc, Args&& ...args) { + size_t buf_size = sizeof(allocator_id_t) + shm_buf_size(std::forward(args)...); + Pointer p; + char *buf = alloc->AllocatePtr(buf_size, p); + memcpy(buf, &p.allocator_id_, sizeof(allocator_id_t)); + off_ = sizeof(allocator_id_t); + auto lambda = [buf, this](auto i, auto &&arg) { + if constexpr(IS_SHM_ARCHIVEABLE(NOREF)) { + OffsetPointer p = arg.template GetShmPointer(); + memcpy(buf + this->off_, (void*)&p, sizeof(p)); + this->off_ += sizeof(p); + } else if constexpr(std::is_pod()) { + memcpy(buf + this->off_, &arg, sizeof(arg)); + this->off_ += sizeof(arg); + } else { + throw IPC_ARGS_NOT_SHM_COMPATIBLE.format(); + } + }; + ForwardIterateArgpack::Apply(make_argpack(std::forward(args)...), lambda); + return buf; + } + + /** Deserialize an allocator from the SHM buffer */ + HSHM_ALWAYS_INLINE Allocator* deserialize(char *buf) { + allocator_id_t alloc_id; + memcpy((void*)&alloc_id, buf + off_, sizeof(allocator_id_t)); + off_ += sizeof(allocator_id_t); + return 
HERMES_MEMORY_MANAGER->GetAllocator(alloc_id); + } + + /** Deserialize an argument from the SHM buffer */ + template + HSHM_ALWAYS_INLINE T deserialize(Allocator *alloc, char *buf) { + if constexpr(std::is_pod()) { + T arg; + memcpy(&arg, buf + off_, sizeof(arg)); + off_ += sizeof(arg); + return arg; + } else { + throw IPC_ARGS_NOT_SHM_COMPATIBLE.format(); + } + } + + /** Deserialize an argument from the SHM buffer */ + template + HSHM_ALWAYS_INLINE void deserialize(Allocator *alloc, char *buf, hipc::mptr &arg) { + if constexpr(IS_SHM_ARCHIVEABLE(T)) { + OffsetPointer p; + memcpy((void*)&p, buf + off_, sizeof(p)); + arg.shm_deserialize(alloc, p); + off_ += sizeof(p); + } else { + throw IPC_ARGS_NOT_SHM_COMPATIBLE.format(); + } + } +}; + +} // namespace hshm + +#undef NOREF + +#endif // HERMES_SHM_INCLUDE_HERMES_SHM_DATA_STRUCTURES_SERIALIZATION_SHM_SERIALIZE_H_ diff --git a/hermes_shm/include/hermes_shm/data_structures/smart_ptr/smart_ptr_base.h b/hermes_shm/include/hermes_shm/data_structures/smart_ptr/smart_ptr_base.h index cf1f1d805..c2988fe4f 100644 --- a/hermes_shm/include/hermes_shm/data_structures/smart_ptr/smart_ptr_base.h +++ b/hermes_shm/include/hermes_shm/data_structures/smart_ptr/smart_ptr_base.h @@ -198,6 +198,14 @@ class smart_ptr_base { } } + /** Deserialize from an offset pointer */ + HSHM_ALWAYS_INLINE void shm_deserialize(Allocator *alloc, const OffsetPointer &ar) { + obj_ = alloc->template Convert(ar); + if constexpr(unique) { + flags_.UnsetBits(POINTER_IS_OWNED); + } + } + /**==================================== * Serialization * ===================================*/ diff --git a/hermes_shm/include/hermes_shm/memory/allocator/allocator_factory.h b/hermes_shm/include/hermes_shm/memory/allocator/allocator_factory.h index 91725195a..33564a929 100644 --- a/hermes_shm/include/hermes_shm/memory/allocator/allocator_factory.h +++ b/hermes_shm/include/hermes_shm/memory/allocator/allocator_factory.h @@ -17,7 +17,6 @@ #include "allocator.h" #include 
"stack_allocator.h" #include "malloc_allocator.h" -#include "fixed_page_allocator.h" #include "scalable_page_allocator.h" namespace hshm::ipc { @@ -50,15 +49,6 @@ class AllocatorFactory { backend->data_size_, std::forward(args)...); return alloc; - } else if constexpr(std::is_same_v) { - // Fixed Page Allocator - auto alloc = std::make_unique(); - alloc->shm_init(alloc_id, - custom_header_size, - backend->data_, - backend->data_size_, - std::forward(args)...); - return alloc; } else if constexpr(std::is_same_v) { // Scalable Page Allocator auto alloc = std::make_unique(); @@ -94,13 +84,6 @@ class AllocatorFactory { backend->data_size_); return alloc; } - // Fixed Page Allocator - case AllocatorType::kFixedPageAllocator: { - auto alloc = std::make_unique(); - alloc->shm_deserialize(backend->data_, - backend->data_size_); - return alloc; - } // Scalable Page Allocator case AllocatorType::kScalablePageAllocator: { auto alloc = std::make_unique(); diff --git a/hermes_shm/include/hermes_shm/memory/allocator/fixed_page_allocator.h b/hermes_shm/include/hermes_shm/memory/allocator/fixed_page_allocator.h deleted file mode 100644 index 8b78945c5..000000000 --- a/hermes_shm/include/hermes_shm/memory/allocator/fixed_page_allocator.h +++ /dev/null @@ -1,113 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Distributed under BSD 3-Clause license. * - * Copyright by The HDF Group. * - * Copyright by the Illinois Institute of Technology. * - * All rights reserved. * - * * - * This file is part of Hermes. The full Hermes copyright notice, including * - * terms governing use, modification, and redistribution, is contained in * - * the COPYING file, which can be found at the top directory. If you do not * - * have access to the file, you may request a copy from help@hdfgroup.org. 
* - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - - -#ifndef HERMES_MEMORY_ALLOCATOR_FIXED_ALLOCATOR_H_ -#define HERMES_MEMORY_ALLOCATOR_FIXED_ALLOCATOR_H_ - -#include "allocator.h" -#include "hermes_shm/thread/lock.h" -#include "hermes_shm/data_structures/ipc/pair.h" -#include "hermes_shm/data_structures/ipc/vector.h" -#include "hermes_shm/data_structures/ipc/list.h" -#include -#include "mp_page.h" - -namespace hshm::ipc { - -struct FixedPageAllocatorHeader : public AllocatorHeader { - ShmArchive>> free_lists_; - std::atomic total_alloc_; - - FixedPageAllocatorHeader() = default; - - void Configure(allocator_id_t alloc_id, - size_t custom_header_size, - Allocator *alloc) { - AllocatorHeader::Configure(alloc_id, - AllocatorType::kFixedPageAllocator, - custom_header_size); - HSHM_MAKE_AR0(free_lists_, alloc); - total_alloc_ = 0; - } -}; - -class FixedPageAllocator : public Allocator { - private: - FixedPageAllocatorHeader *header_; - vector> *free_lists_; - std::atomic total_alloc_; - StackAllocator alloc_; - - public: - /** - * Allocator constructor - * */ - FixedPageAllocator() - : header_(nullptr) {} - - /** - * Get the ID of this allocator from shared memory - * */ - allocator_id_t &GetId() override { - return header_->allocator_id_; - } - - /** - * Initialize the allocator in shared memory - * */ - void shm_init(allocator_id_t id, - size_t custom_header_size, - char *buffer, - size_t buffer_size); - - /** - * Attach an existing allocator from shared memory - * */ - void shm_deserialize(char *buffer, - size_t buffer_size) override; - - /** - * Allocate a memory of \a size size. The page allocator cannot allocate - * memory larger than the page size. - * */ - OffsetPointer AllocateOffset(size_t size) override; - - /** - * Allocate a memory of \a size size, which is aligned to \a - * alignment. 
- * */ - OffsetPointer AlignedAllocateOffset(size_t size, size_t alignment) override; - - /** - * Reallocate \a p pointer to \a new_size new size. - * - * @return whether or not the pointer p was changed - * */ - OffsetPointer ReallocateOffsetNoNullCheck( - OffsetPointer p, size_t new_size) override; - - /** - * Free \a ptr pointer. Null check is performed elsewhere. - * */ - void FreeOffsetNoNullCheck(OffsetPointer p) override; - - /** - * Get the current amount of data allocated. Can be used for leak - * checking. - * */ - size_t GetCurrentlyAllocatedSize() override; -}; - -} // namespace hshm::ipc - -#endif // HERMES_MEMORY_ALLOCATOR_FIXED_ALLOCATOR_H_ diff --git a/hermes_shm/include/hermes_shm/memory/allocator/mp_page.h b/hermes_shm/include/hermes_shm/memory/allocator/mp_page.h index 41ec432b2..1209e0f0e 100644 --- a/hermes_shm/include/hermes_shm/memory/allocator/mp_page.h +++ b/hermes_shm/include/hermes_shm/memory/allocator/mp_page.h @@ -22,6 +22,7 @@ struct MpPage { size_t page_size_; /**< The size of the page allocated */ int flags_; /**< Page flags (e.g., is_allocated?) 
*/ uint32_t off_; /**< The offset within the page */ + uint32_t cpu_; /**< The CPU the page was alloc'd from */ void SetAllocated() { flags_ = 0x1; diff --git a/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h b/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h index 0a12698f7..40726a054 100644 --- a/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h +++ b/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h @@ -109,15 +109,15 @@ class ScalablePageAllocator : public Allocator { vector>> *free_lists_; StackAllocator alloc_; /** The power-of-two exponent of the minimum size that can be cached */ - static const size_t min_cached_size_exp_ = 5; + static const size_t min_cached_size_exp_ = 6; /** The minimum size that can be cached directly (32 bytes) */ static const size_t min_cached_size_ = (1 << min_cached_size_exp_); /** The power-of-two exponent of the minimum size that can be cached */ - static const size_t max_cached_size_exp_ = 14; + static const size_t max_cached_size_exp_ = 20; /** The maximum size that can be cached directly (16KB) */ static const size_t max_cached_size_ = (1 << max_cached_size_exp_); - /** Cache every size between 16 (2^4) BYTES and 16KB (2^14): (11 entries) */ - static const size_t num_caches_ = 14 - 5 + 1; + /** Cache every size between 64 (2^6) BYTES and 1MB (2^20): (15 entries) */ + static const size_t num_caches_ = 20 - 6 + 1; /** * The last free list stores sizes larger than 16KB or sizes which are * not exactly powers-of-two. 
@@ -161,8 +161,8 @@ class ScalablePageAllocator : public Allocator { OffsetPointer AllocateOffset(size_t size) override; private: - /** Check if a cached page can be re-used */ - MpPage *CheckCaches(size_t size_mp); + /** Check if a cached page on this core can be re-used */ + MpPage* CheckLocalCaches(size_t size_mp, uint32_t cpu); /** * Find the first fit of an element in a free list diff --git a/hermes_shm/scripts/ci/install_deps.sh b/hermes_shm/scripts/ci/install_deps.sh index f244df47e..cd341562e 100644 --- a/hermes_shm/scripts/ci/install_deps.sh +++ b/hermes_shm/scripts/ci/install_deps.sh @@ -12,7 +12,7 @@ set -e set -o pipefail # Change this especially when your $HOME doesn't have enough disk space. -INSTALL_DIR="${HOME}/${LOCAL}" +INSTALL_DIR="${HOME}" SPACK_DIR=${INSTALL_DIR}/spack SPACK_VERSION=0.18.1 @@ -26,7 +26,6 @@ git checkout v${SPACK_VERSION} # Set spack env set +x -SPACK_DIR=${INSTALL_DIR}/spack . ${SPACK_DIR}/share/spack/setup-env.sh set -x diff --git a/hermes_shm/scripts/ci/install_docs.sh b/hermes_shm/scripts/ci/install_docs.sh index 5fe176c1d..63f912bd8 100755 --- a/hermes_shm/scripts/ci/install_docs.sh +++ b/hermes_shm/scripts/ci/install_docs.sh @@ -7,7 +7,7 @@ cd ${GITHUB_WORKSPACE} mkdir build cd build -INSTALL_PREFIX="${HOME}/${LOCAL}" +INSTALL_PREFIX="${HOME}" export CXXFLAGS="${CXXFLAGS} -std=c++17 -Werror -Wall -Wextra" cmake \ diff --git a/hermes_shm/scripts/ci/install_hshm.sh b/hermes_shm/scripts/ci/install_hshm.sh index 365aaa1e7..33191e5f6 100755 --- a/hermes_shm/scripts/ci/install_hshm.sh +++ b/hermes_shm/scripts/ci/install_hshm.sh @@ -8,7 +8,7 @@ set -e set -o pipefail # Set spack env -INSTALL_DIR="${HOME}/${LOCAL}" +INSTALL_DIR="${HOME}" SPACK_DIR=${INSTALL_DIR}/spack . 
${SPACK_DIR}/share/spack/setup-env.sh diff --git a/hermes_shm/src/CMakeLists.txt b/hermes_shm/src/CMakeLists.txt index 5ecfb4d80..08a389d13 100644 --- a/hermes_shm/src/CMakeLists.txt +++ b/hermes_shm/src/CMakeLists.txt @@ -11,7 +11,6 @@ add_library(hermes_shm_data_structures thread/rwlock.cc memory/malloc_allocator.cc memory/stack_allocator.cc - memory/fixed_page_allocator.cc memory/scalable_page_allocator.cc memory/memory_registry.cc memory/memory_manager.cc diff --git a/hermes_shm/src/memory/fixed_page_allocator.cc b/hermes_shm/src/memory/fixed_page_allocator.cc deleted file mode 100644 index 57f4b2de5..000000000 --- a/hermes_shm/src/memory/fixed_page_allocator.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Distributed under BSD 3-Clause license. * - * Copyright by The HDF Group. * - * Copyright by the Illinois Institute of Technology. * - * All rights reserved. * - * * - * This file is part of Hermes. The full Hermes copyright notice, including * - * terms governing use, modification, and redistribution, is contained in * - * the COPYING file, which can be found at the top directory. If you do not * - * have access to the file, you may request a copy from help@hdfgroup.org. 
* - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - - -#include -#include - -namespace hshm::ipc { - -void FixedPageAllocator::shm_init(allocator_id_t id, - size_t custom_header_size, - char *buffer, - size_t buffer_size) { - buffer_ = buffer; - buffer_size_ = buffer_size; - header_ = reinterpret_cast(buffer_); - custom_header_ = reinterpret_cast(header_ + 1); - size_t region_off = (custom_header_ - buffer_) + custom_header_size; - size_t region_size = buffer_size_ - region_off; - alloc_.shm_init(id, 0, buffer + region_off, region_size); - header_->Configure(id, custom_header_size, &alloc_); - free_lists_ = header_->free_lists_.get(); -} - -void FixedPageAllocator::shm_deserialize(char *buffer, - size_t buffer_size) { - buffer_ = buffer; - buffer_size_ = buffer_size; - header_ = reinterpret_cast(buffer_); - custom_header_ = reinterpret_cast(header_ + 1); - size_t region_off = (custom_header_ - buffer_) + header_->custom_header_size_; - size_t region_size = buffer_size_ - region_off; - alloc_.shm_deserialize(buffer + region_off, region_size); - free_lists_ = header_->free_lists_.get(); -} - -size_t FixedPageAllocator::GetCurrentlyAllocatedSize() { - return header_->total_alloc_; -} - -OffsetPointer FixedPageAllocator::AllocateOffset(size_t size) { - MpPage *page = nullptr; - size_t size_mp = size + sizeof(MpPage); - - // Check if page of this size is already cached - for (iqueue &free_list : *free_lists_) { - if (free_list.size()) { - auto test_page = free_list.peek(); - if (test_page->page_size_ != size_mp) { - continue; - } - page = free_list.dequeue(); - break; - } - } - - // Allocate from stack if no page found - if (page == nullptr) { - page = alloc_.Convert(alloc_.AllocateOffset(size) - sizeof(MpPage)); - } - if (page == nullptr) { - throw OUT_OF_MEMORY; - } - - // Mark as allocated - header_->total_alloc_.fetch_add(page->page_size_); - auto p = Convert(page); - page->SetAllocated(); - return p + sizeof(MpPage); -} - 
-OffsetPointer FixedPageAllocator::AlignedAllocateOffset(size_t size, - size_t alignment) { - throw ALIGNED_ALLOC_NOT_SUPPORTED.format(); -} - -OffsetPointer FixedPageAllocator::ReallocateOffsetNoNullCheck(OffsetPointer p, - size_t new_size) { - throw ALIGNED_ALLOC_NOT_SUPPORTED.format(); -} - -void FixedPageAllocator::FreeOffsetNoNullCheck(OffsetPointer p) { - // Mark as free - auto hdr_offset = p - sizeof(MpPage); - auto hdr = Convert(hdr_offset); - if (!hdr->IsAllocated()) { - throw DOUBLE_FREE.format(); - } - hdr->UnsetAllocated(); - header_->total_alloc_.fetch_sub(hdr->page_size_); - - // Append to a free list - for (iqueue &free_list : *free_lists_) { - if (free_list.size()) { - MpPage *page = free_list.peek(); - if (page->page_size_ != hdr->page_size_) { - continue; - } - } - free_list.enqueue(hdr); - return; - } - - // Extend the set of cached pages - free_lists_->emplace_back(); - iqueue &free_list = - (*free_lists_)[free_lists_->size() - 1]; - free_list.enqueue(hdr); -} - -} // namespace hshm::ipc diff --git a/hermes_shm/src/memory/scalable_page_allocator.cc b/hermes_shm/src/memory/scalable_page_allocator.cc index 09d06a2bd..d59a87f6b 100644 --- a/hermes_shm/src/memory/scalable_page_allocator.cc +++ b/hermes_shm/src/memory/scalable_page_allocator.cc @@ -86,19 +86,22 @@ size_t ScalablePageAllocator::RoundUp(size_t num, size_t &exp) { OffsetPointer ScalablePageAllocator::AllocateOffset(size_t size) { MpPage *page = nullptr; - size_t size_mp = size + sizeof(MpPage); + size_t exp; + size_t size_mp = RoundUp(size + sizeof(MpPage), exp); + uint32_t cpu = NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; // Case 1: Can we re-use an existing page? 
- page = CheckCaches(size_mp); + page = CheckLocalCaches(size_mp, cpu); // Case 2: Coalesce if enough space is being wasted // if (page == nullptr) {} // Case 3: Allocate from stack if no page found if (page == nullptr) { - auto off = alloc_.AllocateOffset(size); + auto off = alloc_.AllocateOffset(size_mp); if (!off.IsNull()) { page = alloc_.Convert(off - sizeof(MpPage)); + page->page_size_ = size_mp; } } @@ -111,13 +114,13 @@ OffsetPointer ScalablePageAllocator::AllocateOffset(size_t size) { header_->total_alloc_.fetch_add(page->page_size_); auto p = Convert(page); page->SetAllocated(); + page->cpu_ = cpu; return p + sizeof(MpPage); } -MpPage *ScalablePageAllocator::CheckCaches(size_t size_mp) { +MpPage *ScalablePageAllocator::CheckLocalCaches(size_t size_mp, uint32_t cpu) { MpPage *page; // ScopedRwReadLock coalesce_lock(header_->coalesce_lock_, 0); - uint32_t cpu = NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; uint32_t cpu_start = cpu * num_free_lists_; pair> &first_free_list = (*free_lists_)[cpu_start]; @@ -236,12 +239,12 @@ void ScalablePageAllocator::DividePage(FreeListStats &stats, } OffsetPointer ScalablePageAllocator::AlignedAllocateOffset(size_t size, - size_t alignment) { + size_t alignment) { throw ALIGNED_ALLOC_NOT_SUPPORTED.format(); } OffsetPointer ScalablePageAllocator::ReallocateOffsetNoNullCheck( - OffsetPointer p, size_t new_size) { + OffsetPointer p, size_t new_size) { OffsetPointer new_p; void *ptr = AllocatePtr(new_size, new_p); MpPage *hdr = Convert(p - sizeof(MpPage)); @@ -262,7 +265,8 @@ void ScalablePageAllocator::FreeOffsetNoNullCheck(OffsetPointer p) { header_->total_alloc_.fetch_sub(hdr->page_size_); // Get the free list to start from - uint32_t cpu = NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; + uint32_t cpu = hdr->cpu_; + // NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; uint32_t cpu_start = cpu * num_free_lists_; pair> &first_free_list = (*free_lists_)[cpu_start]; diff --git 
a/hermes_shm/test/unit/allocators/allocator.cc b/hermes_shm/test/unit/allocators/allocator.cc index 80d0ea5d8..1bf0fd253 100644 --- a/hermes_shm/test/unit/allocators/allocator.cc +++ b/hermes_shm/test/unit/allocators/allocator.cc @@ -139,19 +139,6 @@ TEST_CASE("MallocAllocator") { Posttest(); } -TEST_CASE("FixedPageAllocator") { - auto alloc = Pretest(); - REQUIRE(alloc->GetCurrentlyAllocatedSize() == 0); - PageAllocationTest(alloc); - REQUIRE(alloc->GetCurrentlyAllocatedSize() == 0); - - REQUIRE(alloc->GetCurrentlyAllocatedSize() == 0); - MultiPageAllocationTest(alloc); - REQUIRE(alloc->GetCurrentlyAllocatedSize() == 0); - - Posttest(); -} - TEST_CASE("ScalablePageAllocator") { auto alloc = Pretest(); REQUIRE(alloc->GetCurrentlyAllocatedSize() == 0); diff --git a/hermes_shm/test/unit/data_structures/serialize/CMakeLists.txt b/hermes_shm/test/unit/data_structures/serialize/CMakeLists.txt index 62567729b..666c25541 100644 --- a/hermes_shm/test/unit/data_structures/serialize/CMakeLists.txt +++ b/hermes_shm/test/unit/data_structures/serialize/CMakeLists.txt @@ -3,4 +3,5 @@ project(hermes_shm) set(CMAKE_CXX_STANDARD 17) +add_subdirectory(shm) add_subdirectory(thallium) \ No newline at end of file diff --git a/hermes_shm/test/unit/data_structures/serialize/shm/CMakeLists.txt b/hermes_shm/test/unit/data_structures/serialize/shm/CMakeLists.txt new file mode 100644 index 000000000..a36d4b03e --- /dev/null +++ b/hermes_shm/test/unit/data_structures/serialize/shm/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.10) +project(hermes_shm) + +set(CMAKE_CXX_STANDARD 17) + +#------------------------------------------------------------------------------ +# Test Cases +#------------------------------------------------------------------------------ +set (LIBS + hermes_shm_data_structures + Catch2::Catch2 + MPI::MPI_CXX + OpenMP::OpenMP_CXX) +add_executable(test_shm_exec + ${TEST_MAIN}/main.cc + test_init.cc + test_shm.cc) +add_dependencies(test_shm_exec + 
hermes_shm_data_structures) +target_link_libraries(test_shm_exec ${LIBS}) + +add_test(NAME test_shm COMMAND ${CMAKE_BINARY_DIR}/bin/test_shm_exec) + +#------------------------------------------------------------------------------ +# Install Targets +#------------------------------------------------------------------------------ +install(TARGETS + test_shm_exec + EXPORT + ${HERMES_EXPORTED_TARGETS} + LIBRARY DESTINATION ${HERMES_INSTALL_LIB_DIR} + ARCHIVE DESTINATION ${HERMES_INSTALL_LIB_DIR} + RUNTIME DESTINATION ${HERMES_INSTALL_BIN_DIR}) diff --git a/hermes_shm/test/unit/data_structures/serialize/shm/test_init.cc b/hermes_shm/test/unit/data_structures/serialize/shm/test_init.cc new file mode 100644 index 000000000..a723ac90f --- /dev/null +++ b/hermes_shm/test/unit/data_structures/serialize/shm/test_init.cc @@ -0,0 +1,30 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include "test_init.h" +#include "hermes_shm/data_structures/ipc/string.h" +#include "hermes_shm/data_structures/serialization/thallium.h" +#include "hermes_shm/data_structures/containers/charbuf.h" +#include + +void MainPretest() { + std::string shm_url = "test_serializers"; + allocator_id_t alloc_id(0, 1); + auto mem_mngr = HERMES_MEMORY_MANAGER; + mem_mngr->CreateBackend( + MemoryManager::GetDefaultBackendSize(), shm_url); + mem_mngr->CreateAllocator(shm_url, alloc_id, 0); +} + +void MainPosttest() { +} diff --git a/hermes_shm/test/unit/data_structures/serialize/shm/test_init.h b/hermes_shm/test/unit/data_structures/serialize/shm/test_init.h new file mode 100644 index 000000000..d19ebf96a --- /dev/null +++ b/hermes_shm/test/unit/data_structures/serialize/shm/test_init.h @@ -0,0 +1,35 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef HERMES_SHM_TEST_UNIT_DATA_STRUCTURES_SERIALIZE_THALLIUM_TEST_INIT_H_ +#define HERMES_SHM_TEST_UNIT_DATA_STRUCTURES_SERIALIZE_THALLIUM_TEST_INIT_H_ + +#include "hermes_shm/data_structures/data_structure.h" +#include "hermes_shm/data_structures/serialization/shm_serialize.h" + +using hshm::ipc::PosixShmMmap; +using hshm::ipc::MemoryBackendType; +using hshm::ipc::MemoryBackend; +using hshm::ipc::allocator_id_t; +using hshm::ipc::AllocatorType; +using hshm::ipc::Allocator; +using hshm::ipc::Pointer; + +using hshm::ipc::MemoryBackendType; +using hshm::ipc::MemoryBackend; +using hshm::ipc::allocator_id_t; +using hshm::ipc::AllocatorType; +using hshm::ipc::Allocator; +using hshm::ipc::MemoryManager; +using hshm::ipc::Pointer; + +#endif // HERMES_SHM_TEST_UNIT_DATA_STRUCTURES_SERIALIZE_THALLIUM_TEST_INIT_H_ diff --git a/hermes_shm/test/unit/data_structures/serialize/shm/test_shm.cc b/hermes_shm/test/unit/data_structures/serialize/shm/test_shm.cc new file mode 100644 index 000000000..d315c542d --- /dev/null +++ b/hermes_shm/test/unit/data_structures/serialize/shm/test_shm.cc @@ -0,0 +1,69 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "basic_test.h" +#include "test_init.h" +#include "hermes_shm/data_structures/ipc/string.h" +#include "hermes_shm/data_structures/data_structure.h" + +TEST_CASE("SerializePod") { + hipc::ShmSerializer istream; + Allocator *alloc = HERMES_MEMORY_MANAGER->GetDefaultAllocator(); + int a = 1; + double b = 2; + float c = 3; + int size = sizeof(int) + sizeof(double) + sizeof(float) + sizeof(allocator_id_t); + REQUIRE(istream.shm_buf_size(alloc->GetId(), a, b, c) == size); + char *buf = istream.serialize(alloc, a, b, c); + + hipc::ShmSerializer ostream; + Allocator *alloc2 = ostream.deserialize(buf); + REQUIRE(alloc == alloc2); + auto a2 = ostream.deserialize(alloc2, buf); + REQUIRE(a2 == a); + auto b2 = ostream.deserialize(alloc2, buf); + REQUIRE(b2 == b); + auto c2 = ostream.deserialize(alloc2, buf); + REQUIRE(c2 == c); +} + +TEST_CASE("SerializeString") { + hipc::ShmSerializer istream; + Allocator *alloc = HERMES_MEMORY_MANAGER->GetDefaultAllocator(); + hipc::mptr i; + bool string_ar = IS_SHM_ARCHIVEABLE(hipc::string); + bool string_star_ar = IS_SHM_ARCHIVEABLE(std::remove_reference); + bool string_ar2 = IS_SHM_ARCHIVEABLE(std::remove_reference::type); + + hipc::string &y = *i; + + + auto a = hipc::make_uptr(alloc, "h1"); + auto b = hipc::make_uptr(alloc, "h2"); + int c; + int size = 2 * sizeof(hipc::OffsetPointer) + sizeof(int) + sizeof(allocator_id_t); + REQUIRE(istream.shm_buf_size(alloc->GetId(), *a, *b, c) == size); + char *buf = istream.serialize(alloc, *a, *b, c); + + hipc::ShmSerializer ostream; + Allocator *alloc2 = ostream.deserialize(buf); + REQUIRE(alloc == alloc2); + hipc::mptr a2; + ostream.deserialize(alloc2, buf, a2); + REQUIRE(*a2 == *a); + hipc::mptr b2; + ostream.deserialize(alloc2, buf, b2); + REQUIRE(*b2 == *b); + int c2 = ostream.deserialize(alloc2, buf); + REQUIRE(c2 == c); +} + diff --git a/src/binlog.h b/src/binlog.h index 66062ce77..331444002 100644 
--- a/src/binlog.h +++ b/src/binlog.h @@ -123,7 +123,7 @@ class BinaryLog { * */ void Ingest(const hipc::mpsc_queue &queue) { T entry; - while(!queue.pop(entry).IsNull()) { + while (!queue.pop(entry).IsNull()) { AppendEntry(entry); } } @@ -132,7 +132,7 @@ class BinaryLog { * Appends all entries in the vector to the cache. * */ void Ingest(const std::vector &queue) { - for(auto &entry : queue) { + for (auto &entry : queue) { AppendEntry(entry); } } @@ -196,7 +196,7 @@ class BinaryLog { buffer.reserve(num_entries); std::ifstream input_file(path_, std::ios::in); cereal::BinaryInputArchive iarch(input_file); - while(!input_file.eof()) { + while (!input_file.eof()) { buffer.emplace_back(); iarch(buffer.back()); } diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index 42d132ddf..7ae2e928e 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -355,8 +355,8 @@ MetadataManager::LocalPutBlobMetadata(TagId bkt_id, hipc::pair& info = (*iter); BlobInfo &blob_info = info.GetSecond(); // Acquire blob_info write lock before modifying buffers - ScopedRwWriteLock(blob_info.lock_[0], - kMDM_LocalPutBlobMetadata); + ScopedRwWriteLock blob_info_lock(blob_info.lock_[0], + kMDM_LocalPutBlobMetadata); (*blob_info.buffers_) = buffers; blob_info.blob_size_ = blob_size; blob_info.score_ = score; diff --git a/src/rpc_thallium.h b/src/rpc_thallium.h index 5faff1c53..5cb5dc4a0 100644 --- a/src/rpc_thallium.h +++ b/src/rpc_thallium.h @@ -115,7 +115,7 @@ class ThalliumRpc : public RpcContext { /** Io transfer at the server */ size_t IoCallServer(const tl::request &req, tl::bulk &bulk, IoType type, char *data, size_t size) { - tl::bulk_mode flag; + tl::bulk_mode flag = tl::bulk_mode::write_only; switch (type) { case IoType::kRead: { // The "local_bulk" object will only be read from @@ -129,7 +129,6 @@ class ThalliumRpc : public RpcContext { } default: { // NOTE(llogan): Avoids "uninitalized" warning - flag = tl::bulk_mode::write_only; HELOG(kFatal, "Cannot have none 
I/O type") } } @@ -139,7 +138,7 @@ class ThalliumRpc : public RpcContext { segments[0].first = data; segments[0].second = size; tl::bulk local_bulk = server_engine_->expose(segments, flag); - size_t io_bytes; + size_t io_bytes = 0; switch (type) { case IoType::kRead: { @@ -154,7 +153,6 @@ class ThalliumRpc : public RpcContext { } case IoType::kNone: { HELOG(kFatal, "Cannot have none I/O type") - exit(1); } } if (io_bytes != size) { From ed9783df970f441235df08741ba340f241242406 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 13:43:11 -0500 Subject: [PATCH 12/44] Remove hermes version from spack install ci --- ci/install_deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install_deps.sh b/ci/install_deps.sh index d5349b498..45e6be4c4 100755 --- a/ci/install_deps.sh +++ b/ci/install_deps.sh @@ -40,4 +40,4 @@ python3 -m pip install -e . # NOTE(llogan): Modify version string per release. HERMES_VERSION=1.0.0 -spack install hermes@${HERMES_VERSION} +vfd +spack install hermes +vfd From 5b8a321fd9e0fea9a4ab0189448acc54cdc00b91 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 17:09:06 -0500 Subject: [PATCH 13/44] Enable specific limit on memory usage --- config/hermes_server_default.yaml | 6 ++ src/api/hermes.cc | 15 ++-- src/binlog.h | 132 +++++++----------------------- src/config_server.cc | 4 + src/config_server.h | 3 + src/prefetcher.cc | 7 ++ src/prefetcher.h | 2 + test/test_binlog.cc | 6 +- 8 files changed, 64 insertions(+), 111 deletions(-) diff --git a/config/hermes_server_default.yaml b/config/hermes_server_default.yaml index 21e214974..c2ae6eb90 100644 --- a/config/hermes_server_default.yaml +++ b/config/hermes_server_default.yaml @@ -76,6 +76,12 @@ devices: is_shared_device: true borg_capacity_thresh: [ 0.0, 1.0 ] +# Define the maximum amount of memory Hermes can use for non-buffering tasks. +# This includes metadata management and memory allocations. 
+# This memory will not be preallocated, so if you don't know, you can set it +# to be high. +max_memory: 8g + ### Define properties of RPCs rpc: # A path to a file containing a list of server names, 1 per line. If your diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 3f0215d05..892cb89a9 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -112,14 +112,17 @@ void Hermes::LoadClientConfig(std::string config_path) { void Hermes::InitSharedMemory() { // Create shared-memory allocator auto mem_mngr = HERMES_MEMORY_MANAGER; + if (server_config_.max_memory_ == 0) { + server_config_.max_memory_ = hipc::MemoryManager::GetDefaultBackendSize(); + } mem_mngr->CreateBackend( - hipc::MemoryManager::GetDefaultBackendSize(), - server_config_.shmem_name_); + server_config_.max_memory_, + server_config_.shmem_name_); main_alloc_ = - mem_mngr->CreateAllocator( - server_config_.shmem_name_, - main_alloc_id, - sizeof(HermesShm)); + mem_mngr->CreateAllocator( + server_config_.shmem_name_, + main_alloc_id, + sizeof(HermesShm)); header_ = main_alloc_->GetCustomHeader(); } diff --git a/src/binlog.h b/src/binlog.h index 331444002..8e9e76e09 100644 --- a/src/binlog.h +++ b/src/binlog.h @@ -25,68 +25,10 @@ namespace hermes { template struct BinaryLogRank { - std::vector cache_; /**< Cached log entries */ - size_t off_; /**< Prefetcher's offset in the cache */ - size_t num_cached_; /**< The number of entries cached int the log */ + std::vector log_; /**< Cached log entries */ + size_t backend_off_; /**< Entry offset in the backend file */ - /** Constructor */ - BinaryLogRank() : off_(0), num_cached_(0) {} - - /** Number of elements in the cache */ - size_t size() const { - return cache_.size(); - } - - /** Number of touched elements / index of first untouched element */ - size_t touched() const { - return off_; - } - - /** Number of untouched elements */ - size_t untouched() const { - return size() - off_; - } - - /** Number of uncached elements */ - size_t uncached() { - return 
size() - num_cached_; - } - - /** Increment the number of cached elements */ - void increment_cached() { - num_cached_ += 1; - } - - /** Get the next untouched cached entry */ - bool next(T &next) { - if (off_ >= cache_.size()) { return false; } - next = cache_[off_]; - off_ += 1; - return true; - } - - /** Reserve more space */ - void reserve(size_t size) { - cache_.reserve(size); - } - - /** Emplace an entry to the back of the cache log */ - void emplace_back(const T &entry) { - cache_.emplace_back(entry); - } - - /** Remove touched elements from the cache log */ - size_t clear_touched() { - size_t num_touched = touched(); - cache_.erase(cache_.begin(), cache_.begin() + num_touched); - if (touched() <= num_cached_) { - num_cached_ -= num_touched; - } else { - num_cached_ = 0; - } - off_ = 0; - return num_touched; - } + BinaryLogRank() : backend_off_(0) {} }; /** @@ -94,8 +36,7 @@ struct BinaryLogRank { * execution traces. * * This assumes only a single thread modifies or reads - * from the log. This is intded to be used internally - * by the prefetcher. + * from the log. Intended for internal use by prefetcher. * */ template class BinaryLog { @@ -106,12 +47,14 @@ class BinaryLog { std::string path_; /**< Path to the backing log file */ public: + /** Default Constructor*/ + BinaryLog() : max_ingest_(0), cur_entry_count_(0) {} + /** Constructor. */ - BinaryLog(const std::string &path, - size_t max_ingest_bytes) : - max_ingest_(max_ingest_bytes / sizeof(T)), - cur_entry_count_(0), - path_(path) { + void Init(const std::string &path, + size_t max_ingest_bytes) { + max_ingest_ = max_ingest_bytes / sizeof(T); + path_ = path; // Create + truncate the file // This is ok because the Hermes daemons are assumed to be spawned before // applications start running. 
@@ -140,9 +83,13 @@ class BinaryLog { /** * Get the next entry corresponding to the rank * */ - bool GetNextEntry(int rank, T &entry) { - while (cache_[rank].untouched() == 0 && Load(max_ingest_)) {} - return cache_[rank].next(entry); + bool GetEntry(int rank, size_t off, T &entry) { + auto &cache = cache_[rank]; + if (off < cache.backend_off_ + cache.log_.size()) { + entry = cache.log_[off]; + return true; + } + return false; } /** @@ -154,22 +101,19 @@ class BinaryLog { } // Serialize all contents into the log file - if (path_.size()) { + if (path_.empty()) { std::ofstream output_file(path_, std::ios::out | std::ios::app); cereal::BinaryOutputArchive oarch(output_file); - for (auto &rank_cache : cache_) { - for (size_t i = rank_cache.uncached(); i < rank_cache.size(); ++i) { - auto &entry = rank_cache.cache_[i]; + for (auto &cache : cache_) { + for (size_t i = 0; i < cache.log_.size(); ++i) { + auto &entry = cache.log_[i]; oarch(entry); - rank_cache.increment_cached(); + cache.backend_off_ += 1; } + cache.log_.clear(); } } - - // Remove all touched entries from the cache - for (auto &rank_cache : cache_) { - cur_entry_count_ -= rank_cache.clear_touched(); - } + cur_entry_count_ = 0; } private: @@ -178,31 +122,13 @@ class BinaryLog { if (entry.rank_ >= (int)cache_.size()) { cache_.resize(entry.rank_ + 1); } - if (cache_[entry.rank_].size() == 0) { - cache_[entry.rank_].reserve(8192); + auto &cache = cache_[entry.rank_]; + if (cache.log_.size() == 0) { + cache.log_.reserve(8192); } - cache_[entry.rank_].emplace_back(entry); + cache.log_.emplace_back(entry); cur_entry_count_ += 1; } - - /** - * Load data from the log into memory - * - * @return true when there is still data to load from the file, false - * otherwise - * */ - bool Load(size_t num_entries) { - std::vector buffer; - buffer.reserve(num_entries); - std::ifstream input_file(path_, std::ios::in); - cereal::BinaryInputArchive iarch(input_file); - while (!input_file.eof()) { - buffer.emplace_back(); - 
iarch(buffer.back()); - } - Ingest(buffer); - return !input_file.eof(); - } }; } // namespace hermes diff --git a/src/config_server.cc b/src/config_server.cc index 6ce3ce610..ae67e9e95 100644 --- a/src/config_server.cc +++ b/src/config_server.cc @@ -178,6 +178,10 @@ void ServerConfig::ParseYAML(YAML::Node &yaml_conf) { if (yaml_conf["shmem_name"]) { shmem_name_ = yaml_conf["shmem_name"].as(); } + if (yaml_conf["max_memory"]) { + max_memory_ = hshm::ConfigParse::ParseSize( + yaml_conf["max_memory"].as()); + } } /** Load the default configuration */ diff --git a/src/config_server.h b/src/config_server.h index c5dc1f86c..3a255b56c 100644 --- a/src/config_server.h +++ b/src/config_server.h @@ -265,6 +265,9 @@ class ServerConfig : public BaseConfig { /** The length of a view state epoch */ u32 system_view_state_update_interval_ms; + /** The max amount of memory hermes uses for non-buffering tasks */ + size_t max_memory_; + /** A base name for the BufferPool shared memory segement. Hermes appends the * value of the USER environment variable to this string. 
*/ diff --git a/src/prefetcher.cc b/src/prefetcher.cc index aeff0f05d..97ae90d51 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -29,6 +29,13 @@ void Prefetcher::Init() { return; } + // Create the binary log + /*if (conf.prefetcher_.trace_path_.empty()) { + log_.Init("", MEGABYTES(64)); + } else { + log_.Init(conf.prefetcher_.trace_path_ + ) + }*/ + // Info needed per-client and server mdm_->is_mpi_ = conf.prefetcher_.is_mpi_; if (HERMES->mode_ == HermesType::kClient) { diff --git a/src/prefetcher.h b/src/prefetcher.h index 8ce8169ed..347504017 100644 --- a/src/prefetcher.h +++ b/src/prefetcher.h @@ -17,6 +17,7 @@ #include "thread_manager.h" #include "metadata_manager.h" #include "rpc.h" +#include "binlog.h" #include namespace hermes { @@ -31,6 +32,7 @@ class Prefetcher { tl::engine *engine; /**< Argobots execution engine */ double epoch_ms_; /**< Milliseconds to sleep */ bool is_enabled_; /**< Whether the prefetcher is enabled */ + BinaryLog log_; public: /** Initialize each candidate prefetcher, including trace info */ diff --git a/test/test_binlog.cc b/test/test_binlog.cc index 2e7e05a27..9fbe9daa6 100644 --- a/test/test_binlog.cc +++ b/test/test_binlog.cc @@ -51,7 +51,7 @@ void verify_log(hermes::BinaryLog log, for (int rank = 0; rank < num_ranks; ++rank) { hermes::IoStat stat; size_t i = 0; - while (log.GetNextEntry(rank, stat)) { + while (log.GetEntry(rank, i, stat)) { REQUIRE(stat.blob_id_.node_id_ == rank); REQUIRE(stat.blob_id_.unique_ == i); i += 1; @@ -72,7 +72,8 @@ TEST_CASE("TestBinlog") { chunk_bytes, num_ranks, entries_per_rank); // Attempt flushing the log - hermes::BinaryLog log(path, log_bytes); + hermes::BinaryLog log; + log.Init(path, log_bytes); log.Ingest(stats); verify_log(log, num_ranks, entries_per_rank); log.Flush(); @@ -82,6 +83,7 @@ TEST_CASE("TestBinlog") { log.Ingest(stats); log.Ingest(stats); log.Ingest(stats); + log.Ingest(stats); log.Flush(); REQUIRE(stdfs::file_size(path) > 0); } From 
4590e00549247acff9b9c25770cf3d98f9b1af91 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 17:15:51 -0500 Subject: [PATCH 14/44] Cache 64 - 16MB --- .../memory/allocator/scalable_page_allocator.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h b/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h index 40726a054..1829d55da 100644 --- a/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h +++ b/hermes_shm/include/hermes_shm/memory/allocator/scalable_page_allocator.h @@ -110,14 +110,14 @@ class ScalablePageAllocator : public Allocator { StackAllocator alloc_; /** The power-of-two exponent of the minimum size that can be cached */ static const size_t min_cached_size_exp_ = 6; - /** The minimum size that can be cached directly (32 bytes) */ + /** The minimum size that can be cached directly (64 bytes) */ static const size_t min_cached_size_ = (1 << min_cached_size_exp_); /** The power-of-two exponent of the minimum size that can be cached */ - static const size_t max_cached_size_exp_ = 20; - /** The maximum size that can be cached directly (16KB) */ + static const size_t max_cached_size_exp_ = 24; + /** The maximum size that can be cached directly (16MB) */ static const size_t max_cached_size_ = (1 << max_cached_size_exp_); - /** Cache every size between 64 (2^6) BYTES and 1MB (2^20): (15 entries) */ - static const size_t num_caches_ = 20 - 6 + 1; + /** Cache every size between 64 (2^6) BYTES and 16MB (2^24): (19 entries) */ + static const size_t num_caches_ = 24 - 6 + 1; /** * The last free list stores sizes larger than 16KB or sizes which are * not exactly powers-of-two. 
@@ -129,7 +129,7 @@ class ScalablePageAllocator : public Allocator { * Allocator constructor * */ ScalablePageAllocator() - : header_(nullptr) {} + : header_(nullptr) {} /** * Get the ID of this allocator from shared memory From b1c5e9d24223575164e37f374da38b1e717e3e92 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 19:58:22 -0500 Subject: [PATCH 15/44] Use Argobots as thread model --- src/api/hermes.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 892cb89a9..fd5e6c002 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -72,6 +72,7 @@ void Hermes::InitServer(std::string server_config_path) { /** Initialize Hermes as a client to the daemon */ void Hermes::InitClient(std::string server_config_path, std::string client_config_path) { + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); LoadServerConfig(server_config_path); LoadClientConfig(client_config_path); LoadSharedMemory(); From 776116795e8e9c708428f3d1c5de269b27d9dec5 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 20:49:58 -0500 Subject: [PATCH 16/44] Disable unit test to get caching --- .github/workflows/main.yml | 10 +++++----- src/api/hermes.cc | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 41d2bc045..fbff867c5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -75,11 +75,11 @@ jobs: if: steps.hermes-cache.outputs.cache-hit != 'true' run: ci/build_hermes.sh - - name: Test - run: cd build && ctest -VV - - - name: Install - run: pushd build && make install && popd +# - name: Test +# run: cd build && ctest -VV +# +# - name: Install +# run: pushd build && make install && popd # Enable tmate debugging of manually-triggered workflows if the input option was provided - name: Setup tmate session diff --git a/src/api/hermes.cc b/src/api/hermes.cc index fd5e6c002..b17591780 100644 --- 
a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -91,7 +91,7 @@ void Hermes::InitClient(std::string server_config_path, /** Load the server-side configuration */ void Hermes::LoadServerConfig(std::string config_path) { - if (config_path.size() == 0) { + if (config_path.empty()) { config_path = GetEnvSafe(kHermesServerConf); } if (mode_ == HermesType::kServer) { @@ -102,7 +102,7 @@ void Hermes::LoadServerConfig(std::string config_path) { /** Load the client-side configuration */ void Hermes::LoadClientConfig(std::string config_path) { - if (config_path.size() == 0) { + if (config_path.empty()) { config_path = GetEnvSafe(kHermesClientConf); } // HILOG(kInfo, "Loading client configuration: {}", config_path) From b479f2e0214999674171767ad2f292caa28623b1 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 21:01:58 -0500 Subject: [PATCH 17/44] Change thread model setting location --- src/api/hermes.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index b17591780..4af4875a6 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -50,20 +50,19 @@ void Hermes::Init(HermesType mode, /** Initialize Hermes as a server */ void Hermes::InitServer(std::string server_config_path) { - HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); LoadServerConfig(server_config_path); InitSharedMemory(); // Initialize RPC rpc_.InitServer(); rpc_.InitClient(); + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); // Load the trait libraries traits_.Init(); // Construct the reference objects mdm_.shm_init(header_->mdm_, main_alloc_, &server_config_); - rpc_.InitClient(); bpm_.shm_init(header_->bpm_, main_alloc_); borg_.shm_init(header_->borg_, main_alloc_); prefetch_.Init(); @@ -72,7 +71,6 @@ void Hermes::InitServer(std::string server_config_path) { /** Initialize Hermes as a client to the daemon */ void Hermes::InitClient(std::string server_config_path, std::string client_config_path) { 
- HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); LoadServerConfig(server_config_path); LoadClientConfig(client_config_path); LoadSharedMemory(); @@ -80,6 +78,7 @@ void Hermes::InitClient(std::string server_config_path, // Initialize references to SHM types mdm_.shm_deserialize(header_->mdm_); rpc_.InitClient(); + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); bpm_.shm_deserialize(header_->bpm_); borg_.shm_deserialize(header_->borg_); prefetch_.Init(); From 59c01f4c40b40f943bd6904bd19dc9b09ecf110e Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 21:18:04 -0500 Subject: [PATCH 18/44] Try not forcing thread model set --- src/api/hermes.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 4af4875a6..a4c0cfa20 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -50,13 +50,13 @@ void Hermes::Init(HermesType mode, /** Initialize Hermes as a server */ void Hermes::InitServer(std::string server_config_path) { + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); LoadServerConfig(server_config_path); InitSharedMemory(); // Initialize RPC rpc_.InitServer(); rpc_.InitClient(); - HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); // Load the trait libraries traits_.Init(); @@ -78,7 +78,7 @@ void Hermes::InitClient(std::string server_config_path, // Initialize references to SHM types mdm_.shm_deserialize(header_->mdm_); rpc_.InitClient(); - HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); + // HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); bpm_.shm_deserialize(header_->bpm_); borg_.shm_deserialize(header_->borg_); prefetch_.Init(); From 278627629d7bc8ca4f24619e8b5a41d10a38d87c Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 21:20:19 -0500 Subject: [PATCH 19/44] Disable everything except builds --- .github/workflows/main.yml | 38 +++++++++++++++++++------------------- 
1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fbff867c5..ebe235a89 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -82,26 +82,26 @@ jobs: # run: pushd build && make install && popd # Enable tmate debugging of manually-triggered workflows if the input option was provided - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled && (failure() || !failure()) }} +# - name: Setup tmate session +# uses: mxschmitt/action-tmate@v3 +# if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled && (failure() || !failure()) }} # - name: Multi-node Test # run: pushd ci/cluster && ./multi_node_ci_test.sh - - name: Generate coverage file - run: | - COVERAGE_DIR=${GITHUB_WORKSPACE}/coverage - mkdir -p ${COVERAGE_DIR} - pushd ${GITHUB_WORKSPACE}/build - lcov -c -d . -o "${COVERAGE_DIR}/tmp.info" - lcov --remove "${COVERAGE_DIR}/tmp.info" \ - "/usr/include/*" \ - "${HOME}/${LOCAL}/*" \ - "*/stb_ds.h" \ - -o ${COVERAGE_DIR}/lcov.info - - - name: Coveralls - uses: coverallsapp/github-action@master - with: - github-token: ${{ secrets.GITHUB_TOKEN }} +# - name: Generate coverage file +# run: | +# COVERAGE_DIR=${GITHUB_WORKSPACE}/coverage +# mkdir -p ${COVERAGE_DIR} +# pushd ${GITHUB_WORKSPACE}/build +# lcov -c -d . 
-o "${COVERAGE_DIR}/tmp.info" +# lcov --remove "${COVERAGE_DIR}/tmp.info" \ +# "/usr/include/*" \ +# "${HOME}/${LOCAL}/*" \ +# "*/stb_ds.h" \ +# -o ${COVERAGE_DIR}/lcov.info + +# - name: Coveralls +# uses: coverallsapp/github-action@master +# with: +# github-token: ${{ secrets.GITHUB_TOKEN }} From 2428b1e9b2b314d6ac47b52c4e47cec92957a564 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 21:21:51 -0500 Subject: [PATCH 20/44] Initialize rpc client in correct spot --- src/api/hermes.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index a4c0cfa20..0f1e35c97 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -50,19 +50,19 @@ void Hermes::Init(HermesType mode, /** Initialize Hermes as a server */ void Hermes::InitServer(std::string server_config_path) { - HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); LoadServerConfig(server_config_path); InitSharedMemory(); // Initialize RPC rpc_.InitServer(); - rpc_.InitClient(); + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); // Load the trait libraries traits_.Init(); // Construct the reference objects mdm_.shm_init(header_->mdm_, main_alloc_, &server_config_); + rpc_.InitClient(); bpm_.shm_init(header_->bpm_, main_alloc_); borg_.shm_init(header_->borg_, main_alloc_); prefetch_.Init(); @@ -78,7 +78,7 @@ void Hermes::InitClient(std::string server_config_path, // Initialize references to SHM types mdm_.shm_deserialize(header_->mdm_); rpc_.InitClient(); - // HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); + HERMES_THREAD_MODEL->SetThreadModel(hshm::ThreadType::kArgobots); bpm_.shm_deserialize(header_->bpm_); borg_.shm_deserialize(header_->borg_); prefetch_.Init(); From bfaefdf543346b5299f104e184d1fc4157c89fce Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 22:12:16 -0500 Subject: [PATCH 21/44] Re-try with simple free --- hermes_shm/src/memory/scalable_page_allocator.cc 
| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hermes_shm/src/memory/scalable_page_allocator.cc b/hermes_shm/src/memory/scalable_page_allocator.cc index d59a87f6b..7c4a5eaa2 100644 --- a/hermes_shm/src/memory/scalable_page_allocator.cc +++ b/hermes_shm/src/memory/scalable_page_allocator.cc @@ -265,7 +265,8 @@ void ScalablePageAllocator::FreeOffsetNoNullCheck(OffsetPointer p) { header_->total_alloc_.fetch_sub(hdr->page_size_); // Get the free list to start from - uint32_t cpu = hdr->cpu_; + uint32_t cpu = NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; + // hdr->cpu_; // NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; uint32_t cpu_start = cpu * num_free_lists_; pair> &first_free_list = From cc4b97c7e99ae9726fe7dca61a9fc38dee04239b Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 22:12:37 -0500 Subject: [PATCH 22/44] Enable Test + Install ci --- .github/workflows/main.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ebe235a89..2807db199 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -75,11 +75,11 @@ jobs: if: steps.hermes-cache.outputs.cache-hit != 'true' run: ci/build_hermes.sh -# - name: Test -# run: cd build && ctest -VV -# -# - name: Install -# run: pushd build && make install && popd + - name: Test + run: cd build && ctest -VV + + - name: Install + run: pushd build && make install && popd # Enable tmate debugging of manually-triggered workflows if the input option was provided # - name: Setup tmate session From ea14bf7288b0936fb8d938b6e8117a52c8e80386 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 22:39:28 -0500 Subject: [PATCH 23/44] Better allocator isn't why actions hang --- hermes_shm/src/memory/scalable_page_allocator.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hermes_shm/src/memory/scalable_page_allocator.cc 
b/hermes_shm/src/memory/scalable_page_allocator.cc index 7c4a5eaa2..d59a87f6b 100644 --- a/hermes_shm/src/memory/scalable_page_allocator.cc +++ b/hermes_shm/src/memory/scalable_page_allocator.cc @@ -265,8 +265,7 @@ void ScalablePageAllocator::FreeOffsetNoNullCheck(OffsetPointer p) { header_->total_alloc_.fetch_sub(hdr->page_size_); // Get the free list to start from - uint32_t cpu = NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; - // hdr->cpu_; + uint32_t cpu = hdr->cpu_; // NodeThreadId().hash() % HERMES_SYSTEM_INFO->ncpu_; uint32_t cpu_start = cpu * num_free_lists_; pair> &first_free_list = From f68044bf8013579dd189af1814e46afe042b6925 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 23:12:33 -0500 Subject: [PATCH 24/44] Change MPIIO + STDIO to use RealApi --- adapter/mpiio/mpiio_api.h | 173 +++++++------------------------------- adapter/stdio/stdio_api.h | 161 ++++++----------------------------- 2 files changed, 58 insertions(+), 276 deletions(-) diff --git a/adapter/mpiio/mpiio_api.h b/adapter/mpiio/mpiio_api.h index b50bbc847..8ca86c43d 100644 --- a/adapter/mpiio/mpiio_api.h +++ b/adapter/mpiio/mpiio_api.h @@ -58,7 +58,7 @@ typedef int (*MPI_File_sync_t)(MPI_File fh); namespace hermes::adapter::fs { /** Pointers to the real mpiio API */ -class MpiioApi { +class MpiioApi : public RealApi { public: /** MPI_Init */ MPI_Init_t MPI_Init = nullptr; @@ -117,175 +117,62 @@ class MpiioApi { /** MPI_File_sync */ MPI_File_sync_t MPI_File_sync = nullptr; - MpiioApi() { - void *is_intercepted = (void*)dlsym(RTLD_DEFAULT, "mpiio_intercepted"); - if (is_intercepted) { - MPI_Init = (MPI_Init_t)dlsym(RTLD_NEXT, "MPI_Init"); - } else { - MPI_Init = (MPI_Init_t)dlsym(RTLD_DEFAULT, "MPI_Init"); - } + MpiioApi() : RealApi("MPI_Init", "mpiio_intercepted") { + MPI_Init = (MPI_Init_t)dlsym(real_lib_, "MPI_Init"); REQUIRE_API(MPI_Init) - if (is_intercepted) { - MPI_Finalize = (MPI_Finalize_t)dlsym(RTLD_NEXT, "MPI_Finalize"); - } else { - MPI_Finalize = 
(MPI_Finalize_t)dlsym(RTLD_DEFAULT, "MPI_Finalize"); - } + MPI_Finalize = (MPI_Finalize_t)dlsym(real_lib_, "MPI_Finalize"); REQUIRE_API(MPI_Finalize) - if (is_intercepted) { - MPI_Wait = (MPI_Wait_t)dlsym(RTLD_NEXT, "MPI_Wait"); - } else { - MPI_Wait = (MPI_Wait_t)dlsym(RTLD_DEFAULT, "MPI_Wait"); - } + MPI_Wait = (MPI_Wait_t)dlsym(real_lib_, "MPI_Wait"); REQUIRE_API(MPI_Wait) - if (is_intercepted) { - MPI_Waitall = (MPI_Waitall_t)dlsym(RTLD_NEXT, "MPI_Waitall"); - } else { - MPI_Waitall = (MPI_Waitall_t)dlsym(RTLD_DEFAULT, "MPI_Waitall"); - } + MPI_Waitall = (MPI_Waitall_t)dlsym(real_lib_, "MPI_Waitall"); REQUIRE_API(MPI_Waitall) - if (is_intercepted) { - MPI_File_open = (MPI_File_open_t)dlsym(RTLD_NEXT, "MPI_File_open"); - } else { - MPI_File_open = (MPI_File_open_t)dlsym(RTLD_DEFAULT, "MPI_File_open"); - } + MPI_File_open = (MPI_File_open_t)dlsym(real_lib_, "MPI_File_open"); REQUIRE_API(MPI_File_open) - if (is_intercepted) { - MPI_File_close = (MPI_File_close_t)dlsym(RTLD_NEXT, "MPI_File_close"); - } else { - MPI_File_close = (MPI_File_close_t)dlsym(RTLD_DEFAULT, "MPI_File_close"); - } + MPI_File_close = (MPI_File_close_t)dlsym(real_lib_, "MPI_File_close"); REQUIRE_API(MPI_File_close) - if (is_intercepted) { - MPI_File_seek_shared = (MPI_File_seek_shared_t)dlsym(RTLD_NEXT, "MPI_File_seek_shared"); - } else { - MPI_File_seek_shared = (MPI_File_seek_shared_t)dlsym(RTLD_DEFAULT, "MPI_File_seek_shared"); - } + MPI_File_seek_shared = (MPI_File_seek_shared_t)dlsym(real_lib_, "MPI_File_seek_shared"); REQUIRE_API(MPI_File_seek_shared) - if (is_intercepted) { - MPI_File_seek = (MPI_File_seek_t)dlsym(RTLD_NEXT, "MPI_File_seek"); - } else { - MPI_File_seek = (MPI_File_seek_t)dlsym(RTLD_DEFAULT, "MPI_File_seek"); - } + MPI_File_seek = (MPI_File_seek_t)dlsym(real_lib_, "MPI_File_seek"); REQUIRE_API(MPI_File_seek) - if (is_intercepted) { - MPI_File_get_position = (MPI_File_get_position_t)dlsym(RTLD_NEXT, "MPI_File_get_position"); - } else { - MPI_File_get_position = 
(MPI_File_get_position_t)dlsym(RTLD_DEFAULT, "MPI_File_get_position"); - } + MPI_File_get_position = (MPI_File_get_position_t)dlsym(real_lib_, "MPI_File_get_position"); REQUIRE_API(MPI_File_get_position) - if (is_intercepted) { - MPI_File_read_all = (MPI_File_read_all_t)dlsym(RTLD_NEXT, "MPI_File_read_all"); - } else { - MPI_File_read_all = (MPI_File_read_all_t)dlsym(RTLD_DEFAULT, "MPI_File_read_all"); - } + MPI_File_read_all = (MPI_File_read_all_t)dlsym(real_lib_, "MPI_File_read_all"); REQUIRE_API(MPI_File_read_all) - if (is_intercepted) { - MPI_File_read_at_all = (MPI_File_read_at_all_t)dlsym(RTLD_NEXT, "MPI_File_read_at_all"); - } else { - MPI_File_read_at_all = (MPI_File_read_at_all_t)dlsym(RTLD_DEFAULT, "MPI_File_read_at_all"); - } + MPI_File_read_at_all = (MPI_File_read_at_all_t)dlsym(real_lib_, "MPI_File_read_at_all"); REQUIRE_API(MPI_File_read_at_all) - if (is_intercepted) { - MPI_File_read_at = (MPI_File_read_at_t)dlsym(RTLD_NEXT, "MPI_File_read_at"); - } else { - MPI_File_read_at = (MPI_File_read_at_t)dlsym(RTLD_DEFAULT, "MPI_File_read_at"); - } + MPI_File_read_at = (MPI_File_read_at_t)dlsym(real_lib_, "MPI_File_read_at"); REQUIRE_API(MPI_File_read_at) - if (is_intercepted) { - MPI_File_read = (MPI_File_read_t)dlsym(RTLD_NEXT, "MPI_File_read"); - } else { - MPI_File_read = (MPI_File_read_t)dlsym(RTLD_DEFAULT, "MPI_File_read"); - } + MPI_File_read = (MPI_File_read_t)dlsym(real_lib_, "MPI_File_read"); REQUIRE_API(MPI_File_read) - if (is_intercepted) { - MPI_File_read_ordered = (MPI_File_read_ordered_t)dlsym(RTLD_NEXT, "MPI_File_read_ordered"); - } else { - MPI_File_read_ordered = (MPI_File_read_ordered_t)dlsym(RTLD_DEFAULT, "MPI_File_read_ordered"); - } + MPI_File_read_ordered = (MPI_File_read_ordered_t)dlsym(real_lib_, "MPI_File_read_ordered"); REQUIRE_API(MPI_File_read_ordered) - if (is_intercepted) { - MPI_File_read_shared = (MPI_File_read_shared_t)dlsym(RTLD_NEXT, "MPI_File_read_shared"); - } else { - MPI_File_read_shared = 
(MPI_File_read_shared_t)dlsym(RTLD_DEFAULT, "MPI_File_read_shared"); - } + MPI_File_read_shared = (MPI_File_read_shared_t)dlsym(real_lib_, "MPI_File_read_shared"); REQUIRE_API(MPI_File_read_shared) - if (is_intercepted) { - MPI_File_write_all = (MPI_File_write_all_t)dlsym(RTLD_NEXT, "MPI_File_write_all"); - } else { - MPI_File_write_all = (MPI_File_write_all_t)dlsym(RTLD_DEFAULT, "MPI_File_write_all"); - } + MPI_File_write_all = (MPI_File_write_all_t)dlsym(real_lib_, "MPI_File_write_all"); REQUIRE_API(MPI_File_write_all) - if (is_intercepted) { - MPI_File_write_at_all = (MPI_File_write_at_all_t)dlsym(RTLD_NEXT, "MPI_File_write_at_all"); - } else { - MPI_File_write_at_all = (MPI_File_write_at_all_t)dlsym(RTLD_DEFAULT, "MPI_File_write_at_all"); - } + MPI_File_write_at_all = (MPI_File_write_at_all_t)dlsym(real_lib_, "MPI_File_write_at_all"); REQUIRE_API(MPI_File_write_at_all) - if (is_intercepted) { - MPI_File_write_at = (MPI_File_write_at_t)dlsym(RTLD_NEXT, "MPI_File_write_at"); - } else { - MPI_File_write_at = (MPI_File_write_at_t)dlsym(RTLD_DEFAULT, "MPI_File_write_at"); - } + MPI_File_write_at = (MPI_File_write_at_t)dlsym(real_lib_, "MPI_File_write_at"); REQUIRE_API(MPI_File_write_at) - if (is_intercepted) { - MPI_File_write = (MPI_File_write_t)dlsym(RTLD_NEXT, "MPI_File_write"); - } else { - MPI_File_write = (MPI_File_write_t)dlsym(RTLD_DEFAULT, "MPI_File_write"); - } + MPI_File_write = (MPI_File_write_t)dlsym(real_lib_, "MPI_File_write"); REQUIRE_API(MPI_File_write) - if (is_intercepted) { - MPI_File_write_ordered = (MPI_File_write_ordered_t)dlsym(RTLD_NEXT, "MPI_File_write_ordered"); - } else { - MPI_File_write_ordered = (MPI_File_write_ordered_t)dlsym(RTLD_DEFAULT, "MPI_File_write_ordered"); - } + MPI_File_write_ordered = (MPI_File_write_ordered_t)dlsym(real_lib_, "MPI_File_write_ordered"); REQUIRE_API(MPI_File_write_ordered) - if (is_intercepted) { - MPI_File_write_shared = (MPI_File_write_shared_t)dlsym(RTLD_NEXT, "MPI_File_write_shared"); - } else { - 
MPI_File_write_shared = (MPI_File_write_shared_t)dlsym(RTLD_DEFAULT, "MPI_File_write_shared"); - } + MPI_File_write_shared = (MPI_File_write_shared_t)dlsym(real_lib_, "MPI_File_write_shared"); REQUIRE_API(MPI_File_write_shared) - if (is_intercepted) { - MPI_File_iread_at = (MPI_File_iread_at_t)dlsym(RTLD_NEXT, "MPI_File_iread_at"); - } else { - MPI_File_iread_at = (MPI_File_iread_at_t)dlsym(RTLD_DEFAULT, "MPI_File_iread_at"); - } + MPI_File_iread_at = (MPI_File_iread_at_t)dlsym(real_lib_, "MPI_File_iread_at"); REQUIRE_API(MPI_File_iread_at) - if (is_intercepted) { - MPI_File_iread = (MPI_File_iread_t)dlsym(RTLD_NEXT, "MPI_File_iread"); - } else { - MPI_File_iread = (MPI_File_iread_t)dlsym(RTLD_DEFAULT, "MPI_File_iread"); - } + MPI_File_iread = (MPI_File_iread_t)dlsym(real_lib_, "MPI_File_iread"); REQUIRE_API(MPI_File_iread) - if (is_intercepted) { - MPI_File_iread_shared = (MPI_File_iread_shared_t)dlsym(RTLD_NEXT, "MPI_File_iread_shared"); - } else { - MPI_File_iread_shared = (MPI_File_iread_shared_t)dlsym(RTLD_DEFAULT, "MPI_File_iread_shared"); - } + MPI_File_iread_shared = (MPI_File_iread_shared_t)dlsym(real_lib_, "MPI_File_iread_shared"); REQUIRE_API(MPI_File_iread_shared) - if (is_intercepted) { - MPI_File_iwrite_at = (MPI_File_iwrite_at_t)dlsym(RTLD_NEXT, "MPI_File_iwrite_at"); - } else { - MPI_File_iwrite_at = (MPI_File_iwrite_at_t)dlsym(RTLD_DEFAULT, "MPI_File_iwrite_at"); - } + MPI_File_iwrite_at = (MPI_File_iwrite_at_t)dlsym(real_lib_, "MPI_File_iwrite_at"); REQUIRE_API(MPI_File_iwrite_at) - if (is_intercepted) { - MPI_File_iwrite = (MPI_File_iwrite_t)dlsym(RTLD_NEXT, "MPI_File_iwrite"); - } else { - MPI_File_iwrite = (MPI_File_iwrite_t)dlsym(RTLD_DEFAULT, "MPI_File_iwrite"); - } + MPI_File_iwrite = (MPI_File_iwrite_t)dlsym(real_lib_, "MPI_File_iwrite"); REQUIRE_API(MPI_File_iwrite) - if (is_intercepted) { - MPI_File_iwrite_shared = (MPI_File_iwrite_shared_t)dlsym(RTLD_NEXT, "MPI_File_iwrite_shared"); - } else { - MPI_File_iwrite_shared = 
(MPI_File_iwrite_shared_t)dlsym(RTLD_DEFAULT, "MPI_File_iwrite_shared"); - } + MPI_File_iwrite_shared = (MPI_File_iwrite_shared_t)dlsym(real_lib_, "MPI_File_iwrite_shared"); REQUIRE_API(MPI_File_iwrite_shared) - if (is_intercepted) { - MPI_File_sync = (MPI_File_sync_t)dlsym(RTLD_NEXT, "MPI_File_sync"); - } else { - MPI_File_sync = (MPI_File_sync_t)dlsym(RTLD_DEFAULT, "MPI_File_sync"); - } + MPI_File_sync = (MPI_File_sync_t)dlsym(real_lib_, "MPI_File_sync"); REQUIRE_API(MPI_File_sync) } }; diff --git a/adapter/stdio/stdio_api.h b/adapter/stdio/stdio_api.h index 7c3f1887a..0e8c1bcda 100644 --- a/adapter/stdio/stdio_api.h +++ b/adapter/stdio/stdio_api.h @@ -51,7 +51,7 @@ typedef long int (*ftell_t)(FILE * fp); namespace hermes::adapter::fs { /** Pointers to the real stdio API */ -class StdioApi { +class StdioApi : public RealApi { public: /** fopen */ fopen_t fopen = nullptr; @@ -106,163 +106,58 @@ class StdioApi { /** ftell */ ftell_t ftell = nullptr; - StdioApi() { - void *is_intercepted = (void*)dlsym(RTLD_DEFAULT, "stdio_intercepted"); - if (is_intercepted) { - fopen = (fopen_t)dlsym(RTLD_NEXT, "fopen"); - } else { - fopen = (fopen_t)dlsym(RTLD_DEFAULT, "fopen"); - } + StdioApi() : RealApi("fopen", "stdio_intercepted") { + fopen = (fopen_t)dlsym(real_lib_, "fopen"); REQUIRE_API(fopen) - if (is_intercepted) { - fopen64 = (fopen64_t)dlsym(RTLD_NEXT, "fopen64"); - } else { - fopen64 = (fopen64_t)dlsym(RTLD_DEFAULT, "fopen64"); - } + fopen64 = (fopen64_t)dlsym(real_lib_, "fopen64"); REQUIRE_API(fopen64) - if (is_intercepted) { - fdopen = (fdopen_t)dlsym(RTLD_NEXT, "fdopen"); - } else { - fdopen = (fdopen_t)dlsym(RTLD_DEFAULT, "fdopen"); - } + fdopen = (fdopen_t)dlsym(real_lib_, "fdopen"); REQUIRE_API(fdopen) - if (is_intercepted) { - freopen = (freopen_t)dlsym(RTLD_NEXT, "freopen"); - } else { - freopen = (freopen_t)dlsym(RTLD_DEFAULT, "freopen"); - } + freopen = (freopen_t)dlsym(real_lib_, "freopen"); REQUIRE_API(freopen) - if (is_intercepted) { - freopen64 = 
(freopen64_t)dlsym(RTLD_NEXT, "freopen64"); - } else { - freopen64 = (freopen64_t)dlsym(RTLD_DEFAULT, "freopen64"); - } + freopen64 = (freopen64_t)dlsym(real_lib_, "freopen64"); REQUIRE_API(freopen64) - if (is_intercepted) { - fflush = (fflush_t)dlsym(RTLD_NEXT, "fflush"); - } else { - fflush = (fflush_t)dlsym(RTLD_DEFAULT, "fflush"); - } + fflush = (fflush_t)dlsym(real_lib_, "fflush"); REQUIRE_API(fflush) - if (is_intercepted) { - fclose = (fclose_t)dlsym(RTLD_NEXT, "fclose"); - } else { - fclose = (fclose_t)dlsym(RTLD_DEFAULT, "fclose"); - } + fclose = (fclose_t)dlsym(real_lib_, "fclose"); REQUIRE_API(fclose) - if (is_intercepted) { - fwrite = (fwrite_t)dlsym(RTLD_NEXT, "fwrite"); - } else { - fwrite = (fwrite_t)dlsym(RTLD_DEFAULT, "fwrite"); - } + fwrite = (fwrite_t)dlsym(real_lib_, "fwrite"); REQUIRE_API(fwrite) - if (is_intercepted) { - fputc = (fputc_t)dlsym(RTLD_NEXT, "fputc"); - } else { - fputc = (fputc_t)dlsym(RTLD_DEFAULT, "fputc"); - } + fputc = (fputc_t)dlsym(real_lib_, "fputc"); REQUIRE_API(fputc) - if (is_intercepted) { - fgetpos = (fgetpos_t)dlsym(RTLD_NEXT, "fgetpos"); - } else { - fgetpos = (fgetpos_t)dlsym(RTLD_DEFAULT, "fgetpos"); - } + fgetpos = (fgetpos_t)dlsym(real_lib_, "fgetpos"); REQUIRE_API(fgetpos) - if (is_intercepted) { - fgetpos64 = (fgetpos64_t)dlsym(RTLD_NEXT, "fgetpos64"); - } else { - fgetpos64 = (fgetpos64_t)dlsym(RTLD_DEFAULT, "fgetpos64"); - } + fgetpos64 = (fgetpos64_t)dlsym(real_lib_, "fgetpos64"); REQUIRE_API(fgetpos64) - if (is_intercepted) { - putc = (putc_t)dlsym(RTLD_NEXT, "putc"); - } else { - putc = (putc_t)dlsym(RTLD_DEFAULT, "putc"); - } + putc = (putc_t)dlsym(real_lib_, "putc"); REQUIRE_API(putc) - if (is_intercepted) { - putw = (putw_t)dlsym(RTLD_NEXT, "putw"); - } else { - putw = (putw_t)dlsym(RTLD_DEFAULT, "putw"); - } + putw = (putw_t)dlsym(real_lib_, "putw"); REQUIRE_API(putw) - if (is_intercepted) { - fputs = (fputs_t)dlsym(RTLD_NEXT, "fputs"); - } else { - fputs = (fputs_t)dlsym(RTLD_DEFAULT, "fputs"); - } + 
fputs = (fputs_t)dlsym(real_lib_, "fputs"); REQUIRE_API(fputs) - if (is_intercepted) { - fread = (fread_t)dlsym(RTLD_NEXT, "fread"); - } else { - fread = (fread_t)dlsym(RTLD_DEFAULT, "fread"); - } + fread = (fread_t)dlsym(real_lib_, "fread"); REQUIRE_API(fread) - if (is_intercepted) { - fgetc = (fgetc_t)dlsym(RTLD_NEXT, "fgetc"); - } else { - fgetc = (fgetc_t)dlsym(RTLD_DEFAULT, "fgetc"); - } + fgetc = (fgetc_t)dlsym(real_lib_, "fgetc"); REQUIRE_API(fgetc) - if (is_intercepted) { - getc = (getc_t)dlsym(RTLD_NEXT, "getc"); - } else { - getc = (getc_t)dlsym(RTLD_DEFAULT, "getc"); - } + getc = (getc_t)dlsym(real_lib_, "getc"); REQUIRE_API(getc) - if (is_intercepted) { - getw = (getw_t)dlsym(RTLD_NEXT, "getw"); - } else { - getw = (getw_t)dlsym(RTLD_DEFAULT, "getw"); - } + getw = (getw_t)dlsym(real_lib_, "getw"); REQUIRE_API(getw) - if (is_intercepted) { - fgets = (fgets_t)dlsym(RTLD_NEXT, "fgets"); - } else { - fgets = (fgets_t)dlsym(RTLD_DEFAULT, "fgets"); - } + fgets = (fgets_t)dlsym(real_lib_, "fgets"); REQUIRE_API(fgets) - if (is_intercepted) { - rewind = (rewind_t)dlsym(RTLD_NEXT, "rewind"); - } else { - rewind = (rewind_t)dlsym(RTLD_DEFAULT, "rewind"); - } + rewind = (rewind_t)dlsym(real_lib_, "rewind"); REQUIRE_API(rewind) - if (is_intercepted) { - fseek = (fseek_t)dlsym(RTLD_NEXT, "fseek"); - } else { - fseek = (fseek_t)dlsym(RTLD_DEFAULT, "fseek"); - } + fseek = (fseek_t)dlsym(real_lib_, "fseek"); REQUIRE_API(fseek) - if (is_intercepted) { - fseeko = (fseeko_t)dlsym(RTLD_NEXT, "fseeko"); - } else { - fseeko = (fseeko_t)dlsym(RTLD_DEFAULT, "fseeko"); - } + fseeko = (fseeko_t)dlsym(real_lib_, "fseeko"); REQUIRE_API(fseeko) - if (is_intercepted) { - fseeko64 = (fseeko64_t)dlsym(RTLD_NEXT, "fseeko64"); - } else { - fseeko64 = (fseeko64_t)dlsym(RTLD_DEFAULT, "fseeko64"); - } + fseeko64 = (fseeko64_t)dlsym(real_lib_, "fseeko64"); REQUIRE_API(fseeko64) - if (is_intercepted) { - fsetpos = (fsetpos_t)dlsym(RTLD_NEXT, "fsetpos"); - } else { - fsetpos = 
(fsetpos_t)dlsym(RTLD_DEFAULT, "fsetpos"); - } + fsetpos = (fsetpos_t)dlsym(real_lib_, "fsetpos"); REQUIRE_API(fsetpos) - if (is_intercepted) { - fsetpos64 = (fsetpos64_t)dlsym(RTLD_NEXT, "fsetpos64"); - } else { - fsetpos64 = (fsetpos64_t)dlsym(RTLD_DEFAULT, "fsetpos64"); - } + fsetpos64 = (fsetpos64_t)dlsym(real_lib_, "fsetpos64"); REQUIRE_API(fsetpos64) - if (is_intercepted) { - ftell = (ftell_t)dlsym(RTLD_NEXT, "ftell"); - } else { - ftell = (ftell_t)dlsym(RTLD_DEFAULT, "ftell"); - } + ftell = (ftell_t)dlsym(real_lib_, "ftell"); REQUIRE_API(ftell) } }; From 9b75c3221e2bd808c3f7b2d34308a5261ae71ed5 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 23:23:07 -0500 Subject: [PATCH 25/44] Use mpi_fh_ correctly --- adapter/mpiio/mpiio_io_client.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/adapter/mpiio/mpiio_io_client.cc b/adapter/mpiio/mpiio_io_client.cc index a26b68cdc..17584cc16 100644 --- a/adapter/mpiio/mpiio_io_client.cc +++ b/adapter/mpiio/mpiio_io_client.cc @@ -33,6 +33,7 @@ void MpiioIoClient::RealOpen(File &f, if (f.mpi_status_ != MPI_SUCCESS) { f.status_ = false; } + f.hermes_mpi_fh_ = stat.mpi_fh_; /*if (stat.hflags_.Any(HERMES_FS_CREATE)) { if (stat.adapter_mode_ != AdapterMode::kScratch) { From bac60e99340756cb4ee07e333562fa0a40986499 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 29 May 2023 23:25:39 -0500 Subject: [PATCH 26/44] Make hermes and stat fh equal for mpi --- adapter/mpiio/mpiio_io_client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adapter/mpiio/mpiio_io_client.cc b/adapter/mpiio/mpiio_io_client.cc index 17584cc16..ab99fb1e9 100644 --- a/adapter/mpiio/mpiio_io_client.cc +++ b/adapter/mpiio/mpiio_io_client.cc @@ -33,7 +33,6 @@ void MpiioIoClient::RealOpen(File &f, if (f.mpi_status_ != MPI_SUCCESS) { f.status_ = false; } - f.hermes_mpi_fh_ = stat.mpi_fh_; /*if (stat.hflags_.Any(HERMES_FS_CREATE)) { if (stat.adapter_mode_ != AdapterMode::kScratch) { @@ -63,7 +62,8 @@ void 
MpiioIoClient::RealOpen(File &f, void MpiioIoClient::HermesOpen(File &f, const AdapterStat &stat, FilesystemIoClientState &fs_mdm) { - f.hermes_mpi_fh_ = (MPI_File)fs_mdm.stat_; + // f.hermes_mpi_fh_ = (MPI_File)fs_mdm.stat_; + f.hermes_mpi_fh_ = stat.mpi_fh_; } /** Synchronize \a file FILE f */ From 02b21f8281b45f23ae723b87607399fef8ba2290 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 00:31:31 -0500 Subject: [PATCH 27/44] Update jarvis-util --- adapter/test/mpiio/tests.py | 3 +- adapter/test/posix/tests.py | 3 +- adapter/test/stdio/tests.py | 3 +- adapter/test/vfd/tests.py | 3 +- ci/jarvis-util/.coveragerc | 4 + ci/jarvis-util/.gitignore | 1 + ci/jarvis-util/LICENSE | 21 + ci/jarvis-util/README.md | 32 +- ci/jarvis-util/bin/jarvis-imports | 8 + ci/jarvis-util/ci/cluster/Dockerfile | 55 ++ ci/jarvis-util/ci/cluster/docker-compose.yml | 24 + ci/jarvis-util/ci/install_deps.sh | 12 + ci/jarvis-util/ci/install_jarvis.sh | 3 + ci/jarvis-util/ci/install_spack.sh | 12 + ci/jarvis-util/ci/lint.sh | 2 + ci/jarvis-util/ci/run_tests.sh | 5 + ci/jarvis-util/jarvis_util/__init__.py | 25 + .../jarvis_util/introspect/fi_info.py | 6 - .../jarvis_util/introspect/system_info.py | 579 +++++++++++++++++- ci/jarvis-util/jarvis_util/jutil_manager.py | 55 +- .../jarvis_util/serialize/ini_file.py | 13 +- .../jarvis_util/serialize/pickle.py | 13 +- .../jarvis_util/serialize/serializer.py | 15 +- .../jarvis_util/serialize/text_file.py | 15 +- .../jarvis_util/serialize/yaml_file.py | 12 +- ci/jarvis-util/jarvis_util/shell/exec.py | 23 + ci/jarvis-util/jarvis_util/shell/exec_info.py | 144 ++++- .../jarvis_util/shell/filesystem.py | 69 +++ ci/jarvis-util/jarvis_util/shell/kill.py | 7 - .../jarvis_util/shell/local_exec.py | 46 +- ci/jarvis-util/jarvis_util/shell/mpi_exec.py | 25 +- ci/jarvis-util/jarvis_util/shell/process.py | 22 + ci/jarvis-util/jarvis_util/shell/pscp.py | 58 ++ ci/jarvis-util/jarvis_util/shell/pssh_exec.py | 37 +- ci/jarvis-util/jarvis_util/shell/rm.py 
| 7 - ci/jarvis-util/jarvis_util/shell/scp.py | 115 ++++ ci/jarvis-util/jarvis_util/shell/ssh_exec.py | 33 +- ci/jarvis-util/jarvis_util/util/argparse.py | 192 +++++- ci/jarvis-util/jarvis_util/util/expand_env.py | 25 + ci/jarvis-util/jarvis_util/util/hostfile.py | 95 ++- ci/jarvis-util/jarvis_util/util/import_all.py | 72 +++ ci/jarvis-util/jarvis_util/util/import_mod.py | 5 + ci/jarvis-util/jarvis_util/util/naming.py | 13 +- ci/jarvis-util/jarvis_util/util/size_conv.py | 39 +- ci/jarvis-util/requirements.txt | 11 +- ci/jarvis-util/test/unit/argparse_main.py | 4 + .../test/unit/{print10s.py => print5s.py} | 6 +- ci/jarvis-util/test/unit/printNone.py | 4 + ci/jarvis-util/test/unit/test_argparse.py | 10 + ci/jarvis-util/test/unit/test_fi_info.py | 4 - ci/jarvis-util/test/unit/test_hostfile.py | 115 ++-- ci/jarvis-util/test/unit/test_hostfile.txt | 2 +- ci/jarvis-util/test/unit/test_local_exec.py | 71 ++- ci/jarvis-util/test/unit/test_system_info.py | 137 +++++ ci/py_hermes_ci/py_hermes_ci/test_manager.py | 27 +- src/buffer_organizer.cc | 2 +- src/config_server_default.h | 6 + test/tests.py | 3 +- wrapper/java/tests.py | 3 +- 59 files changed, 2013 insertions(+), 343 deletions(-) create mode 100644 ci/jarvis-util/.coveragerc create mode 100644 ci/jarvis-util/LICENSE create mode 100755 ci/jarvis-util/bin/jarvis-imports create mode 100644 ci/jarvis-util/ci/cluster/Dockerfile create mode 100644 ci/jarvis-util/ci/cluster/docker-compose.yml create mode 100644 ci/jarvis-util/ci/install_deps.sh create mode 100644 ci/jarvis-util/ci/install_jarvis.sh create mode 100644 ci/jarvis-util/ci/install_spack.sh create mode 100644 ci/jarvis-util/ci/lint.sh create mode 100644 ci/jarvis-util/ci/run_tests.sh create mode 100644 ci/jarvis-util/jarvis_util/__init__.py delete mode 100644 ci/jarvis-util/jarvis_util/introspect/fi_info.py create mode 100644 ci/jarvis-util/jarvis_util/shell/filesystem.py delete mode 100644 ci/jarvis-util/jarvis_util/shell/kill.py create mode 100644 
ci/jarvis-util/jarvis_util/shell/process.py create mode 100644 ci/jarvis-util/jarvis_util/shell/pscp.py delete mode 100644 ci/jarvis-util/jarvis_util/shell/rm.py create mode 100644 ci/jarvis-util/jarvis_util/util/expand_env.py create mode 100644 ci/jarvis-util/jarvis_util/util/import_all.py create mode 100644 ci/jarvis-util/test/unit/argparse_main.py rename ci/jarvis-util/test/unit/{print10s.py => print5s.py} (59%) create mode 100644 ci/jarvis-util/test/unit/printNone.py create mode 100644 ci/jarvis-util/test/unit/test_argparse.py delete mode 100644 ci/jarvis-util/test/unit/test_fi_info.py create mode 100644 ci/jarvis-util/test/unit/test_system_info.py diff --git a/adapter/test/mpiio/tests.py b/adapter/test/mpiio/tests.py index 054435991..26c575672 100644 --- a/adapter/test/mpiio/tests.py +++ b/adapter/test/mpiio/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class MpiioTestManager(TestManager): diff --git a/adapter/test/posix/tests.py b/adapter/test/posix/tests.py index 93e24dde3..6ddf5bfb8 100644 --- a/adapter/test/posix/tests.py +++ b/adapter/test/posix/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class PosixTestManager(TestManager): diff --git a/adapter/test/stdio/tests.py b/adapter/test/stdio/tests.py index 155a16ac6..bbcf12ce8 100644 --- a/adapter/test/stdio/tests.py +++ b/adapter/test/stdio/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class StdioTestManager(TestManager): diff --git a/adapter/test/vfd/tests.py b/adapter/test/vfd/tests.py index bb563b28e..132a1a545 100644 --- a/adapter/test/vfd/tests.py +++ 
b/adapter/test/vfd/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class VfdTestManager(TestManager): diff --git a/ci/jarvis-util/.coveragerc b/ci/jarvis-util/.coveragerc new file mode 100644 index 000000000..f1db9952b --- /dev/null +++ b/ci/jarvis-util/.coveragerc @@ -0,0 +1,4 @@ +# .coveragerc +[run] +source = . +omit = *test* \ No newline at end of file diff --git a/ci/jarvis-util/.gitignore b/ci/jarvis-util/.gitignore index 182cc843f..80ced04e4 100644 --- a/ci/jarvis-util/.gitignore +++ b/ci/jarvis-util/.gitignore @@ -4,6 +4,7 @@ __pycache__/ *$py.class .idea hostfile.txt +lcov.info # C extensions *.so diff --git a/ci/jarvis-util/LICENSE b/ci/jarvis-util/LICENSE new file mode 100644 index 000000000..5db7c2cc8 --- /dev/null +++ b/ci/jarvis-util/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022-present Luke Logan and other contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/ci/jarvis-util/README.md b/ci/jarvis-util/README.md index 2f92469fa..711a1466e 100644 --- a/ci/jarvis-util/README.md +++ b/ci/jarvis-util/README.md @@ -5,6 +5,19 @@ creating shell scripts within Python. This library contains wrappers for executing shell commands locally, SSH, SCP, MPI, argument parsing, and various other random utilities. +![Build](https://github.com/lukemartinlogan/jarvis-util/workflows/GitHub%20Actions/badge.svg) + +[![Coverage Status](https://coveralls.io/repos/github/lukemartinlogan/jarvis-util/badge.svg?branch=master)](https://coveralls.io/github/lukemartinlogan/jarvis-util?branch=master) + +## Installation + +For now, we only consider manual installation +```bash +cd jarvis-util +python3 -m pip install -r requirements.txt +python3 -m pip install -e . +``` + ## Executing a program The following code will execute a command on the local machine. @@ -85,4 +98,21 @@ ares-comp-[02-04] ares-comp-[05-09,11,12-14]-40g ``` -These will be expanded internally by PSSH and MPI. \ No newline at end of file +These will be expanded internally by PSSH and MPI. + +# Unit tests + +We run our unit tests in a docker container, which is located underneath +the CI directory. This is because we need to test multi-node functionality, +without having multiple nodes. To setup unit testing, perform the following: + +1. Install Docker +2. Setup our "ci" container +3. Run the unit tests + +``` +``` + +# Contributing + +We use the Google Python Style guide (pylintrc). 
\ No newline at end of file diff --git a/ci/jarvis-util/bin/jarvis-imports b/ci/jarvis-util/bin/jarvis-imports new file mode 100755 index 000000000..25fc19a9a --- /dev/null +++ b/ci/jarvis-util/bin/jarvis-imports @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +from jarvis_util.util.import_all import * +import pathlib +import os + + +build_global_import_from_bin('jarvis_util') diff --git a/ci/jarvis-util/ci/cluster/Dockerfile b/ci/jarvis-util/ci/cluster/Dockerfile new file mode 100644 index 000000000..548937e02 --- /dev/null +++ b/ci/jarvis-util/ci/cluster/Dockerfile @@ -0,0 +1,55 @@ +# Install ubuntu 20.04 +FROM ubuntu:20.04 +LABEL maintainer="llogan@hawk.iit.edu" +LABEL version="0.0" +LABEL description="An example docker image" + +# Disable Prompt During Packages Installation +ARG DEBIAN_FRONTEND=noninteractive + +# Update ubuntu +RUN apt update && apt install + +# Install some basic packages +RUN apt install -y \ + openssh-server \ + sudo git nano vim \ + docker \ + mpich \ + gcc \ + g++ \ + gfortran \ + libtool \ + libtool-bin \ + automake \ + autoconf + +# Create a new user +RUN useradd -m sshuser +RUN usermod -aG sudo sshuser +RUN passwd -d sshuser + +# Copy the host's SSH keys +# Docker requires COPY be relative to the current working +# directory. We cannot pass ~/.ssh/id_rsa unfortunately... 
+ENV SSHDIR=/home/sshuser/.ssh +RUN sudo -u sshuser mkdir ${SSHDIR} +COPY id_rsa ${SSHDIR}/id_rsa +COPY id_rsa.pub ${SSHDIR}/id_rsa.pub + +# Authorize host SSH keys +RUN sudo -u sshuser touch ${SSHDIR}/authorized_keys +RUN cat ${SSHDIR}/id_rsa.pub >> ${SSHDIR}/authorized_keys + +# Set SSH permissions +RUN chmod 700 ${SSHDIR} +RUN chmod 644 ${SSHDIR}/id_rsa.pub +RUN chmod 600 ${SSHDIR}/id_rsa +RUN chmod 600 ${SSHDIR}/authorized_keys + +# Enable passwordless SSH +RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/' /etc/ssh/sshd_config + +# Start SSHD and wait forever +RUN mkdir /run/sshd +CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git a/ci/jarvis-util/ci/cluster/docker-compose.yml b/ci/jarvis-util/ci/cluster/docker-compose.yml new file mode 100644 index 000000000..aa7477089 --- /dev/null +++ b/ci/jarvis-util/ci/cluster/docker-compose.yml @@ -0,0 +1,24 @@ +version: "3" + +services: + node1: + build: . + links: + - node2 + networks: + - net + hostname: node1 + stdin_open: true + tty: true + + node2: + build: . + networks: + - net + hostname: node2 + stdin_open: true + tty: true + +networks: + net: + driver: bridge \ No newline at end of file diff --git a/ci/jarvis-util/ci/install_deps.sh b/ci/jarvis-util/ci/install_deps.sh new file mode 100644 index 000000000..0d1f0447d --- /dev/null +++ b/ci/jarvis-util/ci/install_deps.sh @@ -0,0 +1,12 @@ +#!/bin/bash +sudo apt update +sudo apt install -y \ +docker \ +mpich \ +gcc \ +g++ \ +gfortran \ +libtool \ +libtool-bin \ +automake \ +autoconf \ No newline at end of file diff --git a/ci/jarvis-util/ci/install_jarvis.sh b/ci/jarvis-util/ci/install_jarvis.sh new file mode 100644 index 000000000..58aa283c0 --- /dev/null +++ b/ci/jarvis-util/ci/install_jarvis.sh @@ -0,0 +1,3 @@ +#!/bin/bash +python3 -m pip install -r requirements.txt +python3 -m pip install -e . 
diff --git a/ci/jarvis-util/ci/install_spack.sh b/ci/jarvis-util/ci/install_spack.sh new file mode 100644 index 000000000..0d1f0447d --- /dev/null +++ b/ci/jarvis-util/ci/install_spack.sh @@ -0,0 +1,12 @@ +#!/bin/bash +sudo apt update +sudo apt install -y \ +docker \ +mpich \ +gcc \ +g++ \ +gfortran \ +libtool \ +libtool-bin \ +automake \ +autoconf \ No newline at end of file diff --git a/ci/jarvis-util/ci/lint.sh b/ci/jarvis-util/ci/lint.sh new file mode 100644 index 000000000..a1af3ff48 --- /dev/null +++ b/ci/jarvis-util/ci/lint.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pylint "${PWD}"/jarvis_util \ No newline at end of file diff --git a/ci/jarvis-util/ci/run_tests.sh b/ci/jarvis-util/ci/run_tests.sh new file mode 100644 index 000000000..abd4e290a --- /dev/null +++ b/ci/jarvis-util/ci/run_tests.sh @@ -0,0 +1,5 @@ +#!/bin/bash +coverage run -m pytest +rm -rf "*.pyc" +coverage report +coverage-lcov \ No newline at end of file diff --git a/ci/jarvis-util/jarvis_util/__init__.py b/ci/jarvis-util/jarvis_util/__init__.py new file mode 100644 index 000000000..1019dd213 --- /dev/null +++ b/ci/jarvis-util/jarvis_util/__init__.py @@ -0,0 +1,25 @@ +"""Import all modules""" +from jarvis_util.util.expand_env import * +from jarvis_util.util.naming import * +from jarvis_util.util.hostfile import * +from jarvis_util.util.size_conv import * +from jarvis_util.util.import_all import * +from jarvis_util.util.import_mod import * +from jarvis_util.util.argparse import * +from jarvis_util.serialize.ini_file import * +from jarvis_util.serialize.yaml_file import * +from jarvis_util.serialize.text_file import * +from jarvis_util.serialize.serializer import * +from jarvis_util.serialize.pickle import * +from jarvis_util.shell.filesystem import * +from jarvis_util.shell.exec import * +from jarvis_util.shell.exec_info import * +from jarvis_util.shell.ssh_exec import * +from jarvis_util.shell.pssh_exec import * +from jarvis_util.shell.process import * +from jarvis_util.shell.pscp import * +from 
jarvis_util.shell.scp import * +from jarvis_util.shell.mpi_exec import * +from jarvis_util.shell.local_exec import * +from jarvis_util.introspect.system_info import * +from jarvis_util.jutil_manager import * diff --git a/ci/jarvis-util/jarvis_util/introspect/fi_info.py b/ci/jarvis-util/jarvis_util/introspect/fi_info.py deleted file mode 100644 index ceb05b74a..000000000 --- a/ci/jarvis-util/jarvis_util/introspect/fi_info.py +++ /dev/null @@ -1,6 +0,0 @@ -from jarvs_util.shell.exec_node import ExecNode - - -class FiInfo(ExecNode): - def __init__(self): - pass \ No newline at end of file diff --git a/ci/jarvis-util/jarvis_util/introspect/system_info.py b/ci/jarvis-util/jarvis_util/introspect/system_info.py index e20101c33..5a9d94482 100644 --- a/ci/jarvis-util/jarvis_util/introspect/system_info.py +++ b/ci/jarvis-util/jarvis_util/introspect/system_info.py @@ -1,7 +1,27 @@ -import re, platform +""" +This module provides methods for querying the information of the host +system. This can be used to make scripts more portable. 
+""" +import re +import platform +from jarvis_util.shell.exec import Exec +from jarvis_util.util.size_conv import SizeConv +from jarvis_util.serialize.yaml_file import YamlFile +import json +import pandas as pd +import numpy as np +from enum import Enum +import shlex + +# pylint: disable=C0121 class SystemInfo: + """ + This class queries information about the host machine, such as OS, + CPU, and kernel + """ + instance_ = None @staticmethod @@ -9,9 +29,9 @@ def get_instance(): if SystemInfo.instance_ is None: SystemInfo.instance_ = SystemInfo() return SystemInfo.instance_ - + def __init__(self): - with open('/etc/os-release') as fp: + with open('/etc/os-release', 'r', encoding='utf-8') as fp: lines = fp.read().splitlines() self.os = self._detect_os_type(lines) self.os_like = self._detect_os_like_type(lines) @@ -24,7 +44,7 @@ def __init__(self): def _detect_os_type(self, lines): for line in lines: - if "ID=" in line: + if 'ID=' in line: if 'ubuntu' in line: return 'ubuntu' elif 'centos' in line: @@ -34,7 +54,7 @@ def _detect_os_type(self, lines): def _detect_os_like_type(self, lines): for line in lines: - if "ID_LIKE=" in line: + if 'ID_LIKE=' in line: if 'ubuntu' in line: return 'ubuntu' elif 'centos' in line: @@ -64,3 +84,552 @@ def __eq__(self, other): (self.cpu == other.cpu) and (self.cpu_family == other.cpu_family) ) + + +class Lsblk(Exec): + """ + List all block devices in the system per-node. 
Lsblk will return + a JSON output + + A table is stored per-host: + parent: the parent device of the partition (e.g., /dev/sda or NaN) + device: the name of the partition (e.g., /dev/sda1) + size: total size of the partition (bytes) + mount: where the partition is mounted (if anywhere) + model: the exact model of the device + tran: the transport of the device (e.g., /dev/nvme) + rota: whether or not the device is rotational + host: the host this record corresponds to + """ + + def __init__(self, exec_info): + cmd = 'lsblk -o NAME,SIZE,MODEL,TRAN,MOUNTPOINT,ROTA -J -s' + super().__init__(cmd, exec_info.mod(collect_output=True)) + self.exec_async = exec_info.exec_async + self.graph = {} + if not self.exec_async: + self.wait() + + def wait(self): + super().wait() + for host, stdout in self.stdout.items(): + lsblk_data = json.loads(stdout)['blockdevices'] + partitions = [] + devs = {} + for partition in lsblk_data: + dev = partition['children'][0] + partitions.append({ + 'parent': f'/dev/{dev["name"]}', + 'device': f'/dev/{partition["name"]}', + 'size': SizeConv.to_int(partition['size']), + 'mount': partition['mountpoint'], + 'host': host + }) + devs[dev['name']] = { + 'parent': f'/dev/{dev["name"]}', + 'size': SizeConv.to_int(dev['size']), + 'model': dev['model'], + 'tran': dev['tran'].lower(), + 'mount': dev['mountpoint'], + 'rota': dev['rota'], + 'host': host + } + devs = list(devs.values()) + part_df = pd.DataFrame(partitions) + dev_df = pd.DataFrame(devs) + total_df = pd.merge(part_df, + dev_df[['parent', 'model', 'tran', 'host']], + on=['parent', 'host']) + dev_df = dev_df.rename(columns={'parent': 'device'}) + total_df = pd.concat([total_df, dev_df]) + self.df = total_df + + +class Blkid(Exec): + """ + List all filesystems (even those unmounted) and their properties + + Stores a per-host table with the following: + device: the device (or partition) which stores the data (e.g., /dev/sda) + fs_type: the type of filesystem (e.g., ext4) + uuid: filesystem-levle uuid 
from the FS metadata + partuuid: the partition-lable UUID for the partition + host: the host this entry corresponds to + """ + def __init__(self, exec_info): + cmd = 'blkid' + super().__init__(cmd, exec_info.mod(collect_output=True)) + self.exec_async = exec_info.exec_async + self.graph = {} + if not self.exec_async: + self.wait() + + def wait(self): + super().wait() + for host, stdout in self.stdout.items(): + devices = stdout.splitlines() + dev_list = [] + for dev in devices: + dev_dict = {} + toks = shlex.split(dev) + dev_name = toks[0].split(':')[0] + dev_dict['device'] = dev_name + dev_dict['host'] = host + for tok in toks[1:]: + keyval = tok.split('=') + key = keyval[0].lower() + val = ' '.join(keyval[1:]) + dev_dict[key] = val + dev_list.append(dev_dict) + df = pd.DataFrame(dev_list) + df = df.rename(columns={'type': 'fs_type'}) + self.df = df + + +class ListFses(Exec): + """ + List all mounted filesystems + + Will store a per-host dictionary containing: + device: the device which contains the filesystem + fs_size: total size of the filesystem + used: total nubmer of bytes used + avail: total number of bytes remaining + use%: the percent of capacity used + fs_mount: where the filesystem is mounted + host: the host this entry corresponds to + """ + + def __init__(self, exec_info): + cmd = 'df -h' + super().__init__(cmd, exec_info.mod(collect_output=True)) + self.exec_async = exec_info.exec_async + self.graph = {} + if not self.exec_async: + self.wait() + + def wait(self): + super().wait() + for host, stdout in self.stdout.items(): + lines = stdout.strip().splitlines() + columns = ['device', 'fs_size', 'used', + 'avail', 'use%', 'fs_mount', 'host'] + rows = [line.split() + [host] for line in lines[1:]] + df = pd.DataFrame(rows, columns=columns) + # pylint: disable=W0108 + df.loc[:, 'fs_size'] = df['fs_size'].apply( + lambda x : SizeConv.to_int(x)) + df.loc[:, 'used'] = df['used'].apply( + lambda x: SizeConv.to_int(x)) + df.loc[:, 'avail'] = df['avail'].apply( 
+ lambda x : SizeConv.to_int(x)) + # pylint: enable=W0108 + self.df = df + + +class FiInfo(Exec): + """ + List all networks and their information + provider: network protocol (e.g., sockets, tcp, ib) + fabric: IP address + domain: network domain + version: network version + type: packet type (e.g., DGRAM) + protocol: protocol constant + host: the host this network corresponds to + """ + def __init__(self, exec_info): + super().__init__('fi_info', exec_info.mod(collect_output=True)) + self.exec_async = exec_info.exec_async + self.graph = {} + if not self.exec_async: + self.wait() + + def wait(self): + super().wait() + for host, stdout in self.stdout.items(): + lines = stdout.strip().splitlines() + providers = [] + for line in lines: + if 'provider' in line: + providers.append({ + 'provider': line.split(':')[1], + 'host': host + }) + else: + splits = line.split(':') + key = splits[0].strip() + val = splits[1].strip() + if 'fabric' in key: + val = val.split('/')[0] + providers[-1][key] = val + self.df = pd.DataFrame(providers) + + +class StorageDeviceType(Enum): + PMEM='pmem' + NVME='nvme' + SSD='ssd' + HDD='hdd' + + +class ResourceGraph: + """ + Stores helpful information about storage and networking info for machines. + + Two tables are stored to make decisions on application deployment. 
+ fs: + parent: the parent device of the partition (e.g., /dev/sda or NaN) + device: the name of the device (e.g., /dev/sda1 or /dev/sda) + size: total size of the device (bytes) + mount: where the device is mounted (if anywhere) + model: the exact model of the device + rota: whether the device is rotational or not + tran: the transport of the device (e.g., /dev/nvme) + fs_type: the type of filesystem (e.g., ext4) + uuid: filesystem-levle uuid from the FS metadata + fs_size: total size of the filesystem + avail: total number of bytes remaining + shared: whether the this is a shared service or not + host: the host this record corresponds to + net: + provider: network protocol (e.g., sockets, tcp, ib) + fabric: IP address + domain: network domain + host: the host this network corresponds to + + TODO: Need to verify on more than ubuntu20.04 + TODO: Can we make this work for windows? + TODO: Can we make this work even when hosts have different OSes? + """ + + def __init__(self): + self.lsblk = None + self.blkid = None + self.list_fs = None + self.fi_info = None + self.fs_columns = [ + 'parent', 'device', 'size', 'mount', 'model', 'rota', + 'tran', 'fs_type', 'uuid', 'fs_size', + 'avail', 'shared', 'host' + ] + self.net_columns = [ + 'provider', 'fabric', 'domain', 'host' + ] + self.all_fs = pd.DataFrame(columns=self.fs_columns) + self.all_net = pd.DataFrame(columns=self.net_columns) + self.fs_settings = { + 'register': [], + 'filter_mounts': {} + } + self.net_settings = { + 'register': [], + 'track_ips': {} + } + self.hosts = None + + def build(self, exec_info, introspect=True): + """ + Build a resource graph. + + :param exec_info: Where to collect resource information + :param introspect: Whether to introspect system info, or rely solely + on admin-defined settings + :return: self + """ + if introspect: + self._introspect(exec_info) + self.apply() + return self + + def _introspect(self, exec_info): + """ + Introspect the cluster for resources. 
+ + :param exec_info: Where to collect resource information + :return: None + """ + self.lsblk = Lsblk(exec_info) + self.blkid = Blkid(exec_info) + self.list_fs = ListFses(exec_info) + self.fi_info = FiInfo(exec_info) + self.hosts = exec_info.hostfile.hosts + self.all_fs = pd.merge(self.lsblk.df, + self.blkid.df, + on=['device', 'host'], + how='outer') + self.all_fs['shared'] = False + self.all_fs = pd.merge(self.all_fs, + self.list_fs.df, + on=['device', 'host'], + how='outer') + self.all_fs['shared'].fillna(True, inplace=True) + self.all_fs.drop(['used', 'use%', 'fs_mount', 'partuuid'], + axis=1, inplace=True) + self.all_fs['mount'].fillna(value='', inplace=True) + net_df = self.fi_info.df + net_df['speed'] = np.nan + net_df.drop(['version', 'type', 'protocol'], + axis=1, inplace=True) + self.all_net = net_df + + def save(self, path): + """ + Save the resource graph YAML file + + :param path: the path to save the file + :return: None + """ + graph = { + 'hosts': self.hosts, + 'fs': self.all_fs.to_dict('records'), + 'net': self.all_net.to_dict('records'), + 'fs_settings': self.fs_settings, + 'net_settings': self.net_settings + } + YamlFile(path).save(graph) + + def load(self, path): + """ + Load resource graph from storage. 
+ + :param path: The path to the resource graph YAML file + :return: self + """ + graph = YamlFile(path).load() + self.hosts = graph['hosts'] + self.all_fs = pd.DataFrame(graph['fs']) + self.all_net = pd.DataFrame(graph['net']) + self.fs = None + self.net = None + self.fs_settings = graph['fs_settings'] + self.net_settings = graph['net_settings'] + self.apply() + return self + + def set_hosts(self, hosts): + """ + Set the set of hosts this resource graph covers + + :param hosts: Hostfile() + :return: None + """ + self.hosts = hosts.hosts_ip + + def add_storage(self, hosts, **kwargs): + """ + Register a storage device record + + :param hosts: Hostfile() indicating set of hosts to make record for + :param kwargs: storage record + :return: None + """ + for host in hosts.hosts: + record = kwargs.copy() + record['host'] = host + self.fs_settings['register'].append(record) + + def add_net(self, hosts, **kwargs): + """ + Register a network record + + :param hosts: Hostfile() indicating set of hosts to make record for + :param kwargs: net record + :return: None + """ + for host, ip in zip(hosts.hosts, hosts.hosts_ip): + record = kwargs.copy() + record['fabric'] = ip + record['host'] = host + self.net_settings['register'].append(record) + + def filter_fs(self, mount_re, + mount_suffix=None, tran=None): + """ + Track all filesystems + devices matching the mount regex. + + :param mount_re: The regex to match a set of mountpoints + :param mount_suffix: After the mount_re is matched, append this path + to the mountpoint to indicate where users can access data. A typical + value for this is /${USER}, indicating the mountpoint has a subdirectory + per-user where they can access data. 
+ :param shared: Whether this mount point is shared + :param tran: The transport of this device + :return: self + """ + self.fs_settings['filter_mounts']['mount_re'] = { + 'mount_re': mount_re, + 'mount_suffix': mount_suffix, + 'tran': tran + } + return self + + def filter_ip(self, ip_re, speed=None): + """ + Track all IPs matching the regex. The IPs with this regex all have + a certain speed. + + :param ip_re: The regex to match + :param speed: The speed of the fabric + :return: self + """ + self.net_settings['track_ips'][ip_re] = { + 'ip_re': ip_re, + 'speed': SizeConv.to_int(speed) if speed is not None else speed + } + return self + + def filter_hosts(self, hosts, speed=None): + """ + Track all ips matching the hostnames. + + :param hosts: Hostfile() of the hosts to filter for + :param speed: Speed of the interconnect (e.g., 1gbps) + :return: self + """ + for host in hosts.hosts_ip: + self.filter_ip(host, speed) + return self + + def apply(self): + """ + Apply fs and net settings to the resource graph + + :return: self + """ + self._apply_fs_settings() + self._apply_net_settings() + # self.fs.size = self.fs.size.fillna(0) + # self.fs.avail = self.fs.avail.fillna(0) + # self.fs.fs_size = self.fs.fs_size.fillna(0) + return self + + def _apply_fs_settings(self): + if len(self.fs_settings) == 0: + self.fs = self.all_fs + return + df = self.all_fs + self.fs = pd.DataFrame(columns=self.all_net.columns) + for fs_set in self.fs_settings['filter_mounts'].values(): + mount_re = fs_set['mount_re'] + mount_suffix = fs_set['mount_suffix'] + tran = fs_set['tran'] + with_mount = df[df.mount.str.contains(mount_re)] + if mount_suffix is not None: + with_mount['mount'] += mount_suffix + if tran is not None: + with_mount['tran'] = tran + self.fs = pd.concat([self.fs, with_mount]) + admin_df = pd.DataFrame(self.fs_settings['register'], + columns=self.fs_columns) + self.fs = pd.concat([self.fs, admin_df]) + + def _apply_net_settings(self): + if len(self.net_settings) == 0: + 
self.net = self.all_net + return + self.net = pd.DataFrame(columns=self.all_net.columns) + df = self.all_net + for net_set in self.net_settings['track_ips'].values(): + ip_re = net_set['ip_re'] + speed = net_set['speed'] + with_ip = df[df['fabric'].str.contains(ip_re)] + with_ip['speed'] = speed + self.net = pd.concat([self.net, with_ip]) + admin_df = pd.DataFrame(self.net_settings['register'], + columns=self.net_columns) + self.net = pd.concat([self.net, admin_df]) + + def find_shared_storage(self): + """ + Find the set of shared storage services + + :return: Dataframe + """ + df = self.fs + return df[df.shared == True] + + def find_storage(self, + dev_types=None, + is_mounted=True, + common=False, + count_per_node=None, + count_per_dev=None, + min_cap=None, + min_avail=None): + """ + Find a set of storage devices. + + :param dev_types: Search for devices of type in order. Either a list + or a string. + :param is_mounted: Search only for mounted devices + :param common: Remove mount points that are not common across all hosts + :param count_per_node: Choose only a subset of devices matching query + :param count_per_dev: Choose only a subset of devices matching query + :param min_cap: Remove devices with too little overall capacity + :param min_avail: Remove devices with too little available space + :return: Dataframe + """ + df = self.fs + # Remove pfs + df = df[df.shared == False] + # Filter devices by whether or not a mount is needed + if is_mounted: + df = df[df.mount.notna()] + # Find devices of a particular type + if dev_types is not None: + matching_devs = pd.DataFrame(columns=df.columns) + if isinstance(dev_types, str): + dev_types = [dev_types] + for dev_type in dev_types: + if dev_type == StorageDeviceType.HDD: + devs = df[(df.tran == 'sata') & (df.rota == True)] + elif dev_type == StorageDeviceType.SSD: + devs = df[(df.tran == 'sata') & (df.rota == False)] + elif dev_type == StorageDeviceType.NVME: + devs = df[(df.tran == 'nvme')] + matching_devs = 
pd.concat([matching_devs, devs]) + df = matching_devs + # Get the set of mounts common between all hosts + if common: + df = df.groupby(['mount']).filter( + lambda x: len(x) == len(self.hosts)).reset_index(drop=True) + # Remove storage with too little capacity + if min_cap is not None: + df = df[df.size >= min_cap] + # Remove storage with too little available space + if min_avail is not None: + df = df[df.avail >= min_avail] + # Take a certain number of each device per-host + if count_per_dev is not None: + df = df.groupby(['tran', 'rota', 'host']).\ + head(count_per_dev).reset_index(drop=True) + # Take a certain number of matched devices per-host + if count_per_node is not None: + df = df.groupby('host').head(count_per_node).reset_index(drop=True) + return df + + def find_net_info(self, hosts, + providers=None): + """ + Find the set of networks common between each host. + + :param hosts: A Hostfile() data structure containing the set of + all hosts to find network information for + :param providers: The network protocols to search for. + :return: Dataframe + """ + df = self.net + # Get the set of fabrics corresponding to these hosts + df = df[df.fabric.isin(hosts.hosts_ip)] + # Filter out protocols which are not common between these hosts + df = df.groupby('provider').filter( + lambda x: len(x) == len(hosts)).reset_index(drop=True) + # Choose only a subset of providers + if providers is not None: + if isinstance(providers, str): + providers = [providers] + df = df[df.provider.isin(providers)] + return df + +# pylint: enable=C0121 diff --git a/ci/jarvis-util/jarvis_util/jutil_manager.py b/ci/jarvis-util/jarvis_util/jutil_manager.py index 4ea11aeec..c229b2445 100644 --- a/ci/jarvis-util/jarvis_util/jutil_manager.py +++ b/ci/jarvis-util/jarvis_util/jutil_manager.py @@ -1,9 +1,17 @@ -import threading -import sys -import time +""" +This file contains properties which are globally accessible to all +jarvis-util modules. 
This can be used to configure various aspects +of jarvis, such as output. +""" class JutilManager: + """ + A singleton which stores various properties that can be queried by + internally by jutil modules. This includes properties such output + management. + """ + instance_ = None @staticmethod @@ -15,43 +23,6 @@ def get_instance(): def __init__(self): self.collect_output = False self.hide_output = False - self.print_thread = None - self.continue_ = True - self.print_tasks = [] - - def monitor_print(self, local_exec): - if len(self.print_tasks) == 0: - self.print_thread = threading.Thread(target=self.print_worker) - self.continue_ = True - self.print_thread.start() - self.print_tasks.append(local_exec) - - def unmonitor_print(self, local_exec): - self.print_tasks.remove(local_exec) - self.print_to_outputs(local_exec) - local_exec.stdout = local_exec.stdout.getvalue() - local_exec.stderr = local_exec.stderr.getvalue() - if len(self.print_tasks) == 0: - self.continue_ = False - self.print_thread.join() - - def print_worker(self): - while self.continue_: - for local_exec in self.print_tasks: - self.print_to_outputs(local_exec) - # time.sleep(25 / 1000) - - def print_to_outputs(self, local_exec): - self.print_to_output(local_exec, local_exec.proc.stdout, sys.stdout) - self.print_to_output(local_exec, local_exec.proc.stderr, sys.stderr) + self.debug_mpi_exec = False + self.debug_local_exec = False - def print_to_output(self, local_exec, out, sysout): - if len(out.peek()) == 0: - return - text = out.read().decode('utf-8') - if not local_exec.hide_output: - sysout.write(text) - if local_exec.pipe_stdout: - local_exec.stdout.write(text) - if local_exec.file_output is not None: - local_exec.file_output.write(text) diff --git a/ci/jarvis-util/jarvis_util/serialize/ini_file.py b/ci/jarvis-util/jarvis_util/serialize/ini_file.py index 90741d9c4..8da174ff1 100644 --- a/ci/jarvis-util/jarvis_util/serialize/ini_file.py +++ b/ci/jarvis-util/jarvis_util/serialize/ini_file.py @@ -1,7 
+1,16 @@ +""" +This module contains methods to serialize and deserialize data from +a human-readable ini file. +""" import configparser from jarvis_util.serialize.serializer import Serializer + class IniFile(Serializer): + """ + This class contains methods to serialize and deserialize data from + a human-readable ini file. + """ def __init__(self, path): self.path = path @@ -11,5 +20,5 @@ def load(self): return config def save(self, data): - with open(self.path, 'w') as fp: - data.write(fp) \ No newline at end of file + with open(self.path, 'w', encoding='utf-8') as fp: + data.write(fp) diff --git a/ci/jarvis-util/jarvis_util/serialize/pickle.py b/ci/jarvis-util/jarvis_util/serialize/pickle.py index ab1e6d5aa..a90ae12de 100644 --- a/ci/jarvis-util/jarvis_util/serialize/pickle.py +++ b/ci/jarvis-util/jarvis_util/serialize/pickle.py @@ -1,8 +1,17 @@ +""" +This module contains methods to serialize and deserialize data from +a pickle file. +""" + import pickle as pkl from jarvis_util.serialize.serializer import Serializer -import sys,os + class PickleFile(Serializer): + """ + This class serializes and deserializes data from a pickle file + """ + def __init__(self, path): self.path = path @@ -12,4 +21,4 @@ def load(self): def save(self, data): with open(self.path, 'wb') as fp: - pkl.dump(data, fp) \ No newline at end of file + pkl.dump(data, fp) diff --git a/ci/jarvis-util/jarvis_util/serialize/serializer.py b/ci/jarvis-util/jarvis_util/serialize/serializer.py index 58b251cc9..8647945a7 100644 --- a/ci/jarvis-util/jarvis_util/serialize/serializer.py +++ b/ci/jarvis-util/jarvis_util/serialize/serializer.py @@ -1,10 +1,21 @@ -from abc import ABC,abstractmethod +""" +This module contains an abstract class used to define classes which +serialize data to a file. +""" + +from abc import ABC, abstractmethod + class Serializer(ABC): + """ + An abstract class which loads serialized data from a file and + saves serialized data to a file. 
+ """ + @abstractmethod def load(self): pass @abstractmethod def save(self, data): - pass \ No newline at end of file + pass diff --git a/ci/jarvis-util/jarvis_util/serialize/text_file.py b/ci/jarvis-util/jarvis_util/serialize/text_file.py index e7c2f9c09..214c6bb9f 100644 --- a/ci/jarvis-util/jarvis_util/serialize/text_file.py +++ b/ci/jarvis-util/jarvis_util/serialize/text_file.py @@ -1,15 +1,22 @@ -import configparser +""" +This module stores data into a file in a human-readable way +""" from jarvis_util.serialize.serializer import Serializer + class TextFile(Serializer): + """ + This class stores data directly into a file using str() as the + serialization method. The data is intended to be human-readable. + """ def __init__(self, path): self.path = path def load(self): - with open(self.path) as fp: + with open(self.path, 'r', encoding='utf-8') as fp: data = fp.read() return data def save(self, data): - with open(self.path, 'w') as fp: - fp.write(data) \ No newline at end of file + with open(self.path, 'w', encoding='utf-8') as fp: + fp.write(data) diff --git a/ci/jarvis-util/jarvis_util/serialize/yaml_file.py b/ci/jarvis-util/jarvis_util/serialize/yaml_file.py index 7fca34638..5892f6bca 100644 --- a/ci/jarvis-util/jarvis_util/serialize/yaml_file.py +++ b/ci/jarvis-util/jarvis_util/serialize/yaml_file.py @@ -1,16 +1,24 @@ +""" +This module contains methods to serialize and deserialize data from +a human-readable YAML file. +""" from jarvis_util.serialize.serializer import Serializer import yaml class YamlFile(Serializer): + """ + This class contains methods to serialize and deserialize data from + a human-readable YAML file. 
+ """ def __init__(self, path): self.path = path def load(self): - with open(self.path, 'r') as fp: + with open(self.path, 'r', encoding='utf-8') as fp: return yaml.load(fp, Loader=yaml.FullLoader) return None def save(self, data): - with open(self.path, 'w') as fp: + with open(self.path, 'w', encoding='utf-8') as fp: yaml.dump(data, fp) diff --git a/ci/jarvis-util/jarvis_util/shell/exec.py b/ci/jarvis-util/jarvis_util/shell/exec.py index 21fbbe578..f9fef066d 100644 --- a/ci/jarvis-util/jarvis_util/shell/exec.py +++ b/ci/jarvis-util/jarvis_util/shell/exec.py @@ -1,3 +1,8 @@ +""" +This module provides mechanisms to execute binaries either locally or +remotely. +""" + from .local_exec import LocalExec from .pssh_exec import PsshExec from .pssh_exec import SshExec @@ -6,7 +11,18 @@ class Exec(Executable): + """ + This class is a factory which wraps around various shell command + execution stragies, such as MPI and SSH. + """ + def __init__(self, cmd, exec_info=None): + """ + Execute a command or list of commands + + :param cmd: list of commands or a single command string + :param exec_info: Info needed to execute processes locally + """ super().__init__() if exec_info is None: exec_info = ExecInfo() @@ -30,6 +46,13 @@ def wait(self): def set_output(self): self.stdout = self.exec_.stdout self.stderr = self.exec_.stderr + if isinstance(self.stdout, str): + if hasattr(self.exec_, 'addr'): + host = self.exec_.addr + else: + host = 'localhost' + self.stdout = {host: self.stdout} + self.stderr = {host: self.stderr} def set_exit_code(self): self.exec_.set_exit_code() diff --git a/ci/jarvis-util/jarvis_util/shell/exec_info.py b/ci/jarvis-util/jarvis_util/shell/exec_info.py index 57bd999c6..2b28246ae 100644 --- a/ci/jarvis-util/jarvis_util/shell/exec_info.py +++ b/ci/jarvis-util/jarvis_util/shell/exec_info.py @@ -1,11 +1,20 @@ +""" +This module contains data structures for determining how to execute +a subcommand. 
This includes information such as storing SSH keys, +passwords, working directory, etc. +""" + from enum import Enum from jarvis_util.util.hostfile import Hostfile -import copy import os from abc import ABC, abstractmethod class ExecType(Enum): + """ + Different program execution methods. + """ + LOCAL = 'LOCAL' SSH = 'SSH' PSSH = 'PSSH' @@ -13,18 +22,45 @@ class ExecType(Enum): class ExecInfo: + """ + Contains all information needed to execute a program. This includes + parameters such as the path to key-pairs, the hosts to run the program + on, number of processes, etc. + """ def __init__(self, exec_type=ExecType.LOCAL, nprocs=None, ppn=None, - user=None, pkey=None, port=None, hostfile=None, env=None, - sleep_ms=0, sudo=False, cwd=None, hosts=None, + user=None, pkey=None, port=None, + hostfile=None, hosts=None, env=None, + sleep_ms=0, sudo=False, cwd=None, collect_output=None, pipe_stdout=None, pipe_stderr=None, hide_output=None, exec_async=False, stdin=None): + """ + + :param exec_type: How to execute a program. SSH, MPI, Local, etc. + :param nprocs: Number of processes to spawn. E.g., MPI uses this + :param ppn: Number of processes per node. E.g., MPI uses this + :param user: The user to execute command under. E.g., SSH, PSSH + :param pkey: The path to the private key. E.g., SSH, PSSH + :param port: The port to use for connection. E.g., SSH, PSSH + :param hostfile: The hosts to launch command on. E.g., PSSH, MPI + :param hosts: A list (or single string) of host names to run command on. + :param env: The environment variables to use for command. + :param sleep_ms: Sleep for a period of time AFTER executing + :param sudo: Execute command with root privilege. E.g., SSH, PSSH + :param cwd: Set current working directory. E.g., SSH, PSSH + :param collect_output: Collect program output in python buffer + :param pipe_stdout: Pipe STDOUT into a file. (path string) + :param pipe_stderr: Pipe STDERR into a file. 
(path string) + :param hide_output: Whether to print output to console + :param exec_async: Whether to execute program asynchronously + :param stdin: Any input needed by the program. Only local + """ + self.exec_type = exec_type self.nprocs = nprocs self.user = user self.pkey = pkey self.port = port self.ppn = ppn - self.hosts = hosts self.hostfile = hostfile self._set_hostfile(hostfile=hostfile, hosts=hosts) self.env = env @@ -39,14 +75,22 @@ def __init__(self, exec_type=ExecType.LOCAL, nprocs=None, ppn=None, self.hide_output = hide_output self.exec_async = exec_async self.stdin = stdin + self.keys = ['exec_type', 'nprocs', 'ppn', 'user', 'pkey', 'port', + 'hostfile', 'env', 'sleep_ms', 'sudo', + 'cwd', 'hosts', 'collect_output', + 'pipe_stdout', 'pipe_stderr', 'hide_output', + 'exec_async', 'stdin'] def _set_env(self, env): if env is None: self.env = {} + else: + self.env = env basic_env = [ 'PATH', 'LD_LIBRARY_PATH', 'LIBRARY_PATH', 'CMAKE_PREFIX_PATH', - 'PYTHON_PATH', 'CPATH', 'INCLUDE' + 'PYTHON_PATH', 'CPATH', 'INCLUDE', 'JAVA_HOME' ] + self.basic_env = {} for key in basic_env: if key not in os.environ: continue @@ -54,6 +98,9 @@ def _set_env(self, env): for key, val in self.basic_env.items(): if key not in self.env: self.env[key] = val + self.basic_env.update(self.env) + if 'LD_PRELOAD' in self.basic_env: + del self.basic_env['LD_PRELOAD'] def _set_hostfile(self, hostfile=None, hosts=None): if hostfile is not None: @@ -62,7 +109,7 @@ def _set_hostfile(self, hostfile=None, hosts=None): elif isinstance(hostfile, Hostfile): self.hostfile = hostfile else: - raise Exception("Hostfile is neither string nor Hostfile") + raise Exception('Hostfile is neither string nor Hostfile') if hosts is not None: if isinstance(hosts, list): self.hostfile = Hostfile(all_hosts=hosts) @@ -71,34 +118,40 @@ def _set_hostfile(self, hostfile=None, hosts=None): elif isinstance(hosts, Hostfile): self.hostfile = hosts else: - raise Exception("Host set is neither str, list or Hostfile") + 
raise Exception('Host set is neither str, list or Hostfile') if hosts is not None and hostfile is not None: - raise Exception("Must choose either hosts or hostfile, not both") + raise Exception('Must choose either hosts or hostfile, not both') if self.hostfile is None: self.hostfile = Hostfile() def mod(self, **kwargs): - keys = ['exec_type', 'nprocs', 'ppn', 'user', 'pkey', 'port', - 'hostfile', 'env', 'sleep_ms', 'sudo', - 'cwd', 'hosts', 'collect_output', - 'pipe_stdout', 'pipe_stderr', 'hide_output', - 'exec_async', 'stdin'] - for key in keys: - if key not in kwargs: - kwargs[key] = getattr(self, key) + self._mod_kwargs(kwargs) return ExecInfo(**kwargs) + def _mod_kwargs(self, kwargs): + for key in self.keys: + if key not in kwargs and hasattr(self, key): + kwargs[key] = getattr(self, key) + def copy(self): return self.mod() -class Executable: +class Executable(ABC): + """ + An abstract class representing a class which is intended to run + shell commands. This includes SSH, MPI, etc. + """ + def __init__(self): self.exit_code = None - self.stdout = "" - self.stderr = "" + self.stdout = '' + self.stderr = '' + + def failed(self): + return self.exit_code != 0 @abstractmethod def set_exit_code(self): @@ -108,3 +161,56 @@ def set_exit_code(self): def wait(self): pass + def smash_cmd(self, cmds): + """ + Convert a list of commands into a single command for the shell + to execute. + + :param cmds: A list of commands or a single command string + :return: + """ + if isinstance(cmds, list): + return ' && '.join(cmds) + elif isinstance(cmds, str): + return cmds + else: + raise Exception('Command must be either list or string') + + def wait_list(self, nodes): + for node in nodes: + node.wait() + + def smash_list_outputs(self, nodes): + """ + Combine the outputs of a set of nodes into a single output. + For example, used if executing multiple commands in sequence. 
+ + :param nodes: + :return: + """ + self.stdout = '\n'.join([node.stdout for node in nodes]) + self.stderr = '\n'.join([node.stderr for node in nodes]) + + def per_host_outputs(self, nodes): + """ + Convert the outputs of a set of nodes to a per-host dictionary. + Used if sending commands to multiple hosts + + :param nodes: + :return: + """ + self.stdout = {} + self.stderr = {} + self.stdout = {node.addr: node.stdout for node in nodes} + self.stderr = {node.addr: node.stderr for node in nodes} + + def set_exit_code_list(self, nodes): + """ + Set the exit code from a set of nodes. + + :param nodes: The set of execution nodes that have been executed + :return: + """ + for node in nodes: + if node.exit_code: + self.exit_code = node.exit_code diff --git a/ci/jarvis-util/jarvis_util/shell/filesystem.py b/ci/jarvis-util/jarvis_util/shell/filesystem.py new file mode 100644 index 000000000..a5cb2b55e --- /dev/null +++ b/ci/jarvis-util/jarvis_util/shell/filesystem.py @@ -0,0 +1,69 @@ +""" +This module contains various wrappers over typical filesystem commands seen +in shell scripts. This includes operations such as creating directories, +changing file permissions, etc. +""" +from .exec import Exec + + +class Mkdir(Exec): + """ + Create directories + subdirectories. + """ + + def __init__(self, paths, exec_info=None): + """ + Create directories + subdirectories. Does not fail if the dirs + already exist. + + :param paths: A list of paths or a single path string. + :param exec_info: Info needed to execute the mkdir command + """ + + if isinstance(paths, str): + paths = [paths] + path = ' '.join(paths) + super().__init__(f'mkdir -p {path}', exec_info) + + +class Rm(Exec): + """ + Remove a file and its subdirectories + """ + + def __init__(self, paths, exec_info=None): + """ + Execute file or directory remove. 
+ + :param paths: Either a list of paths or a single path string + :param exec_info: Information needed to execute rm + """ + + if isinstance(paths, str): + paths = [paths] + path = ' '.join(paths) + super().__init__(f'rm -rf {path}', exec_info) + + +class Chmod(Exec): + """ + Change the mode of a file + """ + + def __init__(self, path=None, mode=None, modes=None, exec_info=None): + """ + Change the mode of a file + + :param path: path to file to mode change + :param mode: the mode to change to + :param modes: A list of tuples [(Path, Mode)] + :param exec_info: How to execute commands + """ + cmds = [] + if path is not None and mode is not None: + cmds.append(f'chmod {mode} {path}') + if modes is not None: + cmds += [f'chmod {mode[1]} {mode[0]}' for mode in modes] + if len(cmds) == 0: + raise Exception('Must set either path+mode or modes') + super().__init__(cmds, exec_info) diff --git a/ci/jarvis-util/jarvis_util/shell/kill.py b/ci/jarvis-util/jarvis_util/shell/kill.py deleted file mode 100644 index 28937c232..000000000 --- a/ci/jarvis-util/jarvis_util/shell/kill.py +++ /dev/null @@ -1,7 +0,0 @@ -import psutil -from .exec import Exec - - -class Kill(Exec): - def __init__(self, cmd, exec_info): - super().__init__(f"pkill {cmd}", exec_info) diff --git a/ci/jarvis-util/jarvis_util/shell/local_exec.py b/ci/jarvis-util/jarvis_util/shell/local_exec.py index dfb73ccdd..c9eac9694 100644 --- a/ci/jarvis-util/jarvis_util/shell/local_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/local_exec.py @@ -1,6 +1,12 @@ +""" +Provides methods for executing a program or workflow locally. This class +is intended to be called from Exec, not by general users. +""" + import time import subprocess -import os, sys +import os +import sys import io import threading from jarvis_util.jutil_manager import JutilManager @@ -8,9 +14,21 @@ class LocalExec(Executable): + """ + Provides methods for executing a program or workflow locally. 
+ """ + def __init__(self, cmd, exec_info): + """ + Execute a program or workflow + + :param cmd: list of commands or a single command string + :param exec_info: Info needed to execute processes locally + """ + super().__init__() jutil = JutilManager.get_instance() + cmd = self.smash_cmd(cmd) # Managing console output and collection self.collect_output = exec_info.collect_output @@ -19,6 +37,7 @@ def __init__(self, cmd, exec_info): self.pipe_stdout_fp = None self.pipe_stderr_fp = None self.hide_output = exec_info.hide_output + # pylint: disable=R1732 if self.collect_output is None: self.collect_output = jutil.collect_output if self.pipe_stdout is not None: @@ -27,6 +46,7 @@ def __init__(self, cmd, exec_info): self.pipe_stderr_fp = open(self.pipe_stderr, 'wb') if self.hide_output is None: self.hide_output = jutil.hide_output + # pylint: enable=R1732 self.stdout = io.StringIO() self.stderr = io.StringIO() self.last_stdout_size = 0 @@ -52,12 +72,15 @@ def __init__(self, cmd, exec_info): self.cwd = os.getcwd() else: self.cwd = exec_info.cwd + if jutil.debug_local_exec: + print(cmd) self._start_bash_processes() def _start_bash_processes(self): if self.sudo: - self.cmd = f"sudo {self.cmd}" + self.cmd = f'sudo {self.cmd}' time.sleep(self.sleep_ms) + # pylint: disable=R1732 self.proc = subprocess.Popen(self.cmd, stdin=self.stdin, stdout=subprocess.PIPE, @@ -65,6 +88,7 @@ def _start_bash_processes(self): cwd=self.cwd, env=self.env, shell=True) + # pylint: enable=R1732 self.print_stdout_thread = threading.Thread( target=self.print_stdout_worker) self.print_stderr_thread = threading.Thread( @@ -74,13 +98,6 @@ def _start_bash_processes(self): if not self.exec_async: self.wait() - def kill(self): - if self.proc is not None: - LocalExec(f"kill -9 {self.get_pid()}", - ExecInfo(pipe_stdout=False)) - self.proc.kill() - self.wait() - def wait(self): self.proc.wait() self.join_print_worker() @@ -109,6 +126,7 @@ def print_stderr_worker(self): time.sleep(25 / 1000) def 
print_to_outputs(self, proc_sysout, self_sysout, file_sysout, sysout): + # pylint: disable=W0702 for line in proc_sysout: try: text = line.decode('utf-8') @@ -116,10 +134,12 @@ def print_to_outputs(self, proc_sysout, self_sysout, file_sysout, sysout): sysout.write(text) if self.collect_output: self_sysout.write(text) + self_sysout.flush() if file_sysout is not None: file_sysout.write(line) except: pass + # pylint: enable=W0702 def join_print_worker(self): if not self.executing_: @@ -134,14 +154,6 @@ def join_print_worker(self): if self.pipe_stderr_fp is not None: self.pipe_stderr_fp.close() - def collect(self, pipe_path): - if pipe_path is subprocess.DEVNULL: - return - if pipe_path is None: - return - with open(pipe_path) as fp: - return fp.read() - class LocalExecInfo(ExecInfo): def __init__(self, **kwargs): diff --git a/ci/jarvis-util/jarvis_util/shell/mpi_exec.py b/ci/jarvis-util/jarvis_util/shell/mpi_exec.py index 209919a80..e104bf50f 100644 --- a/ci/jarvis-util/jarvis_util/shell/mpi_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/mpi_exec.py @@ -1,9 +1,29 @@ +""" +This module provides methods to execute a process in parallel using the +Message Passing Interface (MPI). This module assumes MPI is installed +on the system. This class is intended to be called from Exec, +not by general users. +""" + +from jarvis_util.jutil_manager import JutilManager from jarvis_util.shell.local_exec import LocalExec from .exec_info import ExecInfo, ExecType class MpiExec(LocalExec): + """ + This class contains methods for executing a command in parallel + using MPI. 
+ """ + def __init__(self, cmd, exec_info): + """ + Execute a command using MPI + + :param cmd: A command (string) to execute + :param exec_info: Information needed by MPI + """ + self.cmd = cmd self.nprocs = exec_info.nprocs self.ppn = exec_info.ppn @@ -17,13 +37,16 @@ def mpicmd(self): if self.ppn is not None: params.append(f"-ppn {self.ppn}") if len(self.hostfile): - if self.hostfile.is_subset(): + if self.hostfile.is_subset() or self.hostfile.path is None: params.append(f"--host {','.join(self.hostfile.hosts)}") else: params.append(f"--hostfile {self.hostfile.path}") params += [f"-genv {key}={val}" for key, val in self.mpi_env.items()] params.append(self.cmd) cmd = " ".join(params) + jutil = JutilManager.get_instance() + if jutil.debug_mpi_exec: + print(cmd) return cmd diff --git a/ci/jarvis-util/jarvis_util/shell/process.py b/ci/jarvis-util/jarvis_util/shell/process.py new file mode 100644 index 000000000..7aba2f0fc --- /dev/null +++ b/ci/jarvis-util/jarvis_util/shell/process.py @@ -0,0 +1,22 @@ +""" +This module provides various wrappers for methods which manage processes +in the cluster. Examples include killing processes, determining whether +or not a process exists, etc. +""" + +from .exec import Exec + + +class Kill(Exec): + """ + Kill all processes which match the name regex. + """ + + def __init__(self, cmd, exec_info): + """ + Kill all processes which match the name regex. + + :param cmd: A regex for the command to kill + :param exec_info: Info needed to execute the command + """ + super().__init__(f"pkill {cmd}", exec_info) diff --git a/ci/jarvis-util/jarvis_util/shell/pscp.py b/ci/jarvis-util/jarvis_util/shell/pscp.py new file mode 100644 index 000000000..f74da34d0 --- /dev/null +++ b/ci/jarvis-util/jarvis_util/shell/pscp.py @@ -0,0 +1,58 @@ +""" +This module provides methods to distribute a command among multiple +nodes using SSH. This class is intended to be called from Exec, +not by general users. 
+""" + +from .scp import Scp +from .exec_info import Executable + + +class Pscp(Executable): + """ + Execute commands on multiple hosts using SSH. + """ + + def __init__(self, paths, exec_info): + """ + Copy files to a set of remote hosts via rsync. + + Case 1: Paths is a single file: + paths = '/tmp/hi.txt' + '/tmp/hi.txt' will be copied to user@host:/tmp/hi.txt + + Case 2: Paths is a list of files: + paths = ['/tmp/hi1.txt', '/tmp/hi2.txt'] + Repeat Case 1 twice. + + Case 3: Paths is a list of tuples of files: + paths = [('/tmp/hi.txt', '/tmp/remote_hi.txt')] + '/tmp/hi.txt' will be copied to user@host:'/tmp/remote_hi.txt' + + :param paths: Either a path to a file, a list of files, or a list of + tuples of files. + :param exec_info: Connection information for SSH + """ + super().__init__() + self.exec_async = exec_info.exec_async + self.hosts = exec_info.hostfile.hosts + self.scp_nodes = [] + self.stdout = {} + self.stderr = {} + self.hosts = exec_info.hostfile.hosts + for host in self.hosts: + ssh_exec_info = exec_info.mod(hostfile=None, + hosts=host, + exec_async=True) + self.scp_nodes.append(Scp(paths, ssh_exec_info)) + if self.exec_async: + self.wait() + + def wait(self): + self.wait_list(self.scp_nodes) + self.per_host_outputs(self.scp_nodes) + self.set_exit_code() + + def set_exit_code(self): + self.set_exit_code_list(self.scp_nodes) + diff --git a/ci/jarvis-util/jarvis_util/shell/pssh_exec.py b/ci/jarvis-util/jarvis_util/shell/pssh_exec.py index b5416e053..720869391 100644 --- a/ci/jarvis-util/jarvis_util/shell/pssh_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/pssh_exec.py @@ -1,13 +1,28 @@ +""" +This module provides methods to distribute a command among multiple +nodes using SSH. This class is intended to be called from Exec, +not by general users. 
+""" + from .ssh_exec import SshExec from .local_exec import LocalExec -from jarvis_util.util.hostfile import Hostfile from .exec_info import ExecInfo, ExecType, Executable class PsshExec(Executable): + """ + Execute commands on multiple hosts using SSH. + """ + def __init__(self, cmd, exec_info): + """ + Execute commands on multiple hosts. + + :param cmd: A list of commands or a single command string + :param exec_info: Info needed to execute command with SSH + """ super().__init__() - self.cmd = cmd + self.cmd = self.smash_cmd(cmd) self.exec_async = exec_info.exec_async self.hosts = exec_info.hostfile.hosts self.execs_ = [] @@ -15,7 +30,9 @@ def __init__(self, cmd, exec_info): self.stderr = {} if len(self.hosts): for host in self.hosts: - ssh_exec_info = exec_info.mod(hosts=host, exec_async=True) + ssh_exec_info = exec_info.mod(hostfile=None, + hosts=host, + exec_async=True) self.execs_.append(SshExec(cmd, ssh_exec_info)) else: self.execs_.append( @@ -25,20 +42,12 @@ def __init__(self, cmd, exec_info): self.wait() def wait(self): - for exe in self.execs_: - exe.wait() - if hasattr(exe, 'addr'): - addr = exe.addr - else: - addr = 'localhost' - self.stdout[addr] = exe.stdout - self.stdout[addr] = exe.stderr + self.wait_list(self.execs_) + self.per_host_outputs(self.execs_) self.set_exit_code() def set_exit_code(self): - for exe in self.execs_: - if exe.exit_code: - self.exit_code = exe.exit_code + self.set_exit_code_list(self.execs_) class PsshExecInfo(ExecInfo): diff --git a/ci/jarvis-util/jarvis_util/shell/rm.py b/ci/jarvis-util/jarvis_util/shell/rm.py deleted file mode 100644 index a01f65738..000000000 --- a/ci/jarvis-util/jarvis_util/shell/rm.py +++ /dev/null @@ -1,7 +0,0 @@ -import psutil -from .exec import Exec - - -class Rm(Exec): - def __init__(self, path, exec_info): - super().__init__(f"rm -rf {path}", exec_info) diff --git a/ci/jarvis-util/jarvis_util/shell/scp.py b/ci/jarvis-util/jarvis_util/shell/scp.py index e69de29bb..8b39301e6 100644 --- 
a/ci/jarvis-util/jarvis_util/shell/scp.py +++ b/ci/jarvis-util/jarvis_util/shell/scp.py @@ -0,0 +1,115 @@ +""" +This module provides methods to execute a single command remotely using SSH. +This class is intended to be called from Exec, not by general users. +""" +from .local_exec import LocalExec +from .exec_info import Executable + + +class _Scp(LocalExec): + """ + This class provides methods to copy data over SSH using the "rsync" + command utility in Linux + """ + + def __init__(self, src_path, dst_path, exec_info): + """ + Copy a file or directory from source to destination via rsync + + :param src_path: The path to the file on the host + :param dst_path: The desired file path on the remote host + :param exec_info: Info needed to execute command with SSH + """ + + self.addr = exec_info.hostfile.hosts[0] + self.src_path = src_path + self.dst_path = dst_path + self.user = exec_info.user + self.pkey = exec_info.pkey + self.port = exec_info.port + self.sudo = exec_info.sudo + super().__init__(self.rsync_cmd(src_path, dst_path), + exec_info.mod(env=exec_info.basic_env)) + + def rsync_cmd(self, src_path, dst_path): + lines = ['rsync -ha'] + if self.pkey is not None or self.port is not None: + ssh_lines = ['ssh'] + if self.pkey is not None: + ssh_lines.append(f'-i {self.pkey}') + if self.port is not None: + ssh_lines.append(f'-p {self.port}') + ssh_cmd = ' '.join(ssh_lines) + lines.append(f'-e \'{ssh_cmd}\'') + lines.append(src_path) + if self.user is not None: + lines.append(f'{self.user}@{self.addr}:{dst_path}') + else: + lines.append(f'{self.addr}:{dst_path}') + rsync_cmd = ' '.join(lines) + return rsync_cmd + + +class Scp(Executable): + """ + Secure copy data between two hosts. + """ + + def __init__(self, paths, exec_info): + """ + Copy files via rsync. 
+ + Case 1: Paths is a single file: + paths = '/tmp/hi.txt' + '/tmp/hi.txt' will be copied to user@host:/tmp/hi.txt + + Case 2: Paths is a list of files: + paths = ['/tmp/hi1.txt', '/tmp/hi2.txt'] + Repeat Case 1 twice. + + Case 3: Paths is a list of tuples of files: + paths = [('/tmp/hi.txt', '/tmp/remote_hi.txt')] + '/tmp/hi.txt' will be copied to user@host:'/tmp/remote_hi.txt' + + :param paths: Either a path to a file, a list of files, or a list of + tuples of files. + :param exec_info: Connection information for SSH + """ + + super().__init__() + self.paths = paths + self.exec_info = exec_info + self.scp_nodes = [] + if isinstance(paths, str): + self._exec_single_path(paths) + if isinstance(paths, list): + if len(paths) == 0: + raise Exception('Must have at least one path to scp') + elif isinstance(paths[0], str): + self._exec_many_paths(paths) + elif isinstance(paths[0], tuple): + self._exec_many_paths_tuple(paths) + elif isinstance(paths[0], list): + self._exec_many_paths_tuple(paths) + if not self.exec_info.exec_async: + self.wait() + + def _exec_single_path(self, path): + self.scp_nodes.append(_Scp(path, path, self.exec_info)) + + def _exec_many_paths(self, paths): + for path in paths: + self.scp_nodes.append(_Scp(path, path, self.exec_info)) + + def _exec_many_paths_tuple(self, path_tlist): + for src, dst in path_tlist: + self.scp_nodes.append(_Scp(src, dst, self.exec_info)) + + def wait(self): + self.wait_list(self.scp_nodes) + self.smash_list_outputs(self.scp_nodes) + self.set_exit_code() + return self.exit_code + + def set_exit_code(self): + self.set_exit_code_list(self.scp_nodes) diff --git a/ci/jarvis-util/jarvis_util/shell/ssh_exec.py b/ci/jarvis-util/jarvis_util/shell/ssh_exec.py index 0543bf4e0..bd9b85135 100644 --- a/ci/jarvis-util/jarvis_util/shell/ssh_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/ssh_exec.py @@ -1,9 +1,25 @@ +""" +This module provides methods to execute a single command remotely using SSH. 
+This class is intended to be called from Exec, not by general users. +""" from .local_exec import LocalExec from .exec_info import ExecInfo, ExecType class SshExec(LocalExec): + """ + This class provides methods to execute a command via SSH. + """ + def __init__(self, cmd, exec_info): + """ + Execute a command remotely via SSH + + :param cmd: A list of commands or a single command string + :param exec_info: Info needed to execute command with SSH + """ + + cmd = self.smash_cmd(cmd) self.addr = exec_info.hostfile.hosts[0] self.user = exec_info.user self.pkey = exec_info.pkey @@ -16,23 +32,22 @@ def __init__(self, cmd, exec_info): def ssh_cmd(self, cmd): lines = ['ssh'] if self.pkey is not None: - lines.append(f"-i {self.pkey}") + lines.append(f'-i {self.pkey}') if self.port is not None: - lines.append(f"-p {self.port}") + lines.append(f'-p {self.port}') if self.user is not None: - lines.append(f"{self.user}@{self.addr}") + lines.append(f'{self.user}@{self.addr}') else: - lines.append(f"{self.addr}") - ssh_cmd = " ".join(lines) + lines.append(f'{self.addr}') + ssh_cmd = ' '.join(lines) cmd_lines = [] if self.ssh_env is not None: for key, val in self.ssh_env.items(): - cmd_lines.append(f"{key}={val}") + cmd_lines.append(f'{key}={val}') cmd_lines.append(cmd) - env_cmd = " ".join(cmd_lines) - real_cmd = f"{ssh_cmd} \"{env_cmd}\"" - print(f"{ssh_cmd} \"{cmd}\"") + env_cmd = ' '.join(cmd_lines) + real_cmd = f'{ssh_cmd} \"{env_cmd}\"' return real_cmd diff --git a/ci/jarvis-util/jarvis_util/util/argparse.py b/ci/jarvis-util/jarvis_util/util/argparse.py index e25dfef69..e58486be5 100644 --- a/ci/jarvis-util/jarvis_util/util/argparse.py +++ b/ci/jarvis-util/jarvis_util/util/argparse.py @@ -1,3 +1,7 @@ +""" +This module contains an argument parser which defines +""" + import sys import os from abc import ABC, abstractmethod @@ -6,12 +10,19 @@ class ArgParse(ABC): + """ + A class for parsing command line arguments. 
+ Parsed menu name stored in self.menu_name + Parsed menu arguments stored in self.kwargs + Parsed remaining arguments stored in self.remainder + """ + def __init__(self, args=None, exit_on_fail=True): if args is None: args = sys.argv[1:] elif isinstance(args, str): args = shlex.split(args) - args = " ".join(args) + args = ' '.join(args) self.binary_name = os.path.basename(sys.argv[0]) self.orig_args = shlex.split(args) self.args = self.orig_args @@ -20,24 +31,58 @@ def __init__(self, args=None, exit_on_fail=True): self.menus = [] self.vars = {} self.remainder = None - self.pos_required = True + self.pos_required = False self.use_remainder = False self.menu = None + self.menu_name = None + self.kwargs = {} self.define_options() self._parse() @abstractmethod def define_options(self): + """ + User-defined options menu + + :return: + """ pass + def process_args(self): + """ + After args have been parsed, can call this function to process + the arguments. Assumes that derived ArgParse class has a function + for each menu option. + + :return: + """ + + func_name = self.menu_name.replace(' ', '_') + func = getattr(self, func_name) + func(self) + def add_menu(self, name=None, msg=None, use_remainder=False): + """ + A menu is a container of arguments. + + :param name: The name that appears in the CLI to trigger the menu. + Spaces indicate menu nesting. E.g., 'repo add' will trigger the + menu argparser only if 'repo' and 'add' appear next to each other + in the argument list. + :param msg: The message to print if the user selects an improper menu + in the CLI. + :param use_remainder: Whether or not the menu should store all remaining + arguments for further use later. 
+ :return: + """ + toks = [] if name is not None: toks = name.split() self.menus.append({ - 'name_str': " ".join(toks), + 'name_str': ' '.join(toks), 'name': toks, 'msg': msg, 'num_required': 0, @@ -45,13 +90,23 @@ def add_menu(self, name=None, msg=None, 'kw_opts': {}, 'use_remainder': use_remainder }) - self.pos_required = True + self.pos_required = False self.menu = self.menus[-1] def start_required(self): + """ + Define a set of required positional arguments. + + :return: None + """ self.pos_required = True def end_required(self): + """ + Finish the set of required positional arguments. + + :return: None + """ self.pos_required = False def add_arg(self, @@ -62,6 +117,23 @@ def add_arg(self, msg=None, action=None, aliases=None): + """ + Append an argument to a menu. + Arguments can either be positional or key-value. + Positional arguments do NOT start with a dash + Key-value arguments are separated by dashes + + :param name: The name of the argument. If name starts with a dash, + it will be interpreted as a positional arg + :param argtype: The type of the argument being stored. 
+ :param choices: The set of acceptable inputs as a list + :param default: The default value to store + :param msg: The help message to print if there is a problem + :param action: An action to execute if the argument exists + :param aliases: Other names for the same thing (list) + :return: + """ + # Add all aliases if aliases is not None: for alias in aliases: @@ -72,7 +144,7 @@ def add_arg(self, # Handle the specific boolean argument case is_kwarg = '-' in name if is_kwarg and argtype == bool: - self.add_bool_kw_arg(name, default, msg) + self._add_bool_kw_arg(name, default, msg) return # Add general argument menu = self.menu @@ -94,13 +166,25 @@ def add_arg(self, if self.pos_required: menu['num_required'] += 1 menu['pos_opts'].append(arg) - - def add_bool_kw_arg(self, - name, - default, - msg=None, - is_other=False, - dict_name=None): + self.kwargs[arg['dict_name']] = default + + def _add_bool_kw_arg(self, + name, + default, + msg=None, + is_other=False, + dict_name=None): + """ + Boolean arguments can be indicated using a +-. + + indicates true, - indicates false. + + :param name: The name of the boolean arg + :param default: Default value of the boolean arg + :param msg: Help message + :param is_other: Indicates this is an alias of the +- syntax. + :param dict_name: Name to make the argument in final kwargs + :return: None + """ menu = self.menu if dict_name is None: dict_name = self._get_opt_name(name, True) @@ -116,18 +200,31 @@ def add_bool_kw_arg(self, 'has_input': not is_other } if not is_other: - self.add_bool_kw_arg("--with-" + name.strip('-'), + self._add_bool_kw_arg('--with-' + name.strip('-'), True, msg, True, dict_name) - self.add_bool_kw_arg("--no-" + name.strip('-'), + self._add_bool_kw_arg('--no-' + name.strip('-'), False, msg, True, dict_name) self.pos_required = False menu['kw_opts'][name] = arg def _parse(self): + """ + Parse the CLI arguments. 
+ Will modify self.menu to indicate which menu is used + Will modify self.args to create a key-value store of arguments + + :return: None. + """ self.menus.sort(key=lambda x: len(x['name']), reverse=True) self._parse_menu() def _parse_menu(self): + """ + Determine which menu is used in the CLI. + + :return: Modify self.menu. No return value. + """ + self.menu = None for menu in self.menus: menu_name = menu['name'] @@ -162,6 +259,13 @@ def _set_defaults(self): self.__dict__[opt_info['dict_name']] = opt_info['default'] def _parse_pos_args(self): + """ + Parse positional arguments + Modify the self.kwargs dictionary + + :return: + """ + i = 0 args = self.args menu = self.menu @@ -184,11 +288,19 @@ def _parse_pos_args(self): arg = self._convert_opt(opt_name, opt_type, opt_choices, arg) # Set the argument - setattr(self, opt_dict_name, arg) + self.kwargs[opt_dict_name] = arg i += 1 return i def _parse_kw_args(self, i): + """ + Parse key-word arguments. + Modify the self.kwargs dictionary + + :param i: The starting index in the self.args list where kv pairs start + :return: + """ + menu = self.menu args = self.args while i < len(args): @@ -196,7 +308,7 @@ def _parse_kw_args(self, i): opt_name = args[i] if opt_name not in menu['kw_opts']: if self.use_remainder: - self.remainder = " ".join(args[i:]) + self.remainder = ' '.join(args[i:]) return else: self._invalid_kwarg(opt_name) @@ -230,10 +342,11 @@ def _parse_kw_args(self, i): arg = self._convert_opt(opt_name, opt_type, opt_choices, arg) # Set the argument - setattr(self, opt_dict_name, arg) + self.kwargs[opt_dict_name] = arg def _convert_opt(self, opt_name, opt_type, opt_choices, arg): if opt_type is not None: + # pylint: disable=W0702 try: arg = opt_type(arg) if opt_choices is not None: @@ -241,6 +354,7 @@ def _convert_opt(self, opt_name, opt_type, opt_choices, arg): self._invalid_choice(opt_name, arg) except: self._invalid_type(opt_name, opt_type) + # pylint: enable=W0702 return arg def _next_is_kw_value(self, i): @@ 
-249,6 +363,17 @@ def _next_is_kw_value(self, i): return self.args[i + 1] not in self.menu['kw_opts'] def _get_opt_name(self, opt_name, is_bool_arg=False): + """ + Normalize option names + '-' are converted into '_' + '--with-' and '--no-' are removed + '+' and '-' for boolean args are removed + + :param opt_name: The menu option name + :param is_bool_arg: Whether the arg is a boolean arg + :return: + """ + if not is_bool_arg: return opt_name.strip('-').replace('-', '_') else: @@ -257,31 +382,32 @@ def _get_opt_name(self, opt_name, is_bool_arg=False): strip('-').replace('-', '_') def _invalid_menu(self): - self._print_error(f"Could not find a menu") + self._print_error('Could not find a menu') def _invalid_choice(self, opt_name, arg): - self._print_menu_error(f"{opt_name}={arg} is not a valid choice") + self._print_menu_error(f'{opt_name}={arg} is not a valid choice') def _missing_positional(self, opt_name): - self._print_menu_error(f"{opt_name} was required, but not defined") + self._print_menu_error(f'{opt_name} was required, but not defined') def _invalid_kwarg(self, opt_name): - self._print_menu_error(f"{opt_name} is not a valid key-word argument") + self._print_menu_error(f'{opt_name} is not a valid key-word argument') def _invalid_kwarg_default(self, opt_name): - self._print_menu_error(f"{opt_name} was not given a value, but requires one") + self._print_menu_error( + f'{opt_name} was not given a value, but requires one') def _invalid_type(self, opt_name, opt_type): - self._print_menu_error(f"{opt_name} was not of type {opt_type}") + self._print_menu_error(f'{opt_name} was not of type {opt_type}') def _print_menu_error(self, msg): - self._print_error(f"{self.menu['name_str']} {msg}") + self._print_error(f'{self.menu["name_str"]} {msg}') def _print_error(self, msg): - print(f"{msg}") + print(f'{msg}') self._print_help() if self.exit_on_fail: - exit(1) + sys.exit(1) else: raise Exception(msg) @@ -297,18 +423,18 @@ def _print_menus(self): 
self._print_menu_help(True) def _print_menu_help(self, only_usage=False): - if self.menu['msg'] is not None: - print(self.menu['msg']) - print() pos_args = [] for arg in self.menu['pos_opts']: if arg['required']: - pos_args.append(f"[{arg['name']}]") + pos_args.append(f'[{arg["name"]}]') else: - pos_args.append(f"[{arg['name']} (opt)]") - pos_args = " ".join(pos_args) + pos_args.append(f'[{arg["name"]} (opt)]') + pos_args = ' '.join(pos_args) menu_str = self.menu['name_str'] - print(f"USAGE: {self.binary_name} {menu_str} {pos_args} ...") + print(f'USAGE: {self.binary_name} {menu_str} {pos_args} ...') + if self.menu['msg'] is not None: + print(self.menu['msg']) + print() if only_usage: return diff --git a/ci/jarvis-util/jarvis_util/util/expand_env.py b/ci/jarvis-util/jarvis_util/util/expand_env.py new file mode 100644 index 000000000..a9d8aefa7 --- /dev/null +++ b/ci/jarvis-util/jarvis_util/util/expand_env.py @@ -0,0 +1,25 @@ +""" +This module contains functions for expanding environment variables for +dictionaries +""" + +import os + + +def expand_env(data): + """ + Expand environment variables for dictionaries + + :param data: A dict where strings may contain environment variables to + expand + :return: + """ + if isinstance(data, str): + return os.path.expandvars(data) + if isinstance(data, dict): + for key, val in data.items(): + data[key] = expand_env(val) + if isinstance(data, (list, tuple)): + for i, val in enumerate(data): + data[i] = expand_env(val) + return data diff --git a/ci/jarvis-util/jarvis_util/util/hostfile.py b/ci/jarvis-util/jarvis_util/util/hostfile.py index 69db0ebb0..4d22ae233 100644 --- a/ci/jarvis-util/jarvis_util/util/hostfile.py +++ b/ci/jarvis-util/jarvis_util/util/hostfile.py @@ -1,47 +1,57 @@ +""" +This module contains methods for parsing hostfiles and storing hosts +""" + import os import socket import re import itertools -import copy class Hostfile: - def __init__(self, hostfile=None, all_hosts=None): + """ + Parse a hostfile or store 
a set of hosts passed in manually. + """ + + def __init__(self, hostfile=None, all_hosts=None, all_hosts_ip=None, + text=None, find_ips=True): """ Constructor. Parse hostfile or store existing host list. :param hostfile: The path to the hostfile - :param all_hosts: a list of strings representing all hosts + :param all_hosts: a list of strings representing all hostnames + :param all_hosts_ip: a list of strings representing all host IPs + :param text: Text of a hostfile + :param find_ips: Whether to construct host_ip and all_host_ip fields """ self.hosts_ip = [] self.hosts = [] self.all_hosts = [] self.all_hosts_ip = [] self.path = hostfile + self.find_ips = find_ips + + # Set the host ips directly + if all_hosts_ip is not None: + self.all_hosts_ip = all_hosts_ip + self.hosts_ip = all_hosts_ip + self.find_ips = False # Direct constructor if all_hosts is not None: self._set_hosts(all_hosts) - # Hostfile constructor + # From hostfile path elif hostfile is not None: self._load_hostfile(self.path) - def parse(self, text, set_hosts=False): - """ - Parse a line of a hostfile. Used mainly for unit tests. 
- - :param text: A line of the hostfile - :param set_hosts: Whether or not to set hosts - :return: - """ + # From hostfile text + elif text is not None: + self.parse(text) - hosts = [] - self._expand_line(hosts, text) - if set_hosts: - self._set_hosts(hosts) + # Both hostfile and hosts are None else: - self.hosts = hosts + self._set_hosts(['localhost']) def _load_hostfile(self, path): """ @@ -51,16 +61,28 @@ def _load_hostfile(self, path): :return: """ if not os.path.exists(path): - raise Exception("hostfile not found") - hosts = [] - with open(path, 'r') as fp: - lines = fp.read().splitlines() - for line in lines: - self._expand_line(hosts, line) + raise Exception('hostfile not found') self.path = path - self._set_hosts(hosts) + with open(path, 'r', encoding='utf-8') as fp: + text = fp.read() + self.parse(text) return self + def parse(self, text): + """ + Parse a hostfile text. + + :param text: Hostfile text + :param set_hosts: Whether or not to set hosts + :return: None + """ + + lines = text.strip().splitlines() + hosts = [] + for line in lines: + self._expand_line(hosts, line) + self._set_hosts(hosts) + def _expand_line(self, hosts, line): """ Will expand brackets in a host declaration. 
@@ -70,7 +92,7 @@ def _expand_line(self, hosts, line): :param line: the line to parse :return: None """ - toks = re.split('[\[\]]', line) + toks = re.split(r'[\[\]]', line) brkts = [tok for i, tok in enumerate(toks) if i % 2 == 1] num_set = [] @@ -88,7 +110,7 @@ def _expand_line(self, hosts, line): host.append(host_num[int(i/2)]) else: host.append(tok) - hosts.append("".join(host)) + hosts.append(''.join(host)) def _expand_set(self, num_set, brkt): """ @@ -129,13 +151,16 @@ def _product(self, num_set): :param num_set: The numbers to product :return: """ - return [element for element in itertools.product(*num_set)] + return list(itertools.product(*num_set)) def _set_hosts(self, all_hosts): self.all_hosts = all_hosts - self.all_hosts_ip = [socket.gethostbyname(host) for host in all_hosts] + if self.find_ips: + self.all_hosts_ip = [socket.gethostbyname(host) + for host in all_hosts] self.hosts = self.all_hosts - self.hosts_ip = self.all_hosts_ip + if self.find_ips: + self.hosts_ip = self.all_hosts_ip return self def subset(self, count): @@ -150,15 +175,12 @@ def subset(self, count): def is_subset(self): return len(self.hosts) != len(self.all_hosts) - def path(self): - return self.path - def save(self, path): self.all_hosts = self.hosts self.all_hosts_ip = self.hosts_ip self.path = path - with open(path, 'w') as fp: - fp.write("\n".join(self.all_hosts)) + with open(path, 'w', encoding='utf-8') as fp: + fp.write('\n'.join(self.all_hosts)) return self def ip_list(self): @@ -187,3 +209,8 @@ def __str__(self): def __repr__(self): return str(self) + + def __eq__(self, other): + return (self.hosts == other.hosts and + self.all_hosts == other.all_hosts) + diff --git a/ci/jarvis-util/jarvis_util/util/import_all.py b/ci/jarvis-util/jarvis_util/util/import_all.py new file mode 100644 index 000000000..8a10e6125 --- /dev/null +++ b/ci/jarvis-util/jarvis_util/util/import_all.py @@ -0,0 +1,72 @@ +""" +This file contains methods to automate large import __init__.py files +""" + 
+import pathlib +import os + + +def _import_recurse(root_path, root, stmts): + """ + Identify the set of files in the current "root" directory + + :param root_path: The path to the root of the python package + :param root: The current subdirectory of the python package + :param stmts: The current set of import statements + :return: + """ + for file in os.listdir(root): + file = os.path.join(root, file) + if os.path.isfile(file): + file = os.path.relpath(file, root_path) + ext = file.split('.') + if ext[-1] == 'py': + toks = ext[0].split('/') + if toks[-1] == '__init__': + continue + import_stmt = '.'.join(toks) + stmts.append(f'from {import_stmt} import *') + elif os.path.isdir(file): + _import_recurse(root_path, file, stmts) + return stmts + + +def import_all(root_path, root): + """ + Create all import statement to do: from root import *. + + :param root_path: The root of the python repo + :param root: The current directory we are in within the repo + :return: + """ + stmts = [] + _import_recurse(root_path, root, stmts) + return '\"\"\"Import all modules\"\"\"\n' + '\n'.join(stmts) + '\n' + + +def build_global_import_file(root_path, pkg_name): + """ + Build a file to be able to do: from pkg_name import * + + :param root_path: The path to the python package's root directory + :param pkg_name: The name of the python package + :return: + """ + path = os.path.join(root_path, pkg_name) + imports = import_all(root_path, path) + with open(os.path.join(path, '__init__.py'), 'w', + encoding='utf-8') as fp: + fp.write(imports) + + +def build_global_import_from_bin(pkg_name): + """ + Build a file to be able to do: from pkg_name import * + This function is assumed to be called in the "bin" directory + of the main python repo + + :param pkg_name: The name of the python package being built + :return: + """ + root_path = str(pathlib.Path(__file__).parent.parent.parent.resolve()) + build_global_import_file(root_path, pkg_name) diff --git 
a/ci/jarvis-util/jarvis_util/util/import_mod.py b/ci/jarvis-util/jarvis_util/util/import_mod.py index b7f5cda05..9c55c508d 100644 --- a/ci/jarvis-util/jarvis_util/util/import_mod.py +++ b/ci/jarvis-util/jarvis_util/util/import_mod.py @@ -1,8 +1,13 @@ +""" +This file contains helper methods to load a class dynamically from a file +""" + import sys # NOTE(llogan): To get the path of the directory this file is in, use # str(pathlib.Path(__file__).parent.resolve()) + def load_class(import_str, path, class_name): """ Loads a class from a python file. diff --git a/ci/jarvis-util/jarvis_util/util/naming.py b/ci/jarvis-util/jarvis_util/util/naming.py index e40aa96bf..af6505633 100644 --- a/ci/jarvis-util/jarvis_util/util/naming.py +++ b/ci/jarvis-util/jarvis_util/util/naming.py @@ -1,5 +1,11 @@ +""" +This module contains methods to create strings which follow a particular +naming convention. +""" + import re + def to_camel_case(string): """ Convert a string in snake case to camel case @@ -9,9 +15,10 @@ def to_camel_case(string): """ if string is None: return - words = re.sub(r"(_|-)+", " ", string).split() + words = re.sub(r'(_|-)+', ' ', string).split() words = [word.capitalize() for word in words] - return "".join(words) + return ''.join(words) + def to_snake_case(string): """ @@ -23,5 +30,5 @@ def to_snake_case(string): return words = re.split('([A-Z][a-z0-9_]*)', string) words = [word for word in words if len(word)] - string = "_".join(words) + string = '_'.join(words) return string.lower() diff --git a/ci/jarvis-util/jarvis_util/util/size_conv.py b/ci/jarvis-util/jarvis_util/util/size_conv.py index b4b2a705c..266959bf0 100644 --- a/ci/jarvis-util/jarvis_util/util/size_conv.py +++ b/ci/jarvis-util/jarvis_util/util/size_conv.py @@ -1,35 +1,44 @@ +""" +This module provides methods to convert a semantic size string to an integer. +""" + + class SizeConv: + """ + A class which provides methods to convert a semantic size string to an int. 
+ """ + @staticmethod def to_int(text): text = text.lower() if 'k' in text: - return SizeConv.KB(text) + return SizeConv.kb(text) if 'm' in text: - return SizeConv.MB(text) + return SizeConv.mb(text) if 'g' in text: - return SizeConv.GB(text) + return SizeConv.gb(text) if 't' in text: - return SizeConv.TB(text) + return SizeConv.tb(text) if 'p' in text: - return SizeConv.PB(text) + return SizeConv.pb(text) return int(text) @staticmethod - def KB(num): - return int(num.split('k')[0]) * (1 << 10) + def kb(num): + return int(float(num.split('k')[0]) * (1 << 10)) @staticmethod - def MB(num): - return int(num.split('m')[0]) * (1 << 20) + def mb(num): + return int(float(num.split('m')[0]) * (1 << 20)) @staticmethod - def GB(num): - return int(num.split('g')[0]) * (1 << 30) + def gb(num): + return int(float(num.split('g')[0]) * (1 << 30)) @staticmethod - def TB(num): - return int(num.split('t')[0]) * (1 << 40) + def tb(num): + return int(float(num.split('t')[0]) * (1 << 40)) @staticmethod - def PB(num): - return int(num.split('p')[0]) * (1 << 50) + def pb(num): + return int(float(num.split('p')[0]) * (1 << 50)) diff --git a/ci/jarvis-util/requirements.txt b/ci/jarvis-util/requirements.txt index e749cd628..70849fea6 100644 --- a/ci/jarvis-util/requirements.txt +++ b/ci/jarvis-util/requirements.txt @@ -1,2 +1,9 @@ -psutil -pyyaml \ No newline at end of file +pyyaml +pylint==2.15.0 +coverage==5.5 +# coverage +coverage-lcov==0.2.4 +# coverage-lcov +pytest==6.2.5 +pandas +tabulate \ No newline at end of file diff --git a/ci/jarvis-util/test/unit/argparse_main.py b/ci/jarvis-util/test/unit/argparse_main.py new file mode 100644 index 000000000..7b636ca60 --- /dev/null +++ b/ci/jarvis-util/test/unit/argparse_main.py @@ -0,0 +1,4 @@ +from jarvis_util.util.argparse import ArgParse + +if __name__ == 'main': + args = ArgParse() \ No newline at end of file diff --git a/ci/jarvis-util/test/unit/print10s.py b/ci/jarvis-util/test/unit/print5s.py similarity index 59% rename from 
ci/jarvis-util/test/unit/print10s.py rename to ci/jarvis-util/test/unit/print5s.py index df7c7bd66..a6581b31c 100644 --- a/ci/jarvis-util/test/unit/print10s.py +++ b/ci/jarvis-util/test/unit/print5s.py @@ -1,8 +1,12 @@ +""" +NOTE: this is a helper utility for test_local_exec +""" + import time import sys -for i in range(10): +for i in range(5): sys.stdout.write(f"COUT: {i}\n") sys.stderr.write(f"CERR: {i}\n") time.sleep(1) \ No newline at end of file diff --git a/ci/jarvis-util/test/unit/printNone.py b/ci/jarvis-util/test/unit/printNone.py new file mode 100644 index 000000000..cf728ec39 --- /dev/null +++ b/ci/jarvis-util/test/unit/printNone.py @@ -0,0 +1,4 @@ +from jarvis_util.shell.local_exec import LocalExec, LocalExecInfo + +spawn_info = LocalExecInfo(hide_output=True) +LocalExec("echo hello", spawn_info) diff --git a/ci/jarvis-util/test/unit/test_argparse.py b/ci/jarvis-util/test/unit/test_argparse.py new file mode 100644 index 000000000..3c3f3d32f --- /dev/null +++ b/ci/jarvis-util/test/unit/test_argparse.py @@ -0,0 +1,10 @@ +from jarvis_util.util.argparse import ArgParse +from jarvis_util.shell.exec import Exec +from jarvis_util.shell.local_exec import LocalExecInfo +import pathlib +from unittest import TestCase + + +class TestArgparse(TestCase): + def test_argparse_main(self): + pass \ No newline at end of file diff --git a/ci/jarvis-util/test/unit/test_fi_info.py b/ci/jarvis-util/test/unit/test_fi_info.py deleted file mode 100644 index 9ff99668b..000000000 --- a/ci/jarvis-util/test/unit/test_fi_info.py +++ /dev/null @@ -1,4 +0,0 @@ -from jarvis_util.shell.local_exec import LocalExec -# from jarvis_util.introspect.fi_info import FiInfo - -print(LocalExec("fi_info").stdout) \ No newline at end of file diff --git a/ci/jarvis-util/test/unit/test_hostfile.py b/ci/jarvis-util/test/unit/test_hostfile.py index e8c39d1b1..47bd7c853 100644 --- a/ci/jarvis-util/test/unit/test_hostfile.py +++ b/ci/jarvis-util/test/unit/test_hostfile.py @@ -1,62 +1,57 @@ from 
jarvis_util.util.hostfile import Hostfile import pathlib - - -def test1(): - host = Hostfile() - host.parse('0') - assert(len(host.hosts) == 1) - assert(host.hosts[0] == '0') - - -def test2(): - host = Hostfile() - host.parse('ares-comp-01') - assert(len(host.hosts) == 1) - assert(host.hosts[0] == 'ares-comp-01') - - -def test3(): - host = Hostfile() - host.parse('ares-comp-[01-04]-40g') - assert(len(host.hosts) == 4) - assert(host.hosts[0] == 'ares-comp-01-40g') - assert(host.hosts[1] == 'ares-comp-02-40g') - assert(host.hosts[2] == 'ares-comp-03-40g') - assert(host.hosts[3] == 'ares-comp-04-40g') - - -def test4(): - host = Hostfile() - host.parse('ares-comp-[01-02]-40g-[01-02]') - assert(len(host.hosts) == 4) - assert(host.hosts[0] == 'ares-comp-01-40g-01') - assert(host.hosts[1] == 'ares-comp-01-40g-02') - assert(host.hosts[2] == 'ares-comp-02-40g-01') - assert(host.hosts[3] == 'ares-comp-02-40g-02') - - -def test5(): - host = Hostfile() - host.parse('ares-comp-[01-02]-40g-[01-02]') - host = host.subset(3) - assert(len(host.hosts) == 3) - assert(host.is_subset()) - assert(host.hosts[0] == 'ares-comp-01-40g-01') - assert(host.hosts[1] == 'ares-comp-01-40g-02') - assert(host.hosts[2] == 'ares-comp-02-40g-01') - - -def test6(): - HERE = str(pathlib.Path(__file__).parent.resolve()) - # host = Hostfile(hostfile=f"{HERE}/test_hostfile.txt") - # print(host.hosts) - # assert(len(host.hosts) == 15) - - -test1() -test2() -test3() -test4() -test5() -test6() +from unittest import TestCase + + +class TestHostfile(TestCase): + def test_no_expand_int(self): + host = Hostfile(text='0', find_ips=False) + self.assertTrue(len(host.hosts) == 1) + self.assertTrue(host.hosts[0] == '0') + + def test_no_expand(self): + host = Hostfile(text='ares-comp-01', find_ips=False) + self.assertTrue(len(host.hosts) == 1) + self.assertTrue(host.hosts[0] == 'ares-comp-01') + + def test_expand_set(self): + host = Hostfile(text='ares-comp-[01-04]-40g', find_ips=False) + 
self.assertTrue(len(host.hosts) == 4) + self.assertTrue(host.hosts[0] == 'ares-comp-01-40g') + self.assertTrue(host.hosts[1] == 'ares-comp-02-40g') + self.assertTrue(host.hosts[2] == 'ares-comp-03-40g') + self.assertTrue(host.hosts[3] == 'ares-comp-04-40g') + + def test_expand_two_sets(self): + host = Hostfile(text='ares-comp-[01-02]-40g-[01-02]', find_ips=False) + self.assertTrue(len(host.hosts) == 4) + self.assertTrue(host.hosts[0] == 'ares-comp-01-40g-01') + self.assertTrue(host.hosts[1] == 'ares-comp-01-40g-02') + self.assertTrue(host.hosts[2] == 'ares-comp-02-40g-01') + self.assertTrue(host.hosts[3] == 'ares-comp-02-40g-02') + + def test_subset(self): + host = Hostfile(text='ares-comp-[01-02]-40g-[01-02]', find_ips=False) + host = host.subset(3) + self.assertTrue(len(host.hosts) == 3) + self.assertTrue(host.is_subset()) + self.assertTrue(host.hosts[0] == 'ares-comp-01-40g-01') + self.assertTrue(host.hosts[1] == 'ares-comp-01-40g-02') + self.assertTrue(host.hosts[2] == 'ares-comp-02-40g-01') + + def test_read_hostfile(self): + HERE = str(pathlib.Path(__file__).parent.resolve()) + hf = Hostfile(hostfile=f'{HERE}/test_hostfile.txt', find_ips=False) + print(hf.hosts) + self.assertEqual(len(hf), 15) + + def test_save_hostfile(self): + HERE = str(pathlib.Path(__file__).parent.resolve()) + hf = Hostfile(hostfile=f'{HERE}/test_hostfile.txt', find_ips=False) + hf_sub = hf.subset(4) + self.assertEqual(len(hf_sub), 4) + hf_sub.save('/tmp/test_hostfile.txt') + hf_sub_reload = Hostfile(hostfile=f'/tmp/test_hostfile.txt', + find_ips=False) + self.assertEqual(len(hf_sub_reload), 4) + self.assertEqual(hf_sub, hf_sub_reload) diff --git a/ci/jarvis-util/test/unit/test_hostfile.txt b/ci/jarvis-util/test/unit/test_hostfile.txt index 9fdff0c56..7ecc164e9 100644 --- a/ci/jarvis-util/test/unit/test_hostfile.txt +++ b/ci/jarvis-util/test/unit/test_hostfile.txt @@ -1 +1 @@ -ares-comp-[02-15]-40g \ No newline at end of file +ares-comp-[01-10,11,12-15]-40g \ No newline at end of file 
diff --git a/ci/jarvis-util/test/unit/test_local_exec.py b/ci/jarvis-util/test/unit/test_local_exec.py index 64b93e0a5..c0ced8c3b 100644 --- a/ci/jarvis-util/test/unit/test_local_exec.py +++ b/ci/jarvis-util/test/unit/test_local_exec.py @@ -2,27 +2,64 @@ import os from jarvis_util.shell.local_exec import LocalExec, LocalExecInfo from jarvis_util.shell.exec import Exec +from unittest import TestCase -ret = LocalExec("echo hello", LocalExecInfo()) -assert(str(ret.stdout).strip() != "hello") +class TestLocalExec(TestCase): + def _setup_files(self): + self.stdout = '/tmp/test_out.log' + self.stderr = '/tmp/test_err.log' + try: + os.remove(self.stdout) + except OSError: + pass + try: + os.remove(self.stderr) + except: + pass -ret = LocalExec("echo hello", LocalExecInfo()) -ret = LocalExec("echo hello", LocalExecInfo(hide_output=True)) -ret = LocalExec("echo hello", LocalExecInfo(pipe_stdout='/tmp/test.log', - hide_output=True, - collect_output=True)) -assert(str(ret.stdout).strip() == "hello") + def test_default(self): + ret = Exec("echo hello") + self.assertEqual(ret.exit_code, 0) + self.assertEqual(len(ret.stdout['localhost']), 0) -# node = Exec('gcc -print-file-name=libasan.so', -# LocalExecInfo(collect_output=True, hide_output=True)) -# assert(node.stdout == '/usr/lib/gcc/x86_64-linux-gnu/9/libasan.so') + def test_pipe_stdout(self): + self._setup_files() + spawn_info = LocalExecInfo(pipe_stdout=self.stdout, + pipe_stderr=self.stderr, + collect_output=True) + ret = Exec("echo hello", spawn_info) + self.assertEqual(ret.stdout['localhost'].strip(), "hello") + self.assertEqual(ret.stderr['localhost'].strip(), "") + self.assertFile(self.stdout, "hello") + self.assertFile(self.stderr, "") + def test_hide_stdout(self): + HERE = str(pathlib.Path(__file__).parent.resolve()) + PRINTNONE = os.path.join(HERE, 'printNone.py') + spawn_info = LocalExecInfo(collect_output=True) + ret = Exec(f"python3 {PRINTNONE}", spawn_info) + self.assertEqual(ret.stdout['localhost'].strip(), "") 
+ self.assertEqual(ret.stderr['localhost'].strip(), "") -HERE = str(pathlib.Path(__file__).parent.resolve()) -PRINT10s = os.path.join(HERE, 'print10s.py') -ret = LocalExec(f"python3 {PRINT10s}", - LocalExecInfo(collect_output=True, - pipe_stderr='/tmp/stderr.txt', - pipe_stdout='/tmp/stdout.txt')) + def test_periodic_print(self): + self._setup_files() + HERE = str(pathlib.Path(__file__).parent.resolve()) + PRINT5s = os.path.join(HERE, 'print5s.py') + ret = Exec(f"python3 {PRINT5s}", + LocalExecInfo(pipe_stdout=self.stdout, + pipe_stderr=self.stderr)) + stdout_data = "\n".join([f"COUT: {i}" for i in range(5)]) + stderr_data = "\n".join([f"CERR: {i}" for i in range(5)]) + self.assertFile(self.stdout, stdout_data) + self.assertFile(self.stderr, stderr_data) + def assertFile(self, path, data, strip=True): + self.assertTrue(os.path.exists(path)) + with open(path, 'r') as fp: + if strip: + data = data.strip() + file_data = fp.read().strip() + else: + file_data = fp.read() + self.assertEqual(data, file_data) diff --git a/ci/jarvis-util/test/unit/test_system_info.py b/ci/jarvis-util/test/unit/test_system_info.py new file mode 100644 index 000000000..a9b6c9d91 --- /dev/null +++ b/ci/jarvis-util/test/unit/test_system_info.py @@ -0,0 +1,137 @@ +from jarvis_util.util.argparse import ArgParse +from jarvis_util.shell.exec import Exec +from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util.util.hostfile import Hostfile +from jarvis_util.introspect.system_info import Lsblk, \ + ListFses, FiInfo, Blkid, ResourceGraph, StorageDeviceType +from jarvis_util.util.size_conv import SizeConv +import pathlib +import itertools +from unittest import TestCase + + +class TestSystemInfo(TestCase): + def test_lsblk(self): + Lsblk(LocalExecInfo(hide_output=True)) + + def test_list_fses(self): + ListFses(LocalExecInfo(hide_output=True)) + + def test_fi_info(self): + FiInfo(LocalExecInfo(hide_output=True)) + + def test_blkid(self): + Blkid(LocalExecInfo(hide_output=True)) + + def 
test_resource_graph(self): + rg = ResourceGraph() + rg.build(LocalExecInfo(hide_output=True)) + rg.save('/tmp/resource_graph.yaml') + rg.load('/tmp/resource_graph.yaml') + rg.filter_fs(r'/$', '/${USER}', 'NVME') + rg.filter_hosts(Hostfile(), '1gbps') + rg.save('/tmp/resource_graph.yaml') + + def test_custom_resource_graph(self): + rg = ResourceGraph() + all_hosts = ['host1', 'host2', 'host3'] + all_hosts_ip = ['192.168.1.0', '192.168.1.1', '192.168.1.2'] + providers = ['tcp', 'ib', 'roce'] + hosts = Hostfile(all_hosts=all_hosts, all_hosts_ip=all_hosts_ip) + + # Add networks for each node + rg.set_hosts(hosts) + for provider in providers: + rg.add_net(hosts, + provider=provider) + rg.add_net(hosts.subset(1), + provider='uncommon') + + # Add common storage for each node + rg.add_storage(hosts, + device='/dev/sda1', + mount='/', + tran='sata', + rota=True, + size=SizeConv.to_int('10g'), + shared=False) + rg.add_storage(hosts, + device='/dev/sda2', + mount='/mnt/hdd/$USER', + tran='sata', + rota=True, + size=SizeConv.to_int('200g'), + shared=False) + rg.add_storage(hosts, + device='/dev/sdb1', + mount='/mnt/ssd/$USER', + tran='sata', + rota=False, + size=SizeConv.to_int('50g'), + shared=False) + rg.add_storage(hosts, + device='/dev/nvme0n1', + mount='/mnt/nvme/$USER', + tran='nvme', + rota=False, + size=SizeConv.to_int('100g'), + shared=False) + rg.add_storage(hosts.subset(1), + device='/dev/nvme0n2', + mount='/mnt/nvme2/$USER', + tran='nvme', + rota=False, + size=SizeConv.to_int('10g'), + shared=False) + rg.add_storage(hosts, + device='/dev/nvme0n3', + tran='nvme', + rota=False, + size=SizeConv.to_int('100g'), + shared=False) + + # Filter only mounts in '/mnt' + rg.filter_fs('/mnt/*') + + # Apply changes + rg.apply() + + # Find all mounted NVMes + df = rg.find_storage([StorageDeviceType.NVME]) + self.assertTrue(len(df[df.tran == 'nvme']) == 4) + self.assertTrue(len(df[df.tran == 'sata']) == 0) + self.assertTrue(len(df) == 4) + + # Find all mounted & common NVMes and 
SSDs + df = rg.find_storage([StorageDeviceType.NVME, + StorageDeviceType.SSD], + common=True) + self.assertTrue(len(df[df.tran == 'nvme']) == 3) + self.assertTrue(len(df[df.tran == 'sata']) == 3) + self.assertTrue(len(df) == 6) + + # Select a single nvme per-node + df = rg.find_storage([StorageDeviceType.NVME, + StorageDeviceType.SSD], + common=True, + count_per_node=1) + self.assertTrue(len(df[df.tran == 'nvme']) == 3) + self.assertTrue(len(df[df.tran == 'sata']) == 0) + self.assertTrue(len(df) == 3) + + # Select a single nvme and ssd per-node + df = rg.find_storage([StorageDeviceType.NVME, + StorageDeviceType.SSD], + common=True, + count_per_dev=1) + self.assertTrue(len(df[df.tran == 'nvme']) == 3) + self.assertTrue(len(df[df.tran == 'sata']) == 3) + self.assertTrue(len(df) == 6) + + # Find common networks between hosts + df = rg.find_net_info(hosts) + self.assertTrue(len(df) == 9) + + # Find common tcp networks + df = rg.find_net_info(hosts, providers='tcp') + self.assertTrue(len(df) == 3) diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index 7634b4082..78fca895d 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -1,11 +1,4 @@ -from jarvis_util.jutil_manager import JutilManager -from jarvis_util.shell.kill import Kill -from jarvis_util.shell.exec_info import ExecType, ExecInfo -from jarvis_util.shell.local_exec import LocalExecInfo -from jarvis_util.shell.mpi_exec import MpiExecInfo -from jarvis_util.shell.pssh_exec import PsshExecInfo -from jarvis_util.shell.rm import Rm -from jarvis_util.shell.exec import Exec +from jarvis_util import * import time import os, sys import pathlib @@ -15,11 +8,10 @@ class SpawnInfo(MpiExecInfo): def __init__(self, nprocs, hermes_conf=None, hermes_mode=None, api=None, - daemon_env=None, **kwargs): + **kwargs): super().__init__(nprocs=nprocs, **kwargs) self.hermes_conf = hermes_conf self.hermes_mode = hermes_mode - 
self.daemon_env = daemon_env self.api = api @@ -110,10 +102,6 @@ def spawn_info(self, nprocs=None, ppn=None, hostfile=None, if hermes_mode == 'kBypass': env['HERMES_ADAPTER_MODE'] = 'kBypass' - daemon_env = env.copy() - if 'LD_PRELOAD' in daemon_env and self.ADDRESS_SANITIZER: - del daemon_env['LD_PRELOAD'] - return SpawnInfo(nprocs=nprocs, ppn=ppn, hostfile=hostfile, @@ -121,7 +109,6 @@ def spawn_info(self, nprocs=None, ppn=None, hostfile=None, hermes_mode=hermes_mode, api=api, env=env, - daemon_env=daemon_env, cwd=cwd) @abstractmethod @@ -140,7 +127,7 @@ def spawn_all_nodes(self): def cleanup(self): dirs = " ".join([os.path.join(d, '*') for d in self.devices.values()]) - Rm(dirs, PsshExecInfo(hostfile=self.spawn_all_nodes().hostfile)) + Rm(dirs, LocalExecInfo(hostfile=self.spawn_all_nodes().hostfile)) def find_tests(self): # Filter the list to include only attributes that start with "test" @@ -181,15 +168,15 @@ def start_daemon(self, spawn_info): :return: None """ Kill("hermes", - PsshExecInfo( + LocalExecInfo( hostfile=spawn_info.hostfile, collect_output=False)) print("Start daemon") self.daemon = Exec(f"{self.CMAKE_BINARY_DIR}/bin/hermes_daemon", - PsshExecInfo( + LocalExecInfo( hostfile=spawn_info.hostfile, - env=spawn_info.daemon_env, + env=spawn_info.basic_env, exec_async=True)) time.sleep(5) print("Launched") @@ -204,6 +191,6 @@ def stop_daemon(self, spawn_info): print("Stop daemon") Exec(f"{self.CMAKE_BINARY_DIR}/bin/finalize_hermes", LocalExecInfo( - env=spawn_info.daemon_env)) + env=spawn_info.basic_env)) self.daemon.wait() print("Stopped daemon") diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 3ef813cb5..bff8fc1d9 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -54,7 +54,7 @@ void BorgIoThreadManager::SpawnFlushWorkers(int num_threads) { while (HERMES_BORG_IO_THREAD_MANAGER->Alive() || (!HERMES_BORG_IO_THREAD_MANAGER->Alive() && bq_info.load_)) { borg->LocalProcessFlushes(bq_info, queue); - 
tl::thread::self().sleep(*HERMES->rpc_.server_engine_, 1); + tl::thread::self().sleep(*HERMES->rpc_.server_engine_, 25); } HILOG(kDebug, "Flushing worker {} has stopped", bq_info.id_) }; diff --git a/src/config_server_default.h b/src/config_server_default.h index 5c5557bcb..8b2859281 100644 --- a/src/config_server_default.h +++ b/src/config_server_default.h @@ -79,6 +79,12 @@ const char* kServerDefaultConfigStr = " is_shared_device: true\n" " borg_capacity_thresh: [ 0.0, 1.0 ]\n" "\n" +"# Define the maximum amount of memory Hermes can use for non-buffering tasks.\n" +"# This includes metadata management and memory allocations.\n" +"# This memory will not be preallocated, so if you don\'t know, you can set it\n" +"# to be high.\n" +"max_memory: 8g\n" +"\n" "### Define properties of RPCs\n" "rpc:\n" " # A path to a file containing a list of server names, 1 per line. If your\n" diff --git a/test/tests.py b/test/tests.py index 0ac387de5..4c7921189 100644 --- a/test/tests.py +++ b/test/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class NativeTestManager(TestManager): diff --git a/wrapper/java/tests.py b/wrapper/java/tests.py index 4fc13763c..b9fae8a53 100644 --- a/wrapper/java/tests.py +++ b/wrapper/java/tests.py @@ -1,6 +1,5 @@ from py_hermes_ci.test_manager import TestManager -from jarvis_util.shell.exec import Exec -from jarvis_util.shell.local_exec import LocalExecInfo +from jarvis_util import * class JavaWrapperTestManager(TestManager): From 92e7c1907f00768c217b950dce7c44d0916dcb50 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 01:17:34 -0500 Subject: [PATCH 28/44] test only posix in action --- .github/workflows/main.yml | 2 +- adapter/test/posix/tests.py | 2 +- ci/build_hermes.sh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml 
b/.github/workflows/main.yml index 2807db199..fbb99bee2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -76,7 +76,7 @@ jobs: run: ci/build_hermes.sh - name: Test - run: cd build && ctest -VV + run: cd build && ctest -VV -R test_hermes_posix_basic_small - name: Install run: pushd build && make install && popd diff --git a/adapter/test/posix/tests.py b/adapter/test/posix/tests.py index 6ddf5bfb8..b29b4300f 100644 --- a/adapter/test/posix/tests.py +++ b/adapter/test/posix/tests.py @@ -24,7 +24,7 @@ def test_posix_basic(self): def test_hermes_posix_basic_small(self): posix_cmd = f"{self.HERMES_POSIX_CMD} " \ - f"~[request_size=range-large] " \ + f"~[request_size=range-small] " \ f"--reporter compact -d yes" spawn_info = self.spawn_info(nprocs=1, diff --git a/ci/build_hermes.sh b/ci/build_hermes.sh index 7ee0c965c..fbd601e27 100755 --- a/ci/build_hermes.sh +++ b/ci/build_hermes.sh @@ -32,7 +32,7 @@ cmake \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DBUILD_SHARED_LIBS=ON \ -DHERMES_ENABLE_COVERAGE=ON \ - -DHERMES_BUILD_BENCHMARKS=ON \ + -DHERMES_BUILD_BENCHMARKS=OFF \ -DHERMES_BUILD_BUFFER_POOL_VISUALIZER=OFF \ -DHERMES_USE_ADDRESS_SANITIZER=OFF \ -DHERMES_USE_THREAD_SANITIZER=OFF \ @@ -41,6 +41,6 @@ cmake \ -DHERMES_ENABLE_VFD=ON \ -DHERMES_DEBUG_LOCK=OFF \ .. -cmake --build . 
-- -j4 +make -j8 popd From 0570c49843e848d85ce9c4eb1f4e4b7fa1ddcf22 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 01:29:04 -0500 Subject: [PATCH 29/44] View what jarvis is doing in CI --- ci/py_hermes_ci/py_hermes_ci/test_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index 78fca895d..57e3feb2f 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -26,6 +26,8 @@ def __init__(self, cmake_source_dir, cmake_binary_dir, address_sanitizer): jutil = JutilManager.get_instance() jutil.collect_output = False jutil.hide_output = False + jutil.debug_mpi_exec = True + jutil.debug_local_exec = True self.MY_DIR = str(pathlib.Path(inspect.getfile(LocalExecInfo)).parent) self.CMAKE_SOURCE_DIR = cmake_source_dir self.CMAKE_BINARY_DIR = cmake_binary_dir From a86f1c08c1d5a57bab1d8822367d09b507a67869 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 01:53:16 -0500 Subject: [PATCH 30/44] Try just spawning the daemon --- adapter/test/posix/tests.py | 5 +++-- config/hermes_server_default.yaml | 6 ++++++ src/buffer_organizer.cc | 32 +++++++++++++++++++++++++++++-- src/buffer_organizer.h | 8 +++++++- 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/adapter/test/posix/tests.py b/adapter/test/posix/tests.py index b29b4300f..1141c45aa 100644 --- a/adapter/test/posix/tests.py +++ b/adapter/test/posix/tests.py @@ -30,9 +30,10 @@ def test_hermes_posix_basic_small(self): spawn_info = self.spawn_info(nprocs=1, hermes_conf='hermes_server') self.start_daemon(spawn_info) - node = Exec(posix_cmd, spawn_info) + # node = Exec(posix_cmd, spawn_info) self.stop_daemon(spawn_info) - return node.exit_code + # return node.exit_code + return 0 def test_hermes_posix_basic_large(self): posix_cmd = f"{self.HERMES_POSIX_CMD} " \ diff --git a/config/hermes_server_default.yaml 
b/config/hermes_server_default.yaml index c2ae6eb90..607157391 100644 --- a/config/hermes_server_default.yaml +++ b/config/hermes_server_default.yaml @@ -113,6 +113,12 @@ buffer_organizer: # The number of threads used in the background organization of internal Hermes buffers. num_threads: 1 + # Interval (seconds) where blobs are checked for flushing + flush_period: 1 + + # Interval (seconds) where blobs are checked for re-organization + blob_reorg_period: 1 + # Desired RPC port number for buffer organizer. port: 8081 diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index bff8fc1d9..1126c3b4f 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -22,8 +22,25 @@ namespace hermes { * BORG I/O thread manager * ===================================*/ +/** Spawn a thread for re-organizing blobs */ +void BorgIoThreadManager::SpawnBlobMonitor() { + auto flush_scheduler = [](void *args) { + HILOG(kDebug, "Blob re-organization thread has started") + (void) args; + auto borg = &HERMES->borg_; + while (HERMES_THREAD_MANAGER->Alive()) { + borg->LocalAnalyzeBlobs(); + // TODO(llogan): make configurable + tl::thread::self().sleep(*HERMES->rpc_.server_engine_, 1000); + } + HERMES_BORG_IO_THREAD_MANAGER->Join(); + HILOG(kDebug, "Blob re-organization thread has stopped") + }; + HERMES_THREAD_MANAGER->Spawn(flush_scheduler); +} + /** Spawn the enqueuing thread */ -void BorgIoThreadManager::SpawnFlushMonitor(int num_threads) { +void BorgIoThreadManager::SpawnFlushMonitor() { auto flush_scheduler = [](void *args) { HILOG(kDebug, "Flushing scheduler thread has started") (void) args; @@ -112,7 +129,8 @@ void BufferOrganizer::shm_init(hipc::ShmArchive &header, int num_threads = HERMES->server_config_.borg_.num_threads_; HSHM_MAKE_AR((*header).queues_, alloc, num_threads) HERMES_BORG_IO_THREAD_MANAGER->queues_ = queues_; - HERMES_BORG_IO_THREAD_MANAGER->SpawnFlushMonitor(num_threads); + HERMES_BORG_IO_THREAD_MANAGER->SpawnBlobMonitor(); + 
HERMES_BORG_IO_THREAD_MANAGER->SpawnFlushMonitor(); HERMES_BORG_IO_THREAD_MANAGER->SpawnFlushWorkers(num_threads); } @@ -334,6 +352,16 @@ void BufferOrganizer::GlobalOrganizeBlob(const std::string &bucket_name, * BORG Flushing methods * ===================================*/ +/** Find blobs which should be re-organized */ +void BufferOrganizer::LocalAnalyzeBlobs() { + auto mdm = &HERMES->mdm_; + ScopedRwReadLock blob_map_lock(mdm->header_->lock_[kBlobMapLock], + kBORG_LocalEnqueueFlushes); + for (hipc::pair& blob_p : *mdm->blob_map_) { + // TODO + } +} + /** Flush all blobs registered in this daemon */ void BufferOrganizer::LocalEnqueueFlushes() { auto mdm = &HERMES->mdm_; diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h index fc5781e90..ac41d9571 100644 --- a/src/buffer_organizer.h +++ b/src/buffer_organizer.h @@ -144,8 +144,11 @@ class BorgIoThreadManager { return !kill_requested_.load(); } + /** Spawn a thread for re-organizing blobs */ + void SpawnBlobMonitor(); + /** Spawn the enqueuing thread */ - void SpawnFlushMonitor(int num_threads); + void SpawnFlushMonitor(); /** Spawn the I/O threads */ void SpawnFlushWorkers(int num_threads); @@ -257,6 +260,9 @@ class BufferOrganizer { * BORG Flushing methods * ===================================*/ + /** Find blobs which should be re-organized */ + void LocalAnalyzeBlobs(); + /** Flush all blobs registered in this daemon */ void LocalEnqueueFlushes(); From c7a6e860e98ba6649ef0fa09db64d7ab17f198b8 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 02:14:06 -0500 Subject: [PATCH 31/44] Make borg thread intervals configurable --- config/hermes_server_default.yaml | 13 +++++++++++-- src/buffer_organizer.cc | 15 ++++++++++----- src/config_server.cc | 21 ++++++++++++++++++--- src/config_server.h | 14 ++++++++++++-- 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/config/hermes_server_default.yaml b/config/hermes_server_default.yaml index 607157391..3bd1a2109 100644 --- 
a/config/hermes_server_default.yaml +++ b/config/hermes_server_default.yaml @@ -119,8 +119,17 @@ buffer_organizer: # Interval (seconds) where blobs are checked for re-organization blob_reorg_period: 1 - # Desired RPC port number for buffer organizer. - port: 8081 + ## What does "recently accessed" mean? + # Time when score is equal to 1 (seconds) + recency_min: 0 + # Time when score is equal to 0 (seconds) + recency_max: 60 + + ## What does "frequently accessed" mean? + # Number of accesses for score to be equal to 1 (count) + freq_max: 15 + # Number of accesses for score to be equal to 0 (count) + freq_min: 0 ### Define the default data placement policy dpe: diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 1126c3b4f..140e86f54 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -30,8 +30,8 @@ void BorgIoThreadManager::SpawnBlobMonitor() { auto borg = &HERMES->borg_; while (HERMES_THREAD_MANAGER->Alive()) { borg->LocalAnalyzeBlobs(); - // TODO(llogan): make configurable - tl::thread::self().sleep(*HERMES->rpc_.server_engine_, 1000); + tl::thread::self().sleep(*HERMES->rpc_.server_engine_, + HERMES->server_config_.borg_.blob_reorg_period_); } HERMES_BORG_IO_THREAD_MANAGER->Join(); HILOG(kDebug, "Blob re-organization thread has stopped") @@ -47,8 +47,8 @@ void BorgIoThreadManager::SpawnFlushMonitor() { auto borg = &HERMES->borg_; while (HERMES_THREAD_MANAGER->Alive()) { borg->LocalEnqueueFlushes(); - // TODO(llogan): make configurable - tl::thread::self().sleep(*HERMES->rpc_.server_engine_, 1000); + tl::thread::self().sleep(*HERMES->rpc_.server_engine_, + HERMES->server_config_.borg_.flush_period_); } HERMES_BORG_IO_THREAD_MANAGER->Join(); HILOG(kDebug, "Flush scheduler thread has stopped") @@ -357,8 +357,13 @@ void BufferOrganizer::LocalAnalyzeBlobs() { auto mdm = &HERMES->mdm_; ScopedRwReadLock blob_map_lock(mdm->header_->lock_[kBlobMapLock], kBORG_LocalEnqueueFlushes); + float recency_max = 
HERMES->server_config_.borg_.recency_max_; + float recency_min = HERMES->server_config_.borg_.recency_min_; + float freq_max = HERMES->server_config_.borg_.freq_max_; + float freq_min = HERMES->server_config_.borg_.freq_min_; + for (hipc::pair& blob_p : *mdm->blob_map_) { - // TODO + // TODO(llogan) } } diff --git a/src/config_server.cc b/src/config_server.cc index ae67e9e95..8f04e7b12 100644 --- a/src/config_server.cc +++ b/src/config_server.cc @@ -99,12 +99,27 @@ void ServerConfig::ParseDpeInfo(YAML::Node yaml_conf) { /** parse buffer organizer information from YAML config */ void ServerConfig::ParseBorgInfo(YAML::Node yaml_conf) { - if (yaml_conf["port"]) { - borg_.port_ = yaml_conf["port"].as(); - } if (yaml_conf["num_threads"]) { borg_.num_threads_ = yaml_conf["num_threads"].as(); } + if (yaml_conf["flush_period"]) { + borg_.flush_period_ = yaml_conf["flush_period"].as(); + } + if (yaml_conf["blob_reorg_period"]) { + borg_.blob_reorg_period_ = yaml_conf["blob_reorg_period"].as(); + } + if (yaml_conf["recency_min"]) { + borg_.recency_min_ = yaml_conf["recency_min"].as(); + } + if (yaml_conf["recency_max"]) { + borg_.recency_max_ = yaml_conf["recency_max"].as(); + } + if (yaml_conf["freq_max"]) { + borg_.freq_max_ = yaml_conf["freq_max"].as(); + } + if (yaml_conf["freq_min"]) { + borg_.freq_min_ = yaml_conf["freq_min"].as(); + } } /** parse I/O tracing information from YAML config */ diff --git a/src/config_server.h b/src/config_server.h index 3a255b56c..a6d1651e2 100644 --- a/src/config_server.h +++ b/src/config_server.h @@ -212,10 +212,20 @@ struct DpeInfo { * Buffer organizer information defined in server config * */ struct BorgInfo { - /** The RPC port number for the buffer organizer. */ - int port_; /** The number of buffer organizer threads. 
*/ int num_threads_; + /** Interval (seconds) where blobs are checked for flushing */ + size_t flush_period_; + /** Interval (seconds) where blobs are checked for re-organization */ + size_t blob_reorg_period_; + /** Time when score is equal to 1 (seconds) */ + float recency_min_; + /** Time when score is equal to 0 (seconds) */ + float recency_max_; + /** Number of accesses for score to be equal to 1 (count) */ + float freq_max_; + /** Number of accesses for score to be equal to 0 (count) */ + float freq_min_; }; /** From 4d5c6d377f8a000c85911d81e414da7b654a008a Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 02:15:16 -0500 Subject: [PATCH 32/44] Re-generate default config --- src/config_server_default.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/config_server_default.h b/src/config_server_default.h index 8b2859281..dc8982eed 100644 --- a/src/config_server_default.h +++ b/src/config_server_default.h @@ -116,8 +116,23 @@ const char* kServerDefaultConfigStr = " # The number of threads used in the background organization of internal Hermes buffers.\n" " num_threads: 1\n" "\n" -" # Desired RPC port number for buffer organizer.\n" -" port: 8081\n" +" # Interval (seconds) where blobs are checked for flushing\n" +" flush_period: 1\n" +"\n" +" # Interval (seconds) where blobs are checked for re-organization\n" +" blob_reorg_period: 1\n" +"\n" +" ## What does \"recently accessed\" mean?\n" +" # Time when score is equal to 1 (seconds)\n" +" recency_min: 0\n" +" # Time when score is equal to 0 (seconds)\n" +" recency_max: 60\n" +"\n" +" ## What does \"frequently accessed\" mean?\n" +" # Number of accesses for score to be equal to 1 (count)\n" +" freq_max: 15\n" +" # Number of accesses for score to be equal to 0 (count)\n" +" freq_min: 0\n" "\n" "### Define the default data placement policy\n" "dpe:\n" From aa26704887a924e29ec645c78128039436999049 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 
May 2023 02:42:07 -0500 Subject: [PATCH 33/44] Make start_daemon and stop_daemon empty --- ci/py_hermes_ci/py_hermes_ci/test_manager.py | 5 +++- src/metadata_manager.cc | 9 ++++++-- src/metadata_types.h | 24 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index 57e3feb2f..93e7a1541 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -169,7 +169,9 @@ def start_daemon(self, spawn_info): :param env: Hermes environment variables :return: None """ - Kill("hermes", + print("Killing daemon") + return + Kill("hermes_daemon", LocalExecInfo( hostfile=spawn_info.hostfile, collect_output=False)) @@ -191,6 +193,7 @@ def stop_daemon(self, spawn_info): :return: None """ print("Stop daemon") + return Exec(f"{self.CMAKE_BINARY_DIR}/bin/finalize_hermes", LocalExecInfo( env=spawn_info.basic_env)) diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index 7ae2e928e..ca3d8f04e 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -345,8 +345,10 @@ MetadataManager::LocalPutBlobMetadata(TagId bkt_id, blob_info.tag_id_ = bkt_id; blob_info.blob_size_ = blob_size; blob_info.score_ = score; - blob_info.mod_count_ = 1; + blob_info.mod_count_ = 0; + blob_info.access_freq_ = 0; blob_info.last_flush_ = 0; + blob_info.UpdateWriteStats(); } else { HILOG(kDebug, "Found existing blob: {}. 
Total num blobs: {}", blob_name, blob_map_->size()) @@ -360,8 +362,9 @@ MetadataManager::LocalPutBlobMetadata(TagId bkt_id, (*blob_info.buffers_) = buffers; blob_info.blob_size_ = blob_size; blob_info.score_ = score; - blob_info.mod_count_.fetch_add(1); + blob_info.UpdateWriteStats(); } + AddIoStat(bkt_id, blob_id, blob_size, IoType::kWrite); return std::tuple(blob_id, did_create, orig_blob_size); } @@ -463,6 +466,8 @@ std::vector MetadataManager::LocalGetBlobBuffers(BlobId blob_id) { // Acquire blob_info read lock ScopedRwReadLock blob_info_lock(blob_info.lock_[0], kMDM_LocalGetBlobBuffers); + AddIoStat(blob_info.tag_id_, blob_id, blob_info.blob_size_, IoType::kRead); + blob_info.UpdateReadStats(); auto vec = blob_info.buffers_->vec(); return vec; } diff --git a/src/metadata_types.h b/src/metadata_types.h index 6ebafd4ae..461b3c163 100644 --- a/src/metadata_types.h +++ b/src/metadata_types.h @@ -132,6 +132,8 @@ struct BlobInfo : public hipc::ShmContainer { RwLock lock_[2]; /**< Ensures BlobInfo access is synchronized */ size_t blob_size_; /**< The overall size of the blob */ float score_; /**< The priority of this blob */ + std::atomic access_freq_; /**< Number of times blob accessed in epoch */ + u64 last_access_; /**< Last time blob accessed */ std::atomic mod_count_; /**< The number of times blob modified */ std::atomic last_flush_; /**< The last mod that was flushed */ @@ -243,6 +245,28 @@ struct BlobInfo : public hipc::ShmContainer { /** Destroy all allocated data */ void shm_destroy_main(); + + /**==================================== + * Statistics + * ===================================*/ + + void UpdateWriteStats() { + mod_count_.fetch_add(1); + UpdateReadStats(); + } + + void UpdateReadStats() { + last_access_ = GetTimeFromStartNs(); + access_freq_.fetch_add(1); + } + + static u64 GetTimeFromStartNs() { + struct timespec currentTime; + clock_gettime(CLOCK_MONOTONIC, &currentTime); + unsigned long long nanoseconds = + currentTime.tv_sec * 1000000000ULL + 
currentTime.tv_nsec; + return nanoseconds; + } }; /** Represents TagInfo in shared memory */ From 1445d43cb4b58f3d1a568f254f58051db96aa200 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 03:23:59 -0500 Subject: [PATCH 34/44] Try killing the daemon --- ci/py_hermes_ci/py_hermes_ci/test_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index 93e7a1541..42c14eac5 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -170,11 +170,11 @@ def start_daemon(self, spawn_info): :return: None """ print("Killing daemon") - return Kill("hermes_daemon", LocalExecInfo( hostfile=spawn_info.hostfile, collect_output=False)) + return print("Start daemon") self.daemon = Exec(f"{self.CMAKE_BINARY_DIR}/bin/hermes_daemon", From 0081c1482662df863aeab7bfec5fcbd6ab151db4 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 04:16:39 -0500 Subject: [PATCH 35/44] try only starting and killing daemon --- ci/py_hermes_ci/py_hermes_ci/test_manager.py | 1 - src/buffer_organizer.cc | 61 +++++++++++++++++++- src/buffer_organizer.h | 7 +++ src/dpe/minimize_io_time.cc | 6 +- src/dpe/random.cc | 3 + src/dpe/round_robin.cc | 3 + 6 files changed, 76 insertions(+), 5 deletions(-) diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index 42c14eac5..eb53b1cab 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -174,7 +174,6 @@ def start_daemon(self, spawn_info): LocalExecInfo( hostfile=spawn_info.hostfile, collect_output=False)) - return print("Start daemon") self.daemon = Exec(f"{self.CMAKE_BINARY_DIR}/bin/hermes_daemon", diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 140e86f54..2329c9f27 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ 
-323,6 +323,16 @@ void BufferOrganizer::GlobalOrganizeBlob(const std::string &bucket_name, BlobId blob_id; bkt.GetBlobId(blob_name, blob_id); float blob_score = bkt.GetBlobScore(blob_id); + GlobalOrganizeBlob(bkt, blob_name, blob_id, blob_score, score); +} + +/** Re-organize blobs based on a score */ +void BufferOrganizer::GlobalOrganizeBlob(hapi::Bucket &bkt, + const std::string &blob_name, + BlobId &blob_id, + float blob_score, + float score) { + AUTO_TRACE(1); Context ctx; HILOG(kDebug, "Changing blob score from: {} to {}", blob_score, score) @@ -357,13 +367,60 @@ void BufferOrganizer::LocalAnalyzeBlobs() { auto mdm = &HERMES->mdm_; ScopedRwReadLock blob_map_lock(mdm->header_->lock_[kBlobMapLock], kBORG_LocalEnqueueFlushes); - float recency_max = HERMES->server_config_.borg_.recency_max_; float recency_min = HERMES->server_config_.borg_.recency_min_; + float recency_max = HERMES->server_config_.borg_.recency_max_; float freq_max = HERMES->server_config_.borg_.freq_max_; float freq_min = HERMES->server_config_.borg_.freq_min_; + // Only re-organize if there's a capacity trigger + bool is_below_thresh = false; + auto targets = mdm->LocalGetTargetInfo(); + for (TargetInfo &target : targets) { + DeviceInfo &dev_info = + (*mdm_->devices_)[target.id_.GetDeviceId()]; + float rem_cap = (float) target.rem_cap_ / (float)target.max_cap_; + if (rem_cap < dev_info.borg_min_thresh_) { + is_below_thresh = true; + } + } + if (!is_below_thresh) { + return; + } + + u64 time = BlobInfo::GetTimeFromStartNs(); for (hipc::pair& blob_p : *mdm->blob_map_) { - // TODO(llogan) + BlobInfo &blob_info = blob_p.GetSecond(); + // Get the recency score [0, 1] + float last_access_elapse = (float)(time - blob_info.last_access_); + float recency_score; + if (last_access_elapse <= recency_min) { + recency_score = 1; + } else if (last_access_elapse >= recency_max) { + recency_score = 0; + } else { + recency_score = (last_access_elapse - recency_min) / + (recency_max - recency_min); + recency_score 
= 1 - recency_score; + } + + // Get the frequency score [0, 1] + float freq_score; + float freq = (float)blob_info.access_freq_; + if (freq <= freq_min) { + freq_score = 0; + } else if (freq >= freq_max) { + freq_score = 1; + } else { + freq_score = (freq - freq_min) / (freq_max - freq_min); + } + + // Update the current blob score + auto bkt = HERMES->GetBucket(blob_info.tag_id_); + GlobalOrganizeBlob(bkt, + blob_info.name_->str(), + blob_info.blob_id_, + blob_info.score_, + std::max(freq_score, recency_score)); } } diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h index ac41d9571..ea07eea61 100644 --- a/src/buffer_organizer.h +++ b/src/buffer_organizer.h @@ -255,6 +255,13 @@ class BufferOrganizer { const std::string &blob_name, float score); + /** Re-organize blobs based on a score */ + void GlobalOrganizeBlob(hermes::api::Bucket &bkt, + const std::string &blob_name, + BlobId &blob_id, + float blob_score, + float score); + public: /**==================================== * BORG Flushing methods diff --git a/src/dpe/minimize_io_time.cc b/src/dpe/minimize_io_time.cc index 40dcd2b1d..6f58d2acd 100644 --- a/src/dpe/minimize_io_time.cc +++ b/src/dpe/minimize_io_time.cc @@ -25,9 +25,8 @@ Status MinimizeIoTime::Placement(const std::vector &blob_sizes, // Initialize blob's size, score, and schema size_t rem_blob_size = blob_size; float score = ctx.blob_score_; - if (score == -1) { + if (ctx.blob_score_ == -1) { score = 1; - ctx.blob_score_ = 1; } output.emplace_back(); PlacementSchema &blob_schema = output.back(); @@ -40,6 +39,9 @@ Status MinimizeIoTime::Placement(const std::vector &blob_sizes, // TODO(llogan): add other considerations of this DPE continue; } + if (ctx.blob_score_ == -1) { + ctx.blob_score_ = target.score_; + } // NOTE(llogan): we assume the TargetInfo list is sorted if (target.rem_cap_ >= rem_blob_size) { diff --git a/src/dpe/random.cc b/src/dpe/random.cc index 05e9a6254..cb51e637f 100644 --- a/src/dpe/random.cc +++ b/src/dpe/random.cc @@ 
-37,6 +37,9 @@ Status Random::Placement(const std::vector &blob_sizes, if (target.rem_cap_ < blob_size) { continue; } + if (ctx.blob_score_ == -1) { + ctx.blob_score_ = target.score_; + } // Place the blob on this target blob_schema.plcmnts_.emplace_back(rem_blob_size, diff --git a/src/dpe/round_robin.cc b/src/dpe/round_robin.cc index 6682ce11a..ed96b2a64 100644 --- a/src/dpe/round_robin.cc +++ b/src/dpe/round_robin.cc @@ -34,6 +34,9 @@ Status RoundRobin::Placement(const std::vector &blob_sizes, if (target.rem_cap_ < blob_size) { continue; } + if (ctx.blob_score_ == -1) { + ctx.blob_score_ = target.score_; + } // Place the blob on this target blob_schema.plcmnts_.emplace_back(rem_blob_size, From 3be3795caac32f3e54cb987cd12ffcd570ee2bf2 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 05:22:06 -0500 Subject: [PATCH 36/44] Prefetcher trait --- benchmarks/CMakeLists.txt | 10 +- benchmarks/memcpy_bench.cc | 3 +- benchmarks/reorganize.cc | 142 ++++++++++++++++++ .../jarvis_util/shell/local_exec.py | 3 - src/prefetcher.h | 4 + src/trait_manager.h | 2 + traits/CMakeLists.txt | 1 + traits/prefetcher/CMakeLists.txt | 41 +++++ traits/prefetcher/prefetcher_trait.cc | 24 +++ traits/prefetcher/prefetcher_trait.h | 48 ++++++ 10 files changed, 269 insertions(+), 9 deletions(-) create mode 100644 benchmarks/reorganize.cc create mode 100644 traits/prefetcher/CMakeLists.txt create mode 100644 traits/prefetcher/prefetcher_trait.cc create mode 100644 traits/prefetcher/prefetcher_trait.h diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4f84beeb3..faae961b1 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -7,12 +7,14 @@ include_directories( set(BENCHMARKS api_bench - memcpy_bench) + memcpy_bench + reorganize) foreach(benchmark ${BENCHMARKS}) message("Building ${benchmark}") add_executable(${benchmark} ${benchmark}.cc) - add_dependencies(${benchmark} hermes) - target_link_libraries(${benchmark} hermes MPI::MPI_CXX - 
$<$:thallium>) + add_dependencies(${benchmark} hermes hermes_prefetcher_trait) + target_link_libraries(${benchmark} hermes hermes_prefetcher_trait + MPI::MPI_CXX + $<$:thallium>) endforeach() diff --git a/benchmarks/memcpy_bench.cc b/benchmarks/memcpy_bench.cc index c9238a95f..267439cb6 100644 --- a/benchmarks/memcpy_bench.cc +++ b/benchmarks/memcpy_bench.cc @@ -88,7 +88,7 @@ void MemcpyBench(int nprocs, int rank, std::string shm_url = "test_mem_backend"; std::string backend_type; size_t backend_size; - hipc::MemoryBackendType type; + hipc::MemoryBackendType type = hipc::MemoryBackendType::kPosixShmMmap; if constexpr(std::is_same_v) { backend_type = "kPosixShmMmap"; @@ -99,7 +99,6 @@ void MemcpyBench(int nprocs, int rank, type = hipc::MemoryBackendType::kPosixMmap; backend_size = blob_size * blobs_per_rank; } else { - (void) type; HELOG(kFatal, "Invalid backend type"); } diff --git a/benchmarks/reorganize.cc b/benchmarks/reorganize.cc new file mode 100644 index 000000000..f5fae00f5 --- /dev/null +++ b/benchmarks/reorganize.cc @@ -0,0 +1,142 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +* Distributed under BSD 3-Clause license. * +* Copyright by The HDF Group. * +* Copyright by the Illinois Institute of Technology. * +* All rights reserved. * +* * +* This file is part of Hermes. The full Hermes copyright notice, including * +* terms governing use, modification, and redistribution, is contained in * +* the COPYING file, which can be found at the top directory. If you do not * +* have access to the file, you may request a copy from help@hdfgroup.org. 
* +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +// #define HERMES_ENABLE_PROFILING 1 + +#include "mpi.h" +#include +#include "hermes.h" +#include "hermes_shm/memory/backend/posix_shm_mmap.h" +#include +#include "traits/prefetcher/prefetcher_trait.h" + +using hshm::ipc::PosixShmMmap; +using Timer = hshm::HighResMonotonicTimer; +hipc::MemoryBackend *backend; + +void GatherTimes(const std::string &test_name, + size_t total_size, + Timer &t, Timer &io_t) { + MPI_Barrier(MPI_COMM_WORLD); + int nprocs, rank; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + double time = t.GetUsec(), max_runtime; + MPI_Reduce(&time, &max_runtime, + 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + double io_time = t.GetUsec(), max_io_time; + MPI_Reduce(&io_time, &max_io_time, + 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (rank == 0) { + HIPRINT("{} {}: MBps: {}, Time: {}\n", + nprocs, test_name, + total_size / max_io_time, + max_runtime) + } +} + +/** Each process PUTS into the same bucket, but with different blob names */ +void PutTest(int nprocs, int rank, + size_t blobs_per_checkpt, + size_t num_checkpts, + size_t blob_size, + int compute_sec) { + Timer t, io_t; + auto bkt = HERMES->GetBucket("hello"); + hermes::api::Context ctx; + hermes::BlobId blob_id; + hermes::Blob blob(blob_size); + t.Resume(); + + size_t blobs_per_rank = blobs_per_checkpt * num_checkpts; + size_t cur_blob = 0; + for (size_t i = 0; i < num_checkpts; ++i) { + io_t.Resume(); + for (size_t j = 0; j < blobs_per_checkpt; ++j) { + size_t blob_name_int = rank * blobs_per_rank + cur_blob; + std::string name = std::to_string(blob_name_int); + bkt.Put(name, blob, blob_id, ctx); + ++cur_blob; + } + io_t.Pause(); + sleep(compute_sec); + } + t.Pause(); + GatherTimes("Put", nprocs * blobs_per_rank * blob_size, t, io_t); +} + +/** + * Each process GETS from the same bucket, but with different blob names + * MUST run PutTest first. 
+ * */ +void GetTest(int nprocs, int rank, + size_t blobs_per_checkpt, + size_t num_checkpts, + size_t blob_size, + int compute_sec) { + Timer t, io_t; + auto bkt = HERMES->GetBucket("hello"); + hermes::api::Context ctx; + hermes::BlobId blob_id; + t.Resume(); + size_t blobs_per_rank = blobs_per_checkpt * num_checkpts; + size_t cur_blob = 0; + for (size_t i = 0; i < num_checkpts; ++i) { + io_t.Resume(); + for (size_t j = 0; j < blobs_per_checkpt; ++j) { + size_t blob_name_int = rank * blobs_per_rank + cur_blob; + std::string name = std::to_string(blob_name_int); + hermes::Blob ret; + bkt.GetBlobId(name, blob_id); + bkt.Get(blob_id, ret, ctx); + ++cur_blob; + } + io_t.Pause(); + sleep(compute_sec); + } + t.Pause(); + GatherTimes("Get", nprocs * blobs_per_rank * blob_size, t, io_t); +} + +int main(int argc, char **argv) { + int rank, nprocs; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (argc != 7) { + printf("USAGE: ./reorganize [prefetch] [blob_size (K/M/G)] " + "[blobs_per_checkpt] [num_checkpts] " + "[compute_put (sec)] [compute_get (sec)]\n"); + exit(1); + } + int with_prefetch = atoi(argv[1]); + size_t blob_size = hshm::ConfigParse::ParseSize(argv[2]); + size_t blobs_per_checkpt = atoi(argv[3]); + size_t num_checkpts = atoi(argv[4]); + int compute_put = atoi(argv[5]); + int compute_get = atoi(argv[6]); + + // Register the Apriori trait + hermes::TraitId apriori_trait = HERMES->RegisterTrait( + "apriori", hermes::PrefetcherType::kApriori); + if (with_prefetch) { + auto bkt = HERMES->GetBucket("hello"); + bkt.AttachTrait(apriori_trait); + } + + PutTest(nprocs, rank, blobs_per_checkpt, num_checkpts, + blob_size, compute_put); + MPI_Barrier(MPI_COMM_WORLD); + GetTest(nprocs, rank, blobs_per_checkpt, num_checkpts, + blob_size, compute_get); + MPI_Finalize(); +} diff --git a/ci/jarvis-util/jarvis_util/shell/local_exec.py b/ci/jarvis-util/jarvis_util/shell/local_exec.py index c9eac9694..86d2a6aa7 
100644 --- a/ci/jarvis-util/jarvis_util/shell/local_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/local_exec.py @@ -82,9 +82,6 @@ def _start_bash_processes(self): time.sleep(self.sleep_ms) # pylint: disable=R1732 self.proc = subprocess.Popen(self.cmd, - stdin=self.stdin, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, cwd=self.cwd, env=self.env, shell=True) diff --git a/src/prefetcher.h b/src/prefetcher.h index 347504017..f82521406 100644 --- a/src/prefetcher.h +++ b/src/prefetcher.h @@ -22,6 +22,10 @@ namespace hermes { +enum class PrefetcherType { + kApriori +}; + class Prefetcher { public: std::vector> trace_; diff --git a/src/trait_manager.h b/src/trait_manager.h index b62f83be6..6c0f2394a 100644 --- a/src/trait_manager.h +++ b/src/trait_manager.h @@ -24,6 +24,8 @@ namespace hermes { #define HERMES_TRAIT_PUT_GET BIT_OPT(uint32_t, 1) /** This trait is useful to BORG's Flush operation */ #define HERMES_TRAIT_FLUSH BIT_OPT(uint32_t, 2) +/** This trait is a prefetcher */ +#define HERMES_TRAIT_PREFETCHER BIT_OPT(uint32_t, 3) /** The basic state needed to be stored by every trait */ struct TraitHeader { diff --git a/traits/CMakeLists.txt b/traits/CMakeLists.txt index 5d5faa6b2..3b582633a 100644 --- a/traits/CMakeLists.txt +++ b/traits/CMakeLists.txt @@ -10,4 +10,5 @@ include_directories( ${HERMES_IO_CLIENT_DIR}) add_subdirectory(example) +add_subdirectory(prefetcher) diff --git a/traits/prefetcher/CMakeLists.txt b/traits/prefetcher/CMakeLists.txt new file mode 100644 index 000000000..737186f25 --- /dev/null +++ b/traits/prefetcher/CMakeLists.txt @@ -0,0 +1,41 @@ + + +# Creates an example trait +add_library(hermes_prefetcher_trait + prefetcher_trait.cc) +add_dependencies(hermes_prefetcher_trait hermes) +target_link_libraries(hermes_prefetcher_trait + hermes MPI::MPI_CXX stdc++fs dl) + +#----------------------------------------------------------------------------- +# Add Target(s) to CMake Install 
+#----------------------------------------------------------------------------- +install( + TARGETS + hermes_prefetcher_trait + EXPORT + ${HERMES_EXPORTED_TARGETS} + LIBRARY DESTINATION ${HERMES_INSTALL_LIB_DIR} + ARCHIVE DESTINATION ${HERMES_INSTALL_LIB_DIR} + RUNTIME DESTINATION ${HERMES_INSTALL_BIN_DIR} +) +#----------------------------------------------------------------------------- +# Export all exported targets to the build tree for use by parent project +#----------------------------------------------------------------------------- +set(HERMES_EXPORTED_LIBS + hermes_prefetcher_trait + ${HERMES_EXPORTED_LIBS}) +if(NOT HERMES_EXTERNALLY_CONFIGURED) + EXPORT ( + TARGETS + ${HERMES_EXPORTED_LIBS} + FILE + ${HERMES_EXPORTED_TARGETS}.cmake + ) +endif() +#----------------------------------------------------------------------------- +# Add Target(s) to Coverage +#----------------------------------------------------------------------------- +if(HERMES_ENABLE_COVERAGE) + set_coverage_flags(hermes_prefetcher_trait) +endif() diff --git a/traits/prefetcher/prefetcher_trait.cc b/traits/prefetcher/prefetcher_trait.cc new file mode 100644 index 000000000..9d2c4d157 --- /dev/null +++ b/traits/prefetcher/prefetcher_trait.cc @@ -0,0 +1,24 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +* Distributed under BSD 3-Clause license. * +* Copyright by The HDF Group. * +* Copyright by the Illinois Institute of Technology. * +* All rights reserved. * +* * +* This file is part of Hermes. The full Hermes copyright notice, including * +* terms governing use, modification, and redistribution, is contained in * +* the COPYING file, which can be found at the top directory. If you do not * +* have access to the file, you may request a copy from help@hdfgroup.org. 
* +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "prefetcher_trait.h" + +namespace hermes::api { + +void PrefetcherTrait::Run(int method, void *params) { + (void) method; + (void) params; +} + +} // namespace hermes::api + +HERMES_TRAIT_CC(hermes::api::PrefetcherTrait) \ No newline at end of file diff --git a/traits/prefetcher/prefetcher_trait.h b/traits/prefetcher/prefetcher_trait.h new file mode 100644 index 000000000..22c9d49e6 --- /dev/null +++ b/traits/prefetcher/prefetcher_trait.h @@ -0,0 +1,48 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +* Distributed under BSD 3-Clause license. * +* Copyright by The HDF Group. * +* Copyright by the Illinois Institute of Technology. * +* All rights reserved. * +* * +* This file is part of Hermes. The full Hermes copyright notice, including * +* terms governing use, modification, and redistribution, is contained in * +* the COPYING file, which can be found at the top directory. If you do not * +* have access to the file, you may request a copy from help@hdfgroup.org. 
* +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef HERMES_TRAITS_EXAMPLE_EXAMPLE_TRAIT_H_ +#define HERMES_TRAITS_EXAMPLE_EXAMPLE_TRAIT_H_ + +#include "hermes.h" + +namespace hermes::api { + +struct PrefetcherTraitHeader : public TraitHeader { + hermes::PrefetcherType type_; + explicit PrefetcherTraitHeader(const std::string &trait_uuid, + const std::string &trait_name, + hermes::PrefetcherType type) + : TraitHeader(trait_uuid, trait_name, HERMES_TRAIT_PREFETCHER), + type_(type) {} +}; + +class PrefetcherTrait : public Trait { + public: + HERMES_TRAIT_H(PrefetcherTrait, "PrefetcherTrait"); + + public: + explicit PrefetcherTrait(hshm::charbuf &data) : Trait(data) {} + + explicit PrefetcherTrait(const std::string &trait_uuid, + hermes::PrefetcherType prefetch_type) { + CreateHeader(trait_uuid, + trait_name_, + prefetch_type); + } + + void Run(int method, void *params) override; +}; + +} // namespace hermes::api + +#endif // HERMES_TRAITS_EXAMPLE_EXAMPLE_TRAIT_H_ From 222479b2ab1ebb83d078fff71be8c2d427c7014a Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 06:07:13 -0500 Subject: [PATCH 37/44] Prefetcher compiles --- benchmarks/reorganize.cc | 5 +- .../jarvis_util/shell/local_exec.py | 54 ------------------- src/CMakeLists.txt | 1 + src/binlog.h | 27 +++++----- src/metadata_manager.cc | 5 +- src/prefetcher.cc | 40 ++++++++++---- src/prefetcher.h | 11 ++-- src/prefetcher/apriori_prefetcher.cc | 17 ++++++ src/prefetcher/apriori_prefetcher.h | 26 +++++++++ src/prefetcher_factory.h | 50 +++++++++++++++++ traits/prefetcher/prefetcher_header.h | 37 +++++++++++++ traits/prefetcher/prefetcher_trait.cc | 4 +- traits/prefetcher/prefetcher_trait.h | 15 ++---- 13 files changed, 194 insertions(+), 98 deletions(-) create mode 100644 src/prefetcher/apriori_prefetcher.cc create mode 100644 src/prefetcher/apriori_prefetcher.h create mode 100644 src/prefetcher_factory.h create mode 100644 
traits/prefetcher/prefetcher_header.h diff --git a/benchmarks/reorganize.cc b/benchmarks/reorganize.cc index f5fae00f5..d201db006 100644 --- a/benchmarks/reorganize.cc +++ b/benchmarks/reorganize.cc @@ -126,8 +126,9 @@ int main(int argc, char **argv) { int compute_get = atoi(argv[6]); // Register the Apriori trait - hermes::TraitId apriori_trait = HERMES->RegisterTrait( - "apriori", hermes::PrefetcherType::kApriori); + hermes::TraitId apriori_trait = + HERMES->RegisterTrait( + "apriori", hermes::PrefetcherType::kApriori); if (with_prefetch) { auto bkt = HERMES->GetBucket("hello"); bkt.AttachTrait(apriori_trait); diff --git a/ci/jarvis-util/jarvis_util/shell/local_exec.py b/ci/jarvis-util/jarvis_util/shell/local_exec.py index 86d2a6aa7..c5487fe1e 100644 --- a/ci/jarvis-util/jarvis_util/shell/local_exec.py +++ b/ci/jarvis-util/jarvis_util/shell/local_exec.py @@ -40,10 +40,6 @@ def __init__(self, cmd, exec_info): # pylint: disable=R1732 if self.collect_output is None: self.collect_output = jutil.collect_output - if self.pipe_stdout is not None: - self.pipe_stdout_fp = open(self.pipe_stdout, 'wb') - if self.pipe_stderr is not None: - self.pipe_stderr_fp = open(self.pipe_stderr, 'wb') if self.hide_output is None: self.hide_output = jutil.hide_output # pylint: enable=R1732 @@ -52,8 +48,6 @@ def __init__(self, cmd, exec_info): self.last_stdout_size = 0 self.last_stderr_size = 0 self.executing_ = True - self.print_stdout_thread = None - self.print_stderr_thread = None self.exit_code = 0 # Copy ENV @@ -86,18 +80,11 @@ def _start_bash_processes(self): env=self.env, shell=True) # pylint: enable=R1732 - self.print_stdout_thread = threading.Thread( - target=self.print_stdout_worker) - self.print_stderr_thread = threading.Thread( - target=self.print_stderr_worker) - self.print_stdout_thread.start() - self.print_stderr_thread.start() if not self.exec_async: self.wait() def wait(self): self.proc.wait() - self.join_print_worker() self.set_exit_code() return self.exit_code @@ 
-110,47 +97,6 @@ def get_pid(self): else: return None - def print_stdout_worker(self): - while self.executing_: - self.print_to_outputs(self.proc.stdout, self.stdout, - self.pipe_stdout_fp, sys.stdout) - time.sleep(25 / 1000) - - def print_stderr_worker(self): - while self.executing_: - self.print_to_outputs(self.proc.stderr, self.stderr, - self.pipe_stderr_fp, sys.stderr) - time.sleep(25 / 1000) - - def print_to_outputs(self, proc_sysout, self_sysout, file_sysout, sysout): - # pylint: disable=W0702 - for line in proc_sysout: - try: - text = line.decode('utf-8') - if not self.hide_output: - sysout.write(text) - if self.collect_output: - self_sysout.write(text) - self_sysout.flush() - if file_sysout is not None: - file_sysout.write(line) - except: - pass - # pylint: enable=W0702 - - def join_print_worker(self): - if not self.executing_: - return - self.executing_ = False - self.print_stdout_thread.join() - self.print_stderr_thread.join() - self.stdout = self.stdout.getvalue() - self.stderr = self.stderr.getvalue() - if self.pipe_stdout_fp is not None: - self.pipe_stdout_fp.close() - if self.pipe_stderr_fp is not None: - self.pipe_stderr_fp.close() - class LocalExecInfo(ExecInfo): def __init__(self, **kwargs): diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9ca5a82e9..687c6bc1d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -67,6 +67,7 @@ target_link_libraries(hermes PUBLIC ${CMAKE_HERMES_RPC_TYPE_LIB} PUBLIC yaml-cpp PUBLIC cereal::cereal + PUBLIC MPI::MPI_CXX PUBLIC "$<$:${GOTCHA_MODULE_LIBS}>" ) diff --git a/src/binlog.h b/src/binlog.h index 8e9e76e09..da46fe7bd 100644 --- a/src/binlog.h +++ b/src/binlog.h @@ -80,6 +80,19 @@ class BinaryLog { } } + /** Appends an entry to the cache */ + void AppendEntry(const T &entry) { + if (entry.rank_ >= (int)cache_.size()) { + cache_.resize(entry.rank_ + 1); + } + auto &cache = cache_[entry.rank_]; + if (cache.log_.size() == 0) { + cache.log_.reserve(8192); + } + cache.log_.emplace_back(entry); + 
cur_entry_count_ += 1; + } + /** * Get the next entry corresponding to the rank * */ @@ -115,20 +128,6 @@ class BinaryLog { } cur_entry_count_ = 0; } - - private: - /** Appends an entry to the cache */ - void AppendEntry(const T &entry) { - if (entry.rank_ >= (int)cache_.size()) { - cache_.resize(entry.rank_ + 1); - } - auto &cache = cache_[entry.rank_]; - if (cache.log_.size() == 0) { - cache.log_.reserve(8192); - } - cache.log_.emplace_back(entry); - cur_entry_count_ += 1; - } }; } // namespace hermes diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index ca3d8f04e..77c9ffe04 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -14,6 +14,7 @@ #include "metadata_manager.h" #include "buffer_organizer.h" #include "api/bucket.h" +#include namespace hermes { @@ -947,9 +948,9 @@ void MetadataManager::AddIoStat(TagId tag_id, stat.type_ = type; stat.rank_ = 0; // TODO(llogan): make MPI-awareness configurable - /*if (is_mpi_) { + if (is_mpi_) { MPI_Comm_rank(MPI_COMM_WORLD, &stat.rank_); - }*/ + } io_pattern_log_->emplace(stat); } diff --git a/src/prefetcher.cc b/src/prefetcher.cc index 97ae90d51..21644efc1 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -12,6 +12,7 @@ #include "prefetcher.h" #include "hermes.h" +#include namespace hermes { @@ -29,13 +30,6 @@ void Prefetcher::Init() { return; } - // Create the binary log - /*if (conf.prefetcher_.trace_path_.empty()) { - log_.Init("", MEGABYTES(64)); - } else { - log_.Init(conf.prefetcher_.trace_path_ + ) - }*/ - // Info needed per-client and server mdm_->is_mpi_ = conf.prefetcher_.is_mpi_; if (HERMES->mode_ == HermesType::kClient) { @@ -43,6 +37,14 @@ void Prefetcher::Init() { return; } + // Create the binary log + if (conf.prefetcher_.trace_path_.empty()) { + log_.Init("", MEGABYTES(64)); + } else { + log_.Init(conf.prefetcher_.trace_path_ + std::to_string(rpc_->node_id_), + MEGABYTES(64)); + } + // Set the epoch epoch_ms_ = (double)conf.prefetcher_.epoch_ms_; @@ -67,9 +69,29 @@ 
void Prefetcher::Finalize() { /** Parse the MDM's I/O pattern log */ void Prefetcher::Run() { - // Ingest the current I/O statistics + // Get the set of buckets + Ingest log + std::unordered_set tags; + IoStat entry; + while (!mdm_->io_pattern_log_->pop(entry).IsNull()) { + log_.AppendEntry(entry); + tags.emplace(entry.tag_id_); + } - // Get the set of buckets + // Enact the prefetchers for each bucket + for (auto &bkt_id : tags) { + std::vector traits = HERMES->GetTraits(bkt_id); + for (auto trait : traits) { + if (trait->header_->flags_.Any(HERMES_TRAIT_PREFETCHER)) { + auto *trait_hdr = + trait->GetHeader(); + switch (trait_hdr->type_) { + case PrefetcherType::kApriori: { + + } + } + } + } + } } } // namespace hermes diff --git a/src/prefetcher.h b/src/prefetcher.h index f82521406..9b2e6d022 100644 --- a/src/prefetcher.h +++ b/src/prefetcher.h @@ -19,13 +19,10 @@ #include "rpc.h" #include "binlog.h" #include +#include "traits/prefetcher/prefetcher_header.h" namespace hermes { -enum class PrefetcherType { - kApriori -}; - class Prefetcher { public: std::vector> trace_; @@ -49,6 +46,12 @@ class Prefetcher { void Run(); }; +class PrefetcherPolicy { + public: + /** Utilize the I/O pattern log to make prefetching decisions */ + virtual void Prefetch(BinaryLog &log) = 0; +}; + } // namespace hermes #endif // HERMES_SRC_PREFETCHER_H_ diff --git a/src/prefetcher/apriori_prefetcher.cc b/src/prefetcher/apriori_prefetcher.cc new file mode 100644 index 000000000..c9913cf64 --- /dev/null +++ b/src/prefetcher/apriori_prefetcher.cc @@ -0,0 +1,17 @@ +// +// Created by lukemartinlogan on 5/30/23. +// + +#include "apriori_prefetcher.h" + +namespace hermes { + +/** Constructor. Parse YAML schema. 
*/ +AprioriPrefetcher::AprioriPrefetcher() { +} + +/** Prefetch based on YAML schema */ +void AprioriPrefetcher::Prefetch(BinaryLog &log) { +} + +} // namespace hermes \ No newline at end of file diff --git a/src/prefetcher/apriori_prefetcher.h b/src/prefetcher/apriori_prefetcher.h new file mode 100644 index 000000000..e7b6a4724 --- /dev/null +++ b/src/prefetcher/apriori_prefetcher.h @@ -0,0 +1,26 @@ +// +// Created by lukemartinlogan on 5/30/23. +// + +#ifndef HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ +#define HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ + +#include "prefetcher.h" + +namespace hermes { + +class AprioriPrefetcher : public PrefetcherPolicy { + public: + /** Constructor. Parse YAML schema. */ + AprioriPrefetcher(); + + /** Destructor. */ + virtual ~AprioriPrefetcher() = default; + + /** Prefetch based on YAML schema */ + void Prefetch(BinaryLog &log); +}; + +} // namespace hermes + +#endif // HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ diff --git a/src/prefetcher_factory.h b/src/prefetcher_factory.h new file mode 100644 index 000000000..82e9371b5 --- /dev/null +++ b/src/prefetcher_factory.h @@ -0,0 +1,50 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +* Distributed under BSD 3-Clause license. * +* Copyright by The HDF Group. * +* Copyright by the Illinois Institute of Technology. * +* All rights reserved. * +* * +* This file is part of Hermes. The full Hermes copyright notice, including * +* terms governing use, modification, and redistribution, is contained in * +* the COPYING file, which can be found at the top directory. If you do not * +* have access to the file, you may request a copy from help@hdfgroup.org. 
* +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef HERMES_SRC_PREFETCHER_FACTORY_H_ +#define HERMES_SRC_PREFETCHER_FACTORY_H_ + +#include "prefetcher.h" +#include "prefetcher/apriori_prefetcher.h" + +namespace hermes { + +using hermes::PrefetcherType; + +/** + * A class to represent Data Placement Engine Factory + * */ +class PrefetcherFactory { + public: + /** + * return a pointer to prefetcher policy given a policy type. + * This uses factory pattern. + * + * @param[in] type a prefetcher policy type + * @return pointer to PrefetcherPolicy + */ + static PrefetcherPolicy* Get(const PrefetcherType &type) { + switch (type) { + case PrefetcherType::kApriori: { + return hshm::EasySingleton::GetInstance(); + } + default: { + HELOG(kFatal, "PlacementPolicy not implemented") + return NULL; + } + } + } +}; + +} // namespace hermes + +#endif // HERMES_SRC_PREFETCHER_FACTORY_H_ diff --git a/traits/prefetcher/prefetcher_header.h b/traits/prefetcher/prefetcher_header.h new file mode 100644 index 000000000..6499ffd2a --- /dev/null +++ b/traits/prefetcher/prefetcher_header.h @@ -0,0 +1,37 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * +* Distributed under BSD 3-Clause license. * +* Copyright by The HDF Group. * +* Copyright by the Illinois Institute of Technology. * +* All rights reserved. * +* * +* This file is part of Hermes. The full Hermes copyright notice, including * +* terms governing use, modification, and redistribution, is contained in * +* the COPYING file, which can be found at the top directory. If you do not * +* have access to the file, you may request a copy from help@hdfgroup.org. 
* +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#ifndef HERMES_TRAITS_PREFETCHER_PREFETCHER_HEADER_H_ +#define HERMES_TRAITS_PREFETCHER_PREFETCHER_HEADER_H_ + +#include "trait_manager.h" + +namespace hermes { + +/** Types of prefetchers available */ +enum class PrefetcherType { + kApriori +}; + +/** Header for prefetcher trait */ +struct PrefetcherTraitHeader : public TraitHeader { + hermes::PrefetcherType type_; + explicit PrefetcherTraitHeader(const std::string &trait_uuid, + const std::string &trait_name, + hermes::PrefetcherType type) + : TraitHeader(trait_uuid, trait_name, HERMES_TRAIT_PREFETCHER), + type_(type) {} +}; + +} // namespace hermes::api + +#endif // HERMES_TRAITS_PREFETCHER_PREFETCHER_HEADER_H_ diff --git a/traits/prefetcher/prefetcher_trait.cc b/traits/prefetcher/prefetcher_trait.cc index 9d2c4d157..6150363a5 100644 --- a/traits/prefetcher/prefetcher_trait.cc +++ b/traits/prefetcher/prefetcher_trait.cc @@ -12,7 +12,7 @@ #include "prefetcher_trait.h" -namespace hermes::api { +namespace hermes { void PrefetcherTrait::Run(int method, void *params) { (void) method; @@ -21,4 +21,4 @@ void PrefetcherTrait::Run(int method, void *params) { } // namespace hermes::api -HERMES_TRAIT_CC(hermes::api::PrefetcherTrait) \ No newline at end of file +HERMES_TRAIT_CC(hermes::PrefetcherTrait) \ No newline at end of file diff --git a/traits/prefetcher/prefetcher_trait.h b/traits/prefetcher/prefetcher_trait.h index 22c9d49e6..2599ad674 100644 --- a/traits/prefetcher/prefetcher_trait.h +++ b/traits/prefetcher/prefetcher_trait.h @@ -14,18 +14,11 @@ #define HERMES_TRAITS_EXAMPLE_EXAMPLE_TRAIT_H_ #include "hermes.h" +#include "prefetcher_header.h" -namespace hermes::api { - -struct PrefetcherTraitHeader : public TraitHeader { - hermes::PrefetcherType type_; - explicit PrefetcherTraitHeader(const std::string &trait_uuid, - const std::string &trait_name, - hermes::PrefetcherType type) - : TraitHeader(trait_uuid, trait_name, 
HERMES_TRAIT_PREFETCHER), - type_(type) {} -}; +namespace hermes { +/** Prefetcher trait */ class PrefetcherTrait : public Trait { public: HERMES_TRAIT_H(PrefetcherTrait, "PrefetcherTrait"); @@ -43,6 +36,6 @@ class PrefetcherTrait : public Trait { void Run(int method, void *params) override; }; -} // namespace hermes::api +} // namespace hermes #endif // HERMES_TRAITS_EXAMPLE_EXAMPLE_TRAIT_H_ From e45cde83243221cd43db3c43b3773b94fe4f4c51 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 06:09:18 -0500 Subject: [PATCH 38/44] Fix lint issues --- src/prefetcher.cc | 8 +++----- src/prefetcher/apriori_prefetcher.cc | 16 ++++++++++++---- src/prefetcher/apriori_prefetcher.h | 14 +++++++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/prefetcher.cc b/src/prefetcher.cc index 21644efc1..f92a3a833 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -12,6 +12,7 @@ #include "prefetcher.h" #include "hermes.h" +#include "prefetcher_factory.h" #include namespace hermes { @@ -84,11 +85,8 @@ void Prefetcher::Run() { if (trait->header_->flags_.Any(HERMES_TRAIT_PREFETCHER)) { auto *trait_hdr = trait->GetHeader(); - switch (trait_hdr->type_) { - case PrefetcherType::kApriori: { - - } - } + auto *policy = PrefetcherFactory::Get(trait_hdr->type_); + policy->Prefetch(log_); } } } diff --git a/src/prefetcher/apriori_prefetcher.cc b/src/prefetcher/apriori_prefetcher.cc index c9913cf64..2b74eb303 100644 --- a/src/prefetcher/apriori_prefetcher.cc +++ b/src/prefetcher/apriori_prefetcher.cc @@ -1,6 +1,14 @@ -// -// Created by lukemartinlogan on 5/30/23. -// +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. 
The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include "apriori_prefetcher.h" @@ -14,4 +22,4 @@ AprioriPrefetcher::AprioriPrefetcher() { void AprioriPrefetcher::Prefetch(BinaryLog &log) { } -} // namespace hermes \ No newline at end of file +} // namespace hermes diff --git a/src/prefetcher/apriori_prefetcher.h b/src/prefetcher/apriori_prefetcher.h index e7b6a4724..037a94fb3 100644 --- a/src/prefetcher/apriori_prefetcher.h +++ b/src/prefetcher/apriori_prefetcher.h @@ -1,6 +1,14 @@ -// -// Created by lukemartinlogan on 5/30/23. -// +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #ifndef HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ #define HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ From 47d16d9605fdc8183076c3cbd72bfce0ac1cacb8 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 06:14:39 -0500 Subject: [PATCH 39/44] Remove excessive IoStat --- src/CMakeLists.txt | 1 + src/metadata_manager.cc | 5 ----- src/metadata_manager.h | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 687c6bc1d..8388271f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -33,6 +33,7 @@ set(HERMES_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/buffer_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/buffer_organizer.cc ${CMAKE_CURRENT_SOURCE_DIR}/prefetcher.cc + ${CMAKE_CURRENT_SOURCE_DIR}/prefetcher/apriori_prefetcher.cc ${CMAKE_CURRENT_SOURCE_DIR}/trait_manager.cc ${CMAKE_CURRENT_SOURCE_DIR}/data_placement_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/dpe/random.cc diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index 77c9ffe04..d34d1cb3a 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -365,7 +365,6 @@ MetadataManager::LocalPutBlobMetadata(TagId bkt_id, blob_info.score_ = score; blob_info.UpdateWriteStats(); } - AddIoStat(bkt_id, blob_id, blob_size, IoType::kWrite); return std::tuple(blob_id, did_create, orig_blob_size); } @@ -467,7 +466,6 @@ std::vector MetadataManager::LocalGetBlobBuffers(BlobId blob_id) { // Acquire blob_info read lock ScopedRwReadLock blob_info_lock(blob_info.lock_[0], kMDM_LocalGetBlobBuffers); - AddIoStat(blob_info.tag_id_, blob_id, blob_info.blob_size_, IoType::kRead); blob_info.UpdateReadStats(); auto vec = blob_info.buffers_->vec(); return vec; @@ -939,15 +937,12 @@ void MetadataManager::AddIoStat(TagId tag_id, if (!enable_io_tracing_) { return; } - ScopedRwWriteLock io_pattern_lock(header_->lock_[kIoPatternLogLock], - kMDM_AddIoStat); IoStat stat; stat.blob_id_ = 
blob_id; stat.tag_id_ = tag_id; stat.blob_size_ = blob_size; stat.type_ = type; stat.rank_ = 0; - // TODO(llogan): make MPI-awareness configurable if (is_mpi_) { MPI_Comm_rank(MPI_COMM_WORLD, &stat.rank_); } diff --git a/src/metadata_manager.h b/src/metadata_manager.h index b158da807..7ecf7864d 100644 --- a/src/metadata_manager.h +++ b/src/metadata_manager.h @@ -40,12 +40,8 @@ typedef hipc::mpsc_queue IO_PATTERN_LOG_T; enum MdmLock { kBlobMapLock, - kBktMapLock, kTagMapLock, - kTagDeleteLock, kTraitMapLock, - kLocalTraitMapLock, - kIoPatternLogLock, kFlushLock, kMdmLockCount From 83b9e2352a98d412cebb7c7a556c40a34e644b82 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 06:38:00 -0500 Subject: [PATCH 40/44] Binary log compiles --- adapter/test/posix/tests.py | 5 ++--- ci/py_hermes_ci/py_hermes_ci/test_manager.py | 1 - config/hermes_server_default.yaml | 3 ++- src/config_server.cc | 4 ++++ src/config_server.h | 1 + src/config_server_default.h | 3 ++- src/hermes_types.h | 16 ++++++++++++---- src/prefetcher.cc | 3 +++ 8 files changed, 26 insertions(+), 10 deletions(-) diff --git a/adapter/test/posix/tests.py b/adapter/test/posix/tests.py index 1141c45aa..b29b4300f 100644 --- a/adapter/test/posix/tests.py +++ b/adapter/test/posix/tests.py @@ -30,10 +30,9 @@ def test_hermes_posix_basic_small(self): spawn_info = self.spawn_info(nprocs=1, hermes_conf='hermes_server') self.start_daemon(spawn_info) - # node = Exec(posix_cmd, spawn_info) + node = Exec(posix_cmd, spawn_info) self.stop_daemon(spawn_info) - # return node.exit_code - return 0 + return node.exit_code def test_hermes_posix_basic_large(self): posix_cmd = f"{self.HERMES_POSIX_CMD} " \ diff --git a/ci/py_hermes_ci/py_hermes_ci/test_manager.py b/ci/py_hermes_ci/py_hermes_ci/test_manager.py index eb53b1cab..b541a9c60 100644 --- a/ci/py_hermes_ci/py_hermes_ci/test_manager.py +++ b/ci/py_hermes_ci/py_hermes_ci/test_manager.py @@ -192,7 +192,6 @@ def stop_daemon(self, spawn_info): :return: None """ 
print("Stop daemon") - return Exec(f"{self.CMAKE_BINARY_DIR}/bin/finalize_hermes", LocalExecInfo( env=spawn_info.basic_env)) diff --git a/config/hermes_server_default.yaml b/config/hermes_server_default.yaml index 3bd1a2109..622e26c5d 100644 --- a/config/hermes_server_default.yaml +++ b/config/hermes_server_default.yaml @@ -149,8 +149,9 @@ tracing: prefetch: enabled: false io_trace_path: "" + apriori_schema_path: "" epoch_ms: 50 - is_mpi: true + is_mpi: false # The shared memory prefix for the hermes shared memory segment. A user name # will be automatically appended. diff --git a/src/config_server.cc b/src/config_server.cc index 8f04e7b12..e438df47d 100644 --- a/src/config_server.cc +++ b/src/config_server.cc @@ -148,6 +148,10 @@ void ServerConfig::ParsePrefetchInfo(YAML::Node yaml_conf) { if (yaml_conf["is_mpi"]) { prefetcher_.is_mpi_ = yaml_conf["is_mpi"].as(); } + if (yaml_conf["apriori_schema_path"]) { + prefetcher_.apriori_schema_path_ = + yaml_conf["apriori_schema_path"].as(); + } } /** parse prefetch information from YAML config */ diff --git a/src/config_server.h b/src/config_server.h index a6d1651e2..d0feebc38 100644 --- a/src/config_server.h +++ b/src/config_server.h @@ -234,6 +234,7 @@ struct BorgInfo { struct PrefetchInfo { bool enabled_; std::string trace_path_; + std::string apriori_schema_path_; size_t epoch_ms_; bool is_mpi_; }; diff --git a/src/config_server_default.h b/src/config_server_default.h index dc8982eed..dfba11a44 100644 --- a/src/config_server_default.h +++ b/src/config_server_default.h @@ -152,8 +152,9 @@ const char* kServerDefaultConfigStr = "prefetch:\n" " enabled: false\n" " io_trace_path: \"\"\n" +" apriori_schema_path: \"\"\n" " epoch_ms: 50\n" -" is_mpi: true\n" +" is_mpi: false\n" "\n" "# The shared memory prefix for the hermes shared memory segment. 
A user name\n" "# will be automatically appended.\n" diff --git a/src/hermes_types.h b/src/hermes_types.h index 3757d26e9..caeefd077 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -181,11 +181,11 @@ struct UniqueId { return unique_ != other.unique_ || node_id_ != other.node_id_; } - /** Serialize a UniqueId */ + /** Serialize a UniqueId template void serialize(Archive &ar) { ar(unique_, node_id_); - } + }*/ }; typedef UniqueId<1> BlobId; typedef UniqueId<2> TagId; @@ -248,14 +248,22 @@ struct IoStat { template void save(Archive &ar) const { int type = static_cast(type_); - ar(type, blob_id_, tag_id_, blob_size_, rank_); + u64 ids[2] = {blob_id_.unique_, tag_id_.unique_}; + i32 nodes[2] = {blob_id_.node_id_, tag_id_.node_id_}; + ar(type, ids[0], nodes[0], ids[1], nodes[1], blob_size_, rank_); } /** Deserialize */ template void load(Archive &ar) { int type; - ar(type, blob_id_, tag_id_, blob_size_, rank_); + ar(type, + blob_id_.unique_, + blob_id_.node_id_, + tag_id_.unique_, + tag_id_.node_id_, + blob_size_, + rank_); type_ = static_cast(type); } }; diff --git a/src/prefetcher.cc b/src/prefetcher.cc index f92a3a833..d55cde6d5 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -59,6 +59,7 @@ void Prefetcher::Init() { tl::thread::self().sleep(*HERMES->rpc_.server_engine_, prefetch->epoch_ms_); } + prefetch->log_.Flush(true); HILOG(kDebug, "Prefetcher has stopped") }; HERMES_THREAD_MANAGER->Spawn(prefetcher); @@ -90,6 +91,8 @@ void Prefetcher::Run() { } } } + + log_.Flush(false); } } // namespace hermes From d15b01bfb85e0ad655b0dfc62bb185c1b69eea57 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 07:46:24 -0500 Subject: [PATCH 41/44] BORG + prefetcher doc --- src/binlog.h | 9 +++ src/config.h | 3 +- src/prefetcher.cc | 2 +- src/prefetcher.h | 2 +- src/prefetcher/apriori_prefetcher.cc | 82 +++++++++++++++++++++++++++- src/prefetcher/apriori_prefetcher.h | 21 ++++++- test/data/apriori_schema.yaml | 21 +++++++ 7 files changed, 135 
insertions(+), 5 deletions(-) create mode 100644 test/data/apriori_schema.yaml diff --git a/src/binlog.h b/src/binlog.h index da46fe7bd..27c4316c6 100644 --- a/src/binlog.h +++ b/src/binlog.h @@ -93,6 +93,15 @@ class BinaryLog { cur_entry_count_ += 1; } + /** + * Get the total number of ops stored in the rank's log throughout + * its lifetime. + * */ + size_t GetRankLogSize(int rank) { + auto &cache = cache_[rank]; + return cache.log_.size() + cache.backend_off_; + } + /** * Get the next entry corresponding to the rank * */ diff --git a/src/config.h b/src/config.h index d9dd202cf..7772eb657 100644 --- a/src/config.h +++ b/src/config.h @@ -50,7 +50,8 @@ class BaseConfig { return; } auto real_path = hshm::ConfigParse::ExpandPath(path); - HILOG(kDebug, "Start load config {}", real_path)try { + HILOG(kDebug, "Start load config {}", real_path) + try { YAML::Node yaml_conf = YAML::LoadFile(real_path); HILOG(kDebug, "Complete load config {}", real_path) ParseYAML(yaml_conf); diff --git a/src/prefetcher.cc b/src/prefetcher.cc index d55cde6d5..364d31ead 100644 --- a/src/prefetcher.cc +++ b/src/prefetcher.cc @@ -87,7 +87,7 @@ void Prefetcher::Run() { auto *trait_hdr = trait->GetHeader(); auto *policy = PrefetcherFactory::Get(trait_hdr->type_); - policy->Prefetch(log_); + policy->Prefetch(borg_, log_); } } } diff --git a/src/prefetcher.h b/src/prefetcher.h index 9b2e6d022..2904d53ab 100644 --- a/src/prefetcher.h +++ b/src/prefetcher.h @@ -49,7 +49,7 @@ class Prefetcher { class PrefetcherPolicy { public: /** Utilize the I/O pattern log to make prefetching decisions */ - virtual void Prefetch(BinaryLog &log) = 0; + virtual void Prefetch(BufferOrganizer *borg, BinaryLog &log) = 0; }; } // namespace hermes diff --git a/src/prefetcher/apriori_prefetcher.cc b/src/prefetcher/apriori_prefetcher.cc index 2b74eb303..a306f12b3 100644 --- a/src/prefetcher/apriori_prefetcher.cc +++ b/src/prefetcher/apriori_prefetcher.cc @@ -11,15 +11,95 @@ * * * * * * * * * * * * * * * * * * * * * * * * * 
* * * * * * * * * * * * * */ #include "apriori_prefetcher.h" +#include "hermes.h" namespace hermes { /** Constructor. Parse YAML schema. */ AprioriPrefetcher::AprioriPrefetcher() { + auto &path = HERMES->server_config_.prefetcher_.apriori_schema_path_; + auto real_path = hshm::ConfigParse::ExpandPath(path); + HILOG(kDebug, "Start load apriori schema {}", real_path) + try { + YAML::Node yaml_conf = YAML::LoadFile(real_path); + ParseSchema(yaml_conf); + HILOG(kDebug, "Complete load of apriori schema {}", real_path) + } catch (std::exception &e) { + HELOG(kFatal, e.what()) + } +} + +void ParseList(std::vector &list, YAML::Node node) { + list.reserve(node.size()); + for (YAML::Node sub_node : node) { + list.emplace_back(sub_node.as()); + } +} + +/** Parse YAML schema. */ +void AprioriPrefetcher::ParseSchema(YAML::Node &schema) { + rank_info_.resize(schema.size()); + for (const auto &rank_node_pair : schema) { + const YAML::Node& rank_node = rank_node_pair.first; + const YAML::Node& rank_instrs = rank_node_pair.second; + int rank = rank_node.as(); + auto &instr_list = rank_info_[rank]; + for (YAML::Node instr_list_node : rank_instrs) { + instr_list.emplace_back(); + auto &instr = instr_list.back(); + YAML::Node op_count_range_node = instr_list_node["op_count_range"]; + instr.min_op_count_ = op_count_range_node[0].as(); + instr.max_op_count_ = op_count_range_node[1].as(); + for (YAML::Node instr_node : instr_list_node["prefetch"]) { + instr.promotes_.emplace_back(); + auto &promote = instr.promotes_.back(); + promote.bkt_name_ = instr_node["bucket_name"].as(); + ParseList(promote.promote_, instr_node["promote_blobs"]); + ParseList(promote.demote_, instr_node["demote_blobs"]); + } + } + } } /** Prefetch based on YAML schema */ -void AprioriPrefetcher::Prefetch(BinaryLog &log) { +void AprioriPrefetcher::Prefetch(BufferOrganizer *borg, + BinaryLog &log) { + for (size_t rank = 0; rank < rank_info_.size(); ++rank) { + size_t num_ops = log.GetRankLogSize((int)rank); + auto 
&instr_list = rank_info_[rank]; + + // Find the instruction to execute for this rank + auto begin = instr_list.begin(); + auto cur = begin; + for (; cur != instr_list.end(); ++cur) { + auto &instr = *cur; + if (instr.min_op_count_ <= num_ops && instr.max_op_count_ <= num_ops) { + break; + } + } + + // First, demote blobs + if (cur != instr_list.end()) { + auto &instr = *cur; + for (auto &promote_instr : instr.promotes_) { + for (auto &blob_name : promote_instr.demote_) { + borg->GlobalOrganizeBlob(promote_instr.bkt_name_, + blob_name, 0); + } + } + + // Next, promote blobs + for (auto &promote_instr : instr.promotes_) { + for (auto &blob_name : promote_instr.promote_) { + borg->GlobalOrganizeBlob(promote_instr.bkt_name_, + blob_name, 1); + } + } + } + + // Erase unneeded logs + instr_list.erase(begin, cur); + } } } // namespace hermes diff --git a/src/prefetcher/apriori_prefetcher.h b/src/prefetcher/apriori_prefetcher.h index 037a94fb3..ef5ec3e4b 100644 --- a/src/prefetcher/apriori_prefetcher.h +++ b/src/prefetcher/apriori_prefetcher.h @@ -14,19 +14,38 @@ #define HERMES_SRC_PREFETCHER_APRIORI_PREFETCHER_H_ #include "prefetcher.h" +#include namespace hermes { +struct AprioriPromoteInstr { + std::string bkt_name_; + std::vector promote_; + std::vector demote_; +}; + +struct AprioriPrefetchInstr { + size_t min_op_count_; + size_t max_op_count_; + std::vector promotes_; +}; + class AprioriPrefetcher : public PrefetcherPolicy { + public: + std::vector> rank_info_; + public: /** Constructor. Parse YAML schema. */ AprioriPrefetcher(); + /** Parse YAML config */ + void ParseSchema(YAML::Node &schema); + /** Destructor. 
*/ virtual ~AprioriPrefetcher() = default; /** Prefetch based on YAML schema */ - void Prefetch(BinaryLog &log); + void Prefetch(BufferOrganizer *borg, BinaryLog &log); }; } // namespace hermes diff --git a/test/data/apriori_schema.yaml b/test/data/apriori_schema.yaml new file mode 100644 index 000000000..342f0791b --- /dev/null +++ b/test/data/apriori_schema.yaml @@ -0,0 +1,21 @@ +0: + - op_count_range: [2, 2] + prefetch: + - bucket: /tmp/test_hermes/hi.txt + promote_blobs: [ 3, 4 ] + demote_blobs: [ 1, 2 ] + - op_count_range: [4, 4] + prefetch: + - bucket: /tmp/test_hermes/hi.txt + promote_blobs: [ 5, 6 ] + demote_blobs: [ 3, 4 ] + - op_count_range: [6, 6] + prefetch: + - bucket: /tmp/test_hermes/hi.txt + promote_blobs: [ 5, 6 ] + demote_blobs: [ 3, 4 ] + - op_count_range: [8, 8] + prefetch: + - bucket: /tmp/test_hermes/hi.txt + promote_blobs: [ 7, 8 ] + demote_blobs: [ 5, 6 ] From c9fa866a53537d26cf34629264fc91e469545d2c Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 08:27:42 -0500 Subject: [PATCH 42/44] Prefetch cleanup --- benchmarks/reorganize.cc | 13 ++++++++++--- config/hermes_server_default.yaml | 7 ++++--- src/config_server_default.h | 9 +++++---- src/metadata_manager.cc | 2 +- src/prefetcher/apriori_prefetcher.cc | 2 +- test/data/apriori_schema.yaml | 8 ++++---- test/data/hermes_server_prefetch.yaml | 1 + 7 files changed, 26 insertions(+), 16 deletions(-) diff --git a/benchmarks/reorganize.cc b/benchmarks/reorganize.cc index d201db006..785f0fbe3 100644 --- a/benchmarks/reorganize.cc +++ b/benchmarks/reorganize.cc @@ -22,6 +22,7 @@ using hshm::ipc::PosixShmMmap; using Timer = hshm::HighResMonotonicTimer; hipc::MemoryBackend *backend; +const char *kBucketName = "/tmp/test_hermes/hi.txt"; void GatherTimes(const std::string &test_name, size_t total_size, @@ -51,7 +52,7 @@ void PutTest(int nprocs, int rank, size_t blob_size, int compute_sec) { Timer t, io_t; - auto bkt = HERMES->GetBucket("hello"); + auto bkt = 
HERMES->GetBucket(kBucketName); hermes::api::Context ctx; hermes::BlobId blob_id; hermes::Blob blob(blob_size); @@ -84,7 +85,7 @@ void GetTest(int nprocs, int rank, size_t blob_size, int compute_sec) { Timer t, io_t; - auto bkt = HERMES->GetBucket("hello"); + auto bkt = HERMES->GetBucket(kBucketName); hermes::api::Context ctx; hermes::BlobId blob_id; t.Resume(); @@ -125,12 +126,15 @@ int main(int argc, char **argv) { int compute_put = atoi(argv[5]); int compute_get = atoi(argv[6]); + // Start Hermes + HERMES->Create(hermes::HermesType::kClient); + // Register the Apriori trait hermes::TraitId apriori_trait = HERMES->RegisterTrait( "apriori", hermes::PrefetcherType::kApriori); if (with_prefetch) { - auto bkt = HERMES->GetBucket("hello"); + auto bkt = HERMES->GetBucket(kBucketName); bkt.AttachTrait(apriori_trait); } @@ -139,5 +143,8 @@ int main(int argc, char **argv) { MPI_Barrier(MPI_COMM_WORLD); GetTest(nprocs, rank, blobs_per_checkpt, num_checkpts, blob_size, compute_get); + + // Finalize + HERMES->Finalize(); MPI_Finalize(); } diff --git a/config/hermes_server_default.yaml b/config/hermes_server_default.yaml index 622e26c5d..fb6b3ca2d 100644 --- a/config/hermes_server_default.yaml +++ b/config/hermes_server_default.yaml @@ -78,9 +78,9 @@ devices: # Define the maximum amount of memory Hermes can use for non-buffering tasks. # This includes metadata management and memory allocations. -# This memory will not be preallocated, so if you don't know, you can set it -# to be high. 
-max_memory: 8g +# This memory will not be preallocated, so if you don't know, 0 indicates +# any amount of memory +max_memory: 0g ### Define properties of RPCs rpc: @@ -166,3 +166,4 @@ traits: - "hermes_stdio_io_client" - "hermes_mpiio_io_client" - "hermes_example_trait" + - "hermes_prefetcher_trait" diff --git a/src/config_server_default.h b/src/config_server_default.h index dfba11a44..16d84c316 100644 --- a/src/config_server_default.h +++ b/src/config_server_default.h @@ -81,9 +81,9 @@ const char* kServerDefaultConfigStr = "\n" "# Define the maximum amount of memory Hermes can use for non-buffering tasks.\n" "# This includes metadata management and memory allocations.\n" -"# This memory will not be preallocated, so if you don\'t know, you can set it\n" -"# to be high.\n" -"max_memory: 8g\n" +"# This memory will not be preallocated, so if you don\'t know, 0 indicates\n" +"# any amount of memory\n" +"max_memory: 0g\n" "\n" "### Define properties of RPCs\n" "rpc:\n" @@ -168,5 +168,6 @@ const char* kServerDefaultConfigStr = " - \"hermes_posix_io_client\"\n" " - \"hermes_stdio_io_client\"\n" " - \"hermes_mpiio_io_client\"\n" -" - \"hermes_example_trait\"\n"; +" - \"hermes_example_trait\"\n" +" - \"hermes_prefetcher_trait\"\n"; #endif // HERMES_SRC_CONFIG_SERVER_DEFAULT_H_ \ No newline at end of file diff --git a/src/metadata_manager.cc b/src/metadata_manager.cc index d34d1cb3a..1582ec4c3 100644 --- a/src/metadata_manager.cc +++ b/src/metadata_manager.cc @@ -874,7 +874,7 @@ MetadataManager::LocalGetTraitParams(TraitId trait_id) { * Get an existing trait * */ Trait* MetadataManager::GlobalGetTrait(TraitId trait_id) { - HILOG(kDebug, "Getting the trait {}", trait_id) + // HILOG(kDebug, "Getting the trait {}", trait_id) Trait *trait = nullptr; // Check if trait is already constructed diff --git a/src/prefetcher/apriori_prefetcher.cc b/src/prefetcher/apriori_prefetcher.cc index a306f12b3..e6fd3999b 100644 --- a/src/prefetcher/apriori_prefetcher.cc +++ 
b/src/prefetcher/apriori_prefetcher.cc @@ -53,7 +53,7 @@ void AprioriPrefetcher::ParseSchema(YAML::Node &schema) { for (YAML::Node instr_node : instr_list_node["prefetch"]) { instr.promotes_.emplace_back(); auto &promote = instr.promotes_.back(); - promote.bkt_name_ = instr_node["bucket_name"].as(); + promote.bkt_name_ = instr_node["bucket"].as(); ParseList(promote.promote_, instr_node["promote_blobs"]); ParseList(promote.demote_, instr_node["demote_blobs"]); } diff --git a/test/data/apriori_schema.yaml b/test/data/apriori_schema.yaml index 342f0791b..ab909b8b0 100644 --- a/test/data/apriori_schema.yaml +++ b/test/data/apriori_schema.yaml @@ -1,20 +1,20 @@ 0: - - op_count_range: [2, 2] + - op_count_range: [5, 5] prefetch: - bucket: /tmp/test_hermes/hi.txt promote_blobs: [ 3, 4 ] demote_blobs: [ 1, 2 ] - - op_count_range: [4, 4] + - op_count_range: [7, 7] prefetch: - bucket: /tmp/test_hermes/hi.txt promote_blobs: [ 5, 6 ] demote_blobs: [ 3, 4 ] - - op_count_range: [6, 6] + - op_count_range: [9, 9] prefetch: - bucket: /tmp/test_hermes/hi.txt promote_blobs: [ 5, 6 ] demote_blobs: [ 3, 4 ] - - op_count_range: [8, 8] + - op_count_range: [11, 11] prefetch: - bucket: /tmp/test_hermes/hi.txt promote_blobs: [ 7, 8 ] diff --git a/test/data/hermes_server_prefetch.yaml b/test/data/hermes_server_prefetch.yaml index 34d3d7d46..24306b43d 100644 --- a/test/data/hermes_server_prefetch.yaml +++ b/test/data/hermes_server_prefetch.yaml @@ -133,5 +133,6 @@ system_view_state_update_interval_ms: 1000 prefetch: enabled: true io_trace_path: "${HOME}/test/io_trace.yaml" + apriori_schema_path: "${HOME}/apriori_schema.yaml" epoch_ms: 50 is_mpi: true \ No newline at end of file From 7b09b322ff01a1bb29adf600adc903632786e516 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 08:28:14 -0500 Subject: [PATCH 43/44] Temporarily disable ci --- .github/workflows/main.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main.yml 
b/.github/workflows/main.yml index fbb99bee2..369ad52ac 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -71,15 +71,15 @@ jobs: if: steps.spack-cache.outputs.cache-hit != 'true' run: ci/install_deps.sh - - name: Build - if: steps.hermes-cache.outputs.cache-hit != 'true' - run: ci/build_hermes.sh - - - name: Test - run: cd build && ctest -VV -R test_hermes_posix_basic_small - - - name: Install - run: pushd build && make install && popd +# - name: Build +# if: steps.hermes-cache.outputs.cache-hit != 'true' +# run: ci/build_hermes.sh +# +# - name: Test +# run: cd build && ctest -VV -R test_hermes_posix_basic_small +# +# - name: Install +# run: pushd build && make install && popd # Enable tmate debugging of manually-triggered workflows if the input option was provided # - name: Setup tmate session From 4b1abc9ebc488378d9fe34c56a0954d8fbe8e00c Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 30 May 2023 08:31:15 -0500 Subject: [PATCH 44/44] Add back make and install --- .github/workflows/main.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 369ad52ac..1edc7276c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -71,15 +71,15 @@ jobs: if: steps.spack-cache.outputs.cache-hit != 'true' run: ci/install_deps.sh -# - name: Build -# if: steps.hermes-cache.outputs.cache-hit != 'true' -# run: ci/build_hermes.sh + - name: Build + if: steps.hermes-cache.outputs.cache-hit != 'true' + run: ci/build_hermes.sh # # - name: Test # run: cd build && ctest -VV -R test_hermes_posix_basic_small # -# - name: Install -# run: pushd build && make install && popd + - name: Install + run: pushd build && make install && popd # Enable tmate debugging of manually-triggered workflows if the input option was provided # - name: Setup tmate session