Skip to content

Commit

Permalink
Merge pull request #465 from imperialCHEPI/data_source_from_config
Browse files Browse the repository at this point in the history
Allow users to specify data source in config file
  • Loading branch information
alexdewar committed Jul 10, 2024
2 parents 4578306 + eb72187 commit c598e10
Show file tree
Hide file tree
Showing 12 changed files with 215 additions and 52 deletions.
17 changes: 7 additions & 10 deletions src/HealthGPS.Console/command_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,18 +61,15 @@ CommandOptions parse_arguments(cxxopts::Options &options, int &argc, char *argv[
}

if (result.count("storage")) {
cmd.data_path_or_url = result["storage"].as<std::string>();
auto source = result["storage"].as<std::string>();

if (!cmd.data_path_or_url.starts_with("http://") &&
!cmd.data_path_or_url.starts_with("https://")) {
const std::filesystem::path path = cmd.data_path_or_url;
fmt::print(fmt::fg(fmt::color::yellow),
"WARNING: Path to data source specified with command-line argument. "
"This functionality is deprecatated and will be removed in future. You "
"should pass the data source via the config file.\n");
fmt::print("Data source: {}\n", source);

if (path.is_relative()) {
cmd.data_path_or_url = std::filesystem::absolute(path).string();
}
}

fmt::print("Data source: {}\n", cmd.data_path_or_url);
cmd.data_source = hgps::input::DataSource(std::move(source));
}

if (result.count("jobid")) {
Expand Down
6 changes: 5 additions & 1 deletion src/HealthGPS.Console/command_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
* @brief Functionality for parsing console application's command-line arguments
*/
#pragma once

#include <cxxopts.hpp>

#include "HealthGPS.Input/data_source.h"

#include <filesystem>
#include <optional>

namespace hgps {
/// @brief Defines the Command Line Interface (CLI) arguments options
Expand All @@ -20,7 +24,7 @@ struct CommandOptions {
std::filesystem::path config_file{};

/// @brief The back-end storage full path or URL argument value
std::string data_path_or_url;
std::optional<hgps::input::DataSource> data_source;

/// @brief Indicates whether the application logging is verbose
bool verbose{};
Expand Down
17 changes: 16 additions & 1 deletion src/HealthGPS.Console/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,23 @@ int main(int argc, char *argv[]) { // NOLINT(bugprone-exception-escape)
#ifdef CATCH_EXCEPTIONS
try {
#endif
// In future, we want users to supply the data source via the config file only, but for now
// we also allow passing it via a command line argument. Sanity check: Make sure they only
// do one of these things!
if (cmd_args.data_source.has_value() == config.data_source.has_value()) {
fmt::print(
fg(fmt::color::red),
"Must provide a data source via config file or command line, but not both\n");
return exit_application(EXIT_FAILURE);
}

// NOLINTBEGIN(bugprone-unchecked-optional-access)
const auto &data_source =
cmd_args.data_source.has_value() ? *cmd_args.data_source : *config.data_source;
// NOLINTEND(bugprone-unchecked-optional-access)

// Create back-end data store, cached data repository wrapper
auto data_api = input::DataManager(cmd_args.data_path_or_url, config.verbosity);
auto data_api = input::DataManager(data_source.get_data_directory(), config.verbosity);
auto data_repository = hgps::CachedRepository{data_api};

// Register the input risk factors model definitions
Expand Down
2 changes: 2 additions & 0 deletions src/HealthGPS.Input/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ target_sources(
"csvparser.h"
"datamanager.cpp"
"datamanager.h"
"data_source.cpp"
"data_source.h"
"download_file.cpp"
"download_file.h"
"jsonparser.cpp"
Expand Down
27 changes: 26 additions & 1 deletion src/HealthGPS.Input/configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#include "HealthGPS.Core/poco.h"
#include "HealthGPS.Core/scoped_timer.h"

#include <chrono>
#include <fmt/chrono.h>
#include <fmt/color.h>

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <optional>
Expand All @@ -31,6 +33,23 @@
#define MEASURE_FUNCTION()
#endif

namespace {
using namespace hgps::input;

/// @brief Build a DataSource from the "data" section of the configuration file
/// @param opt JSON object holding a mandatory "source" string and an optional "checksum" string
/// @param root_path Path to use as root for relative paths
/// @return The data source described by the JSON object
/// @throws nlohmann::json::out_of_range if the "source" key is missing
/// @throws nlohmann::json::type_error if a value does not have the expected type
DataSource get_data_source_from_json(const nlohmann::json &opt,
                                     const std::filesystem::path &root_path) {
    // at() is used instead of operator[] because indexing a const JSON object with a key that
    // does not exist is undefined behaviour, whereas at() reports the problem with an exception.
    auto source = opt.at("source").get<std::string>();

    // Checksum is not required if source is a directory, else it is mandatory. Whether it is
    // actually needed is decided later by DataSource, so a missing key is not an error here.
    std::optional<std::string> file_hash;
    if (const auto checksum = opt.find("checksum"); checksum != opt.end()) {
        file_hash = checksum->get<std::string>();
    }

    return DataSource(std::move(source), root_path, std::move(file_hash));
}
} // anonymous namespace

namespace hgps::input {
using namespace hgps;
using json = nlohmann::json;
Expand Down Expand Up @@ -74,6 +93,12 @@ Configuration get_configuration(const std::filesystem::path &config_file, int jo
// Base dir for relative paths
config.root_path = config_file.parent_path();

// Read data source from JSON file. For now, this is optional, but in future it will be
// mandatory.
if (opt.contains("data")) {
config.data_source = get_data_source_from_json(opt["data"], config.root_path);
}

// input dataset file
try {
load_input_info(opt, config);
Expand Down
7 changes: 5 additions & 2 deletions src/HealthGPS.Input/configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
*/
#pragma once

#include "data_source.h"
#include "poco.h"
#include "version.h"

#include "HealthGPS.Core/api.h"
#include "HealthGPS/intervention_scenario.h"
#include "HealthGPS/modelinput.h"
#include "HealthGPS/scenario.h"
#include "HealthGPS/simulation.h"

#include "HealthGPS.Core/api.h"

#include <optional>
#include <stdexcept>

Expand All @@ -27,6 +27,9 @@ struct Configuration {
/// @brief The root path for configuration files
std::filesystem::path root_path;

/// @brief Static data source for the simulation (either URL or path)
std::optional<DataSource> data_source;

/// @brief The input data file details
FileInfo file;

Expand Down
107 changes: 107 additions & 0 deletions src/HealthGPS.Input/data_source.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#include "data_source.h"
#include "HealthGPS/sha256.h"
#include "download_file.h"
#include "zip_file.h"

#include <fmt/format.h>

namespace {
/// @brief Rebase a relative directory path on root_path
/// @details If source is a relative path which names a directory underneath root_path, the
/// rebased path is returned. Anything else (absolute paths, URLs, files, paths which do not
/// exist under root_path) is returned unchanged.
/// @param source File/directory path or URL
/// @param root_path Path to use as root for relative paths
/// @return The rebased directory path, or source itself if no rebasing applies
std::string try_rebase_path(std::string source, const std::filesystem::path &root_path) {
    const std::filesystem::path path = source;
    if (path.is_absolute()) {
        return source;
    }

    // The existence check has to be made on the rebased path. Testing the bare relative path
    // would resolve it against the current working directory, so a directory that only exists
    // relative to root_path would never be rebased.
    const auto rebased = root_path / path;
    if (std::filesystem::is_directory(rebased)) {
        return rebased.string();
    }

    return source;
}

/// @brief Get a path to a local zip file for the given source
/// @details A source naming an existing local ".zip" file is returned as-is. A source starting
/// with "http://" or "https://" is downloaded to a temporary file first.
/// @param source Path to a zip file or a URL
/// @return Path to the zip file on disk
/// @throws std::runtime_error if source is neither an existing zip file nor a URL
std::filesystem::path get_zip_file_path(const std::string &source) {
    // The file system is only consulted when the name has the right extension
    const bool is_local_zip =
        source.ends_with(".zip") && std::filesystem::is_regular_file(source);
    if (is_local_zip) {
        return source;
    }

    // If it's a URL rather than a zip file, we have to download it first
    const bool is_url = source.starts_with("http://") || source.starts_with("https://");
    if (!is_url) {
        throw std::runtime_error(
            "Data source must be a directory, a zip file or a URL pointing to a zip file");
    }

    return hgps::input::download_file_to_temporary(source, ".zip");
}

std::filesystem::path get_data_directory_with_validation(const std::string &source,
const std::string &file_hash) {
// If the cache folder already exists, then we don't need to download or extract anything
auto cache_path = hgps::input::get_zip_cache_directory(file_hash);
if (std::filesystem::is_directory(cache_path)) {
return cache_path;
}

const auto zip_file_path = get_zip_file_path(source);

// Validate file with checksum
const auto computed_hash = hgps::compute_sha256_for_file(zip_file_path);
if (computed_hash != file_hash) {
throw std::runtime_error(
fmt::format("Checksum validation failed for {} (actual: {}, expected: {})", source,
computed_hash, file_hash));
}

// Extract files
hgps::input::extract_zip_file(zip_file_path, cache_path);

return cache_path;
}

std::filesystem::path get_data_directory_without_validation(const std::string &source) {
const auto zip_file_path = get_zip_file_path(source);
const auto file_hash = hgps::compute_sha256_for_file(zip_file_path);

// If the cache folder already exists, then we don't need to extract anything
auto cache_path = hgps::input::get_zip_cache_directory(file_hash);
if (std::filesystem::is_directory(cache_path)) {
return cache_path;
}

// Extract files
hgps::input::extract_zip_file(zip_file_path, cache_path);

return cache_path;
}

} // anonymous namespace

namespace hgps::input {

DataSource::DataSource(std::string source)
: source_(std::move(source)), file_hash_(std::nullopt), validate_checksum_(false) {}

DataSource::DataSource(std::string source, const std::filesystem::path &root_path,
std::optional<std::string> file_hash)
: source_(try_rebase_path(std::move(source), root_path)), file_hash_(std::move(file_hash)),
validate_checksum_(true) {}

// Resolve the data source to a directory on disk.
//  - A source which is already a directory is returned directly (no checksum needed).
//  - Otherwise the source is handled as a zip file or URL, with or without checksum
//    validation depending on how this object was constructed.
// Throws std::runtime_error if validation is required but no checksum was supplied.
std::filesystem::path DataSource::get_data_directory() const {
    if (std::filesystem::is_directory(source_)) {
        return source_;
    }

    if (validate_checksum_) {
        // Validation was requested, so a hash is compulsory for anything but a directory
        if (!file_hash_) {
            throw std::runtime_error(
                "Checksum must be supplied if data source is URL or zip file");
        }

        return get_data_directory_with_validation(source_, *file_hash_);
    }

    return get_data_directory_without_validation(source_);
}
} // namespace hgps::input
41 changes: 41 additions & 0 deletions src/HealthGPS.Input/data_source.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

namespace hgps::input {

/// @brief Represents a source (either file/directory path or a URL) for static data
class DataSource {
  public:
    /// @brief Create a new DataSource without checksum validation
    /// @param source File/directory path or URL
    explicit DataSource(std::string source);

    /// @brief Create a new DataSource with checksum validation
    /// @details Note that the file hash is not required for directories, but is compulsory
    /// otherwise
    /// @param source File/directory path or URL
    /// @param root_path Path to use as root for relative paths
    /// @param file_hash SHA256 hash of file contents
    explicit DataSource(std::string source, const std::filesystem::path &root_path,
                        std::optional<std::string> file_hash);

    // Copy and move operations are deliberately left to the compiler (Rule of Zero). The
    // implicit move operations are noexcept, while the implicit copy operations are not:
    // copying the string members may allocate, so promising noexcept for them would turn an
    // allocation failure into std::terminate.

    /// @brief Get the path to a directory containing the data
    /// @details This function will download, extract and validate data as needed
    /// @return Path to directory containing the data
    std::filesystem::path get_data_directory() const;

  private:
    /// @brief File/directory path or URL
    std::string source_;

    /// @brief SHA256 hash of file contents, if one was supplied
    std::optional<std::string> file_hash_;

    /// @brief Whether this source was created with checksum validation
    bool validate_checksum_;
};
} // namespace hgps::input
23 changes: 3 additions & 20 deletions src/HealthGPS.Input/datamanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,9 @@ nlohmann::json read_input_files_from_directory(const std::filesystem::path &root
} // anonymous namespace

namespace hgps::input {
// Initialise the data manager from a directory, a local zip file or a URL to a zip file.
// Zip contents are extracted (or loaded from cache) and the resulting directory becomes the
// root from which the index is read.
DataManager::DataManager(const std::string &path_or_url, VerboseMode verbosity)
    : verbosity_{verbosity} {
    const bool is_remote =
        path_or_url.starts_with("http:") || path_or_url.starts_with("https:");

    if (is_remote) {
        // Download file to temporary folder and extract it
        const auto archive = download_file_to_temporary(path_or_url, ".zip");
        root_ = extract_zip_file_or_load_from_cache(archive);
    } else if (std::filesystem::path local = path_or_url; std::filesystem::is_directory(local)) {
        root_ = std::move(local);
    } else if (std::filesystem::is_regular_file(local) && local.extension() == ".zip") {
        root_ = extract_zip_file_or_load_from_cache(local);
    } else {
        throw std::runtime_error(fmt::format(
            "Path must either point to a zip file or a directory: {}", path_or_url));
    }

    index_ = read_input_files_from_directory(root_);
}
// Initialise the data manager from a path to the directory containing the data. The path is
// stored as the root and the index is read from that directory during member initialisation.
// NOTE(review): index_ is initialised from root_, and members are initialised in declaration
// order rather than initialiser-list order, so this relies on root_ being declared before
// index_ in the class definition — confirm against datamanager.h.
DataManager::DataManager(std::filesystem::path data_path, VerboseMode verbosity)
: root_(std::move(data_path)), verbosity_(verbosity),
index_(read_input_files_from_directory(root_)) {}

std::vector<Country> DataManager::get_countries() const {
auto results = std::vector<Country>();
Expand Down
5 changes: 3 additions & 2 deletions src/HealthGPS.Input/datamanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ class DataManager : public Datastore {
DataManager() = delete;

/// @brief Initialises a new instance of the hgps::input::DataManager class.
/// @param path_or_url The path or URL pointing to the input files.
/// @param data_path The path to the directory containing the data.
/// @param verbosity The terminal logging verbosity mode to use.
/// @throws std::invalid_argument if the root directory or index.json is missing.
/// @throws std::runtime_error for invalid or unsupported index.json file schema version.
explicit DataManager(const std::string &path_or_url, VerboseMode verbosity = VerboseMode::none);
explicit DataManager(std::filesystem::path data_path,
VerboseMode verbosity = VerboseMode::none);

std::vector<Country> get_countries() const override;

Expand Down
10 changes: 0 additions & 10 deletions src/HealthGPS.Input/zip_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,4 @@ void extract_zip_file(const std::filesystem::path &file_path,
}
}
}

// Return the cache folder for a zip file, extracting the file's contents into it first if the
// folder does not exist yet. The cache folder is looked up using the file's SHA256 hash.
std::filesystem::path extract_zip_file_or_load_from_cache(const std::filesystem::path &file_path) {
    auto cache_path = get_zip_cache_directory(compute_sha256_for_file(file_path));

    const bool already_cached = std::filesystem::exists(cache_path);
    if (already_cached) {
        return cache_path;
    }

    extract_zip_file(file_path, cache_path);
    return cache_path;
}
} // namespace hgps::input
5 changes: 0 additions & 5 deletions src/HealthGPS.Input/zip_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,4 @@ std::filesystem::path get_zip_cache_directory(const std::string &file_hash);
/// @param output_directory The path to the output folder
void extract_zip_file(const std::filesystem::path &file_path,
const std::filesystem::path &output_directory);

/// @brief Load the contents of the zip file from cache or extract
/// @param file_path Path to zip file
/// @return The path to the cache folder where the contents were extracted
std::filesystem::path extract_zip_file_or_load_from_cache(const std::filesystem::path &file_path);
} // namespace hgps::input

0 comments on commit c598e10

Please sign in to comment.