From 1f99c93a2a6afea956a683244e71a9a2dde098bc Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 13 Dec 2024 20:30:33 +0000 Subject: [PATCH] [#181] Remove the scripts.main.load_conf() function Instead of using this function to get the config and add attributes to it, we now separately get the config with load_conf_file() and pass attributes to Spark. I've translated some of the tests for load_conf() to tests for load_conf_file(). --- hlink/scripts/main.py | 46 ----- hlink/tests/config_loader_test.py | 47 ++++- hlink/tests/main_test.py | 306 ------------------------------ 3 files changed, 37 insertions(+), 362 deletions(-) delete mode 100644 hlink/tests/main_test.py diff --git a/hlink/scripts/main.py b/hlink/scripts/main.py index 6544c04..cec92d3 100755 --- a/hlink/scripts/main.py +++ b/hlink/scripts/main.py @@ -32,52 +32,6 @@ logger = logging.getLogger(__name__) -def load_conf(conf_name: str, user: str) -> tuple[Path, dict[str, Any]]: - """Load and return the hlink config dictionary. - - Add the following attributes to the config dictionary: - "derby_dir", "warehouse_dir", "spark_tmp_dir", "log_dir", "python", - "conf_path", "run_name" - """ - if "HLINK_CONF" not in os.environ: - global_conf = None - else: - global_conf_file = os.environ["HLINK_CONF"] - with open(global_conf_file) as f: - global_conf = json.load(f) - - run_name = Path(conf_name).stem - - if global_conf is None: - current_dir = Path.cwd() - hlink_dir = current_dir / "hlink_config" - base_derby_dir = hlink_dir / "derby" - base_warehouse_dir = hlink_dir / "warehouse" - base_spark_tmp_dir = hlink_dir / "spark_tmp_dir" - path, conf = load_conf_file(conf_name) - - conf["derby_dir"] = base_derby_dir / run_name - conf["warehouse_dir"] = base_warehouse_dir / run_name - conf["spark_tmp_dir"] = base_spark_tmp_dir / run_name - conf["log_dir"] = hlink_dir / "logs" - conf["python"] = sys.executable - else: - user_dir = Path(global_conf["users_dir"]) / user - user_dir_fast = Path(global_conf["users_dir_fast"]) / user - conf_dir = user_dir / "confs" - conf_path = conf_dir / conf_name - path, conf = load_conf_file(str(conf_path)) - - conf["derby_dir"] = user_dir / "derby" / run_name - conf["warehouse_dir"] = user_dir_fast / "warehouse" / run_name - conf["spark_tmp_dir"] = user_dir_fast / "tmp" / run_name - conf["log_dir"] = user_dir / "logs" - conf["python"] = global_conf["python"] - - conf["run_name"] = run_name - return path, conf - - def cli(): """Called by the hlink script.""" if "--version" in sys.argv: diff --git a/hlink/tests/config_loader_test.py b/hlink/tests/config_loader_test.py index 58c497e..b14e0b4 100644 --- a/hlink/tests/config_loader_test.py +++ b/hlink/tests/config_loader_test.py @@ -3,23 +3,50 @@ # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +from pathlib import Path + +import pytest + from hlink.configs.load_config import load_conf_file -import os.path +from hlink.errors import UsageError -def test_load_conf_file_json(conf_dir_path): - conf_file = os.path.join(conf_dir_path, "test") - _path, conf = load_conf_file(conf_file) +@pytest.mark.parametrize("file_name", ["test", "test.json"]) +def test_load_conf_file_json(conf_dir_path: str, file_name: str) -> None: + conf_file = Path(conf_dir_path) / file_name + path, conf = load_conf_file(str(conf_file)) assert conf["id_column"] == "id" + assert path == conf_file.with_suffix(".json") -def test_load_conf_file_toml(conf_dir_path): - conf_file = os.path.join(conf_dir_path, "test1") - _path, conf = load_conf_file(conf_file) +@pytest.mark.parametrize("file_name", ["test1", "test1.toml"]) +def test_load_conf_file_toml(conf_dir_path: str, file_name: str) -> None: + conf_file = Path(conf_dir_path) / file_name + path, conf = load_conf_file(str(conf_file)) assert conf["id_column"] == "id-toml" + assert path == conf_file.with_suffix(".toml") -def test_load_conf_file_json2(conf_dir_path): - conf_file = os.path.join(conf_dir_path, "test_conf_flag_run") - _path, conf = load_conf_file(conf_file) +def test_load_conf_file_json2(conf_dir_path: str) -> None: + conf_file = Path(conf_dir_path) / "test_conf_flag_run" + path, conf = load_conf_file(str(conf_file)) assert conf["id_column"] == "id_conf_flag" + assert path == conf_file.with_suffix(".json") + + +def test_load_conf_file_does_not_exist(tmp_path: Path) -> None: + conf_file = tmp_path / "notthere" + with pytest.raises( + FileNotFoundError, match="Couldn't find any of these three files:" + ): + load_conf_file(str(conf_file)) + + +def test_load_conf_file_unrecognized_extension(tmp_path: Path) -> None: + conf_file = tmp_path / "test.yaml" + conf_file.touch() + with pytest.raises( + UsageError, + match="The file .+ exists, but it doesn't have a '.toml' or '.json' extension", + ): + load_conf_file(str(conf_file)) diff --git a/hlink/tests/main_test.py b/hlink/tests/main_test.py deleted file mode 100644 index c236a3f..0000000 --- a/hlink/tests/main_test.py +++ /dev/null @@ -1,306 +0,0 @@ -# This file is part of the ISRDI's hlink. -# For copyright and licensing information, see the NOTICE and LICENSE files -# in this project's top-level directory, and also on-line at: -# https://github.com/ipums/hlink - -import pytest -import json -import toml -from pathlib import Path - -from hlink.scripts.main import load_conf -from hlink.errors import UsageError - -users = ("jesse", "woody") - - -@pytest.fixture() -def global_conf(tmp_path): - """The contents of the test global config as a dictionary.""" - global_conf = {} - global_conf["users_dir"] = str(tmp_path / "users_dir") - global_conf["users_dir_fast"] = str(tmp_path / "users_dir_fast") - global_conf["python"] = "python" - - return global_conf - - -@pytest.fixture() -def set_up_global_conf_file(monkeypatch, tmp_path, global_conf): - """Create the global config file and set the HLINK_CONF environment variable. - - The contents of the global config file are the same as the `global_conf` fixture - dictionary. - """ - file = tmp_path / "global_config_file.json" - - with open(file, "w") as f: - json.dump(global_conf, f) - - monkeypatch.setenv("HLINK_CONF", str(file)) - - -def get_conf_dir(global_conf, user): - """Given the global config and user, return the path to the user's config directory.""" - return Path(global_conf["users_dir"]) / user / "confs" - - -@pytest.mark.parametrize("conf_file", ("my_conf", "my_conf.toml", "my_conf.json")) -@pytest.mark.parametrize("user", users) -def test_load_conf_does_not_exist_no_env(monkeypatch, tmp_path, conf_file, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - - filename = str(tmp_path / conf_file) - toml_filename = filename + ".toml" - json_filename = filename + ".json" - - error_msg = f"Couldn't find any of these three files: {filename}, {toml_filename}, {json_filename}" - with pytest.raises(FileNotFoundError, match=error_msg): - load_conf(filename, user) - - -@pytest.mark.parametrize("conf_file", ("my_conf.json",)) -@pytest.mark.parametrize("user", users) -def test_load_conf_json_exists_no_env(monkeypatch, tmp_path, conf_file, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - filename = str(tmp_path / conf_file) - - contents = {} - with open(filename, "w") as f: - json.dump(contents, f) - - path, _conf = load_conf(filename, user) - assert str(path) == filename - - -@pytest.mark.parametrize("conf_name", ("my_conf", "my_conf.json", "my_conf.toml")) -@pytest.mark.parametrize("user", users) -def test_load_conf_json_exists_ext_added_no_env(monkeypatch, tmp_path, conf_name, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - filename = str(tmp_path / conf_name) + ".json" - - contents = {} - with open(filename, "w") as f: - json.dump(contents, f) - - path, _conf = load_conf(str(tmp_path / conf_name), user) - assert str(path) == filename - - -@pytest.mark.parametrize("conf_file", ("my_conf.toml",)) -@pytest.mark.parametrize("user", users) -def test_load_conf_toml_exists_no_env(monkeypatch, tmp_path, conf_file, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - filename = str(tmp_path / conf_file) - - contents = {} - with open(filename, "w") as f: - toml.dump(contents, f) - - path, _conf = load_conf(filename, user) - assert str(path) == filename - - -@pytest.mark.parametrize("conf_name", ("my_conf", "my_conf.json", "my_conf.toml")) -@pytest.mark.parametrize("user", users) -def test_load_conf_toml_exists_ext_added_no_env(monkeypatch, tmp_path, conf_name, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - filename = str(tmp_path / conf_name) + ".toml" - - contents = {} - with open(filename, "w") as f: - toml.dump(contents, f) - - path, _conf = load_conf(str(tmp_path / conf_name), user) - assert str(path) == filename - - -@pytest.mark.parametrize("conf_name", ("my_conf", "testing.txt", "what.yaml")) -@pytest.mark.parametrize("user", users) -def test_load_conf_unrecognized_ext_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user -): - monkeypatch.chdir(tmp_path) - - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - file = conf_dir / conf_name - file.touch() - - error_msg = ( - f"The file {file} exists, but it doesn't have a '.toml' or '.json' extension." - ) - with pytest.raises(UsageError, match=error_msg): - load_conf(str(file), user) - - -def test_load_conf_keys_set_no_env(monkeypatch, tmp_path): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - filename = str(tmp_path / "keys_test.json") - contents = {"key1": "value1", "rock": "stone", "how": "about that"} - - with open(filename, "w") as f: - json.dump(contents, f) - - _path, conf = load_conf(filename, "test") - - for key, value in contents.items(): - assert conf[key] == value - - # Check for extra keys added by load_conf() - assert "derby_dir" in conf - assert "warehouse_dir" in conf - assert "spark_tmp_dir" in conf - assert "log_dir" in conf - assert "python" in conf - - -@pytest.mark.parametrize("global_conf", ("my_global_conf.json", "test.json")) -def test_load_conf_global_conf_does_not_exist_env(monkeypatch, tmp_path, global_conf): - global_path = str(tmp_path / global_conf) - monkeypatch.setenv("HLINK_CONF", global_path) - - with pytest.raises(FileNotFoundError): - load_conf("notthere.toml", "test") - - -@pytest.mark.parametrize("conf_file", ("my_conf", "my_conf.json", "my_conf.toml")) -@pytest.mark.parametrize("user", users) -def test_load_conf_does_not_exist_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user -): - monkeypatch.chdir(tmp_path) - - conf_dir = get_conf_dir(global_conf, user) - filename = str(conf_dir / conf_file) - toml_filename = filename + ".toml" - json_filename = filename + ".json" - - error_msg = f"Couldn't find any of these three files: {filename}, {toml_filename}, {json_filename}" - with pytest.raises(FileNotFoundError, match=error_msg): - load_conf(conf_file, user) - - -@pytest.mark.parametrize("conf_file", ("my_conf.json",)) -@pytest.mark.parametrize("user", users) -def test_load_conf_json_exists_in_conf_dir_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user -): - monkeypatch.chdir(tmp_path) - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - - file = conf_dir / conf_file - contents = {} - - with open(file, "w") as f: - json.dump(contents, f) - - path, _conf = load_conf(conf_file, user) - assert path == file - - -@pytest.mark.parametrize("conf_file", ("my_conf.toml",)) -@pytest.mark.parametrize("user", users) -def test_load_conf_toml_exists_in_conf_dir_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_file, user -): - monkeypatch.chdir(tmp_path) - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - - file = conf_dir / conf_file - contents = {} - - with open(file, "w") as f: - toml.dump(contents, f) - - path, _conf = load_conf(conf_file, user) - assert path == file - - -@pytest.mark.parametrize("conf_name", ("my_conf", "test", "testingtesting123.txt")) -@pytest.mark.parametrize("user", users) -def test_load_conf_json_exists_in_conf_dir_ext_added_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user -): - monkeypatch.chdir(tmp_path) - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - - conf_file = conf_name + ".json" - file = conf_dir / conf_file - contents = {} - - with open(file, "w") as f: - json.dump(contents, f) - - path, _conf = load_conf(conf_name, user) - assert path == file - - -@pytest.mark.parametrize("conf_name", ("my_conf", "test", "testingtesting123.txt")) -@pytest.mark.parametrize("user", users) -def test_load_conf_toml_exists_in_conf_dir_ext_added_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf, conf_name, user -): - monkeypatch.chdir(tmp_path) - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - - conf_file = conf_name + ".toml" - file = conf_dir / conf_file - contents = {} - - with open(file, "w") as f: - toml.dump(contents, f) - - path, _conf = load_conf(conf_name, user) - assert path == file - - -@pytest.mark.parametrize("conf_name", ("my_conf", "testing.txt", "what.yaml")) -@pytest.mark.parametrize("user", users) -def test_load_conf_unrecognized_ext_no_env(monkeypatch, tmp_path, conf_name, user): - monkeypatch.delenv("HLINK_CONF", raising=False) - monkeypatch.chdir(tmp_path) - - file = tmp_path / conf_name - file.touch() - - error_msg = f"The file {conf_name} exists, but it doesn't have a '.toml' or '.json' extension." - with pytest.raises(UsageError, match=error_msg): - load_conf(conf_name, user) - - -def test_load_conf_keys_set_env( - monkeypatch, tmp_path, set_up_global_conf_file, global_conf -): - monkeypatch.chdir(tmp_path) - user = "test" - conf_dir = get_conf_dir(global_conf, user) - conf_dir.mkdir(parents=True) - file = conf_dir / "keys_test.json" - filename = str(file) - - contents = {"key1": "value1", "rock": "stone", "how": "about that"} - - with open(file, "w") as f: - json.dump(contents, f) - - _path, conf = load_conf(filename, user) - - for key, value in contents.items(): - assert conf[key] == value - - # Check for extra keys added by load_conf() - assert "derby_dir" in conf - assert "warehouse_dir" in conf - assert "spark_tmp_dir" in conf - assert "log_dir" in conf - assert "python" in conf