From 4e9fcf7d7a0d0abafc15079adff86d0cce0972a6 Mon Sep 17 00:00:00 2001 From: Tom Reitz Date: Wed, 10 May 2023 16:54:25 -0500 Subject: [PATCH] several bugfixes - state_file was being ignored, more macros issues, and charset issues when writing output --- earthmover/__main__.py | 4 ++-- earthmover/earthmover.py | 25 +++++++++++------------ earthmover/nodes/destination.py | 4 ++-- earthmover/operations/column.py | 2 +- earthmover/runs_file.py | 6 +++--- example_projects/10_jinja/earthmover.yaml | 7 ++++--- example_projects/run_all.sh | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/earthmover/__main__.py b/earthmover/__main__.py index 3f713d19..846482a0 100644 --- a/earthmover/__main__.py +++ b/earthmover/__main__.py @@ -101,7 +101,7 @@ def main(argv=None): if args.version: em_dir = os.path.dirname(os.path.abspath(__file__)) version_file = os.path.join(em_dir, 'VERSION.txt') - with open(version_file, 'r') as f: + with open(version_file, 'r', encoding='utf-8') as f: VERSION = f.read().strip() print(f"earthmover, version {VERSION}") exit(0) @@ -154,7 +154,7 @@ def main(argv=None): cli_state_configs=cli_state_configs ) except Exception as err: - logger.exception(err, exc_info=False) + logger.exception(err, exc_info=True) raise # Avoids linting error if args.command == 'compile': diff --git a/earthmover/earthmover.py b/earthmover/earthmover.py index 2bdde4eb..1f8efdb6 100644 --- a/earthmover/earthmover.py +++ b/earthmover/earthmover.py @@ -68,6 +68,8 @@ def __init__(self, 'log_level': _state_configs['log_level'].upper(), 'show_stacktrace': _state_configs['show_stacktrace'], } + if 'state_file' in _state_configs.keys(): + self.state_configs.update({'state_file': _state_configs['state_file']}) # Set up the logger self.logger = logger @@ -98,7 +100,7 @@ def load_config_file(self) -> dict: """ # pass 1: grab config.macros (if any) so Jinja in the YAML can be rendered with macros - with open(self.config_file, "r") as stream: + with open(self.config_file, "r", encoding='utf-8') as stream: # cannot just yaml.load() here, since Jinja in the YAML may make it invalid... # instead, pull out just the `config` section, which must not contain Jinja (except for `macros`) # then we yaml.load() just the config section to grab any `macros` @@ -121,14 +123,14 @@ def load_config_file(self) -> dict: # Read the configs block and extract the (optional) macros field. if start is not None and end is not None: configs_pass1 = yaml.safe_load("".join(lines[start:end])) - self.macros = configs_pass1.get("config", {}).get("macros", "").strip() + self.macros = configs_pass1.get("config", {}).get("macros", "") else: configs_pass1 = {} # Figure out lines range of macro definitions, to skip (re)reading/parsing them later self.macros_lines = self.macros.count("\n") - macros_definitions = [i for i, x in enumerate(lines) if x.strip().startswith('macros:')] - + self.macros = self.macros.strip() + # pass 2: # (a) load template YAML minus macros (which were already loaded in pass 1) @@ -137,10 +139,7 @@ def load_config_file(self) -> dict: # (d) load YAML to config Dict # (a) - if len(macros_definitions)>0: - self.config_template_string = "".join(lines[:macros_definitions[0]] + lines[macros_definitions[0] + self.macros_lines + 2:]) - else: - self.config_template_string = "".join(lines) + self.config_template_string = "".join(lines) # (b) _env_backup = os.environ.copy() # backup envvars @@ -166,7 +165,7 @@ def load_config_file(self) -> dict: try: self.config_template = jinja2.Environment( loader=jinja2.FileSystemLoader(os.path.dirname('./')) - ).from_string(self.macros + self.config_template_string) + ).from_string(self.macros + "\n\n" + self.config_template_string) self.config_template.globals['md5'] = util.jinja_md5 self.config_yaml = self.config_template.render() @@ -340,7 +339,7 @@ def generate(self, selector): ### Hashing requires an entire class mixin and multiple additional steps. - if not self.skip_hashing and 'state_file' in self.state_configs: + if not self.skip_hashing and self.state_configs.get('state_file', False): _runs_path = os.path.expanduser(self.state_configs['state_file']) self.logger.info(f"computing input hashes for run log at {_runs_path}") @@ -379,7 +378,7 @@ def generate(self, selector): ) self.do_generate = False - elif 'state_file' not in self.state_configs: + elif not self.state_configs.get('state_file', False): self.logger.info("skipping hashing and run-logging (no `state_file` defined in config)") runs_file = None # This instantiation will never be used, but this avoids linter alerts. @@ -452,12 +451,12 @@ def test(self, tests_dir): # load expected and outputted content as dataframes, and sort them # because dask may shuffle output order _expected_file = os.path.join(tests_dir, 'expected', filename) - with open(_expected_file, "r") as f: + with open(_expected_file, "r", encoding='utf-8') as f: _expected_df = pd.DataFrame([l.strip() for l in f.readlines()]) _expected_df = _expected_df.sort_values(by=_expected_df.columns.tolist()).reset_index(drop=True) _outputted_file = os.path.join(tests_dir, 'outputs', filename) - with open(_outputted_file, "r") as f: + with open(_outputted_file, "r", encoding='utf-8') as f: _outputted_df = pd.DataFrame([l.strip() for l in f.readlines()]) _outputted_df = _outputted_df.sort_values(by=_outputted_df.columns.tolist()).reset_index(drop=True) diff --git a/earthmover/nodes/destination.py b/earthmover/nodes/destination.py index e75d8ed3..48d416ad 100644 --- a/earthmover/nodes/destination.py +++ b/earthmover/nodes/destination.py @@ -90,7 +90,7 @@ def compile(self): # try: - with open(self.template, 'r') as fp: + with open(self.template, 'r', encoding='utf-8') as fp: template_string = fp.read() except Exception as err: @@ -134,7 +134,7 @@ def execute(self): self.data = self.data.fillna('') os.makedirs(os.path.dirname(self.file), exist_ok=True) - with open(self.file, 'w') as fp: + with open(self.file, 'w', encoding='utf-8') as fp: if self.header: fp.write(self.header + "\n") diff --git a/earthmover/operations/column.py b/earthmover/operations/column.py index 7fc59ec6..ba8915dd 100644 --- a/earthmover/operations/column.py +++ b/earthmover/operations/column.py @@ -492,7 +492,7 @@ def _read_map_file(self, file) -> dict: try: - with open(file, 'r') as fp: + with open(file, 'r', encoding='utf-8') as fp: _translations_list = list(csv.reader(fp, delimiter=sep)) return dict(_translations_list[1:]) diff --git a/earthmover/runs_file.py b/earthmover/runs_file.py index 57b2c368..59d8d933 100644 --- a/earthmover/runs_file.py +++ b/earthmover/runs_file.py @@ -58,7 +58,7 @@ def write_row(self, selector: Optional[str] = None): if selector: row_dict['selector'] = selector - with open(self.file, 'a') as fp: + with open(self.file, 'a', encoding='utf-8') as fp: writer = csv.DictWriter(fp, fieldnames=self.HEADER) writer.writerow(row_dict) @@ -205,7 +205,7 @@ def _write_header(self): :return: """ - with open(self.file, 'x') as fp: + with open(self.file, 'x', encoding='utf-8') as fp: writer = csv.writer(fp) writer.writerow(self.HEADER) @@ -215,7 +215,7 @@ def _read_runs(self): :return: """ - with open(self.file, 'r') as fp: + with open(self.file, 'r', encoding='utf-8') as fp: runs = list(csv.DictReader(fp, delimiter=',')) # Raise a warning for the user to manually reset or select a new log-runs file. diff --git a/example_projects/10_jinja/earthmover.yaml b/example_projects/10_jinja/earthmover.yaml index 2fdb2d8b..c3e13ab4 100644 --- a/example_projects/10_jinja/earthmover.yaml +++ b/example_projects/10_jinja/earthmover.yaml @@ -4,9 +4,9 @@ config: # show_graph: True # show_stacktrace: True macros: > - {% macro test() %} + {% macro test() -%} testing! - {% endmacro %} + {%- endmacro %} parameter_defaults: DO_LINEARIZE: "True" @@ -20,6 +20,7 @@ sources: transformations: {% for i in range(0,5) %} + # {{ test() }} actions{{i}}: operations: - operation: map_values @@ -63,4 +64,4 @@ destinations: template: ./templates/disciplineAction.jsont extension: jsonl linearize: True - {% endfor%} + {% endfor%} \ No newline at end of file diff --git a/example_projects/run_all.sh b/example_projects/run_all.sh index c8a0214d..a21e921b 100644 --- a/example_projects/run_all.sh +++ b/example_projects/run_all.sh @@ -56,8 +56,8 @@ earthmover rm -f output/* echo " ... done!" -echo " running 10_simple..." -cd ../10_simple/ +echo " running 10_jinja..." +cd ../10_jinja/ earthmover rm -rf outputs/* echo " ... done!"