From 8851e481c67f86b36795add0aef967068298cd79 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 23 Jan 2025 08:23:28 +0100 Subject: [PATCH 01/27] Move dumping test fixtures to `conftest.py` --- tests/conftest.py | 84 +++++++++++++++++++++++++++ tests/tools/dumping/test_processes.py | 78 ------------------------- 2 files changed, 84 insertions(+), 78 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 89b0a1bad7..5aa0ef3b89 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -951,3 +951,87 @@ def cat_path() -> Path: run_process = subprocess.run(['which', 'cat'], capture_output=True, check=True) path = run_process.stdout.decode('utf-8').strip() return Path(path) + + +@pytest.fixture +def generate_calculation_node_io(generate_calculation_node, tmp_path): + def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): + import io + + import numpy as np + + from aiida.orm import ArrayData, FolderData, SinglefileData + + filename = 'file.txt' + filecontent = 'a' + singlefiledata_linklabel = 'singlefile' + folderdata_linklabel = 'folderdata' + folderdata_relpath = Path('relative_path') + arraydata_linklabel = 'arraydata' + + singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) + # ? Use instance for folderdata + folderdata = FolderData() + folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=str(folderdata_relpath / filename)) # type: ignore[arg-type] + arraydata_input = ArrayData(arrays=np.ones(3)) + + # Create calculation inputs, outputs + calculation_node_inputs = { + singlefiledata_linklabel: singlefiledata_input, + folderdata_linklabel: folderdata, + arraydata_linklabel: arraydata_input, + } + + singlefiledata_output = singlefiledata_input.clone() + folderdata_output = folderdata.clone() + + if attach_outputs: + calculation_outputs = { + folderdata_linklabel: folderdata_output, + singlefiledata_linklabel: singlefiledata_output, + } + else: + calculation_outputs = None + + # Actually write repository file and then read it in when generating calculation_node + (tmp_path / filename).write_text(filecontent) + + calculation_node = generate_calculation_node( + repository=tmp_path, + inputs=calculation_node_inputs, + outputs=calculation_outputs, + entry_point=entry_point, + ) + return calculation_node + + return _generate_calculation_node_io + + +@pytest.fixture +def generate_workchain_node_io(): + def _generate_workchain_node_io(cj_nodes, store_all: bool = True): + """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `Calculation` with file io.""" + from aiida.orm import WorkflowNode + + wc_node = WorkflowNode() + wc_node_sub = WorkflowNode() + + # Add sub-workchain that calls a calculation + wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workflow') + for cj_node in cj_nodes: + cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') + + # Set process_state so that tests don't throw exception for build_call_graph of README generation + [cj_node.set_process_state('finished') for cj_node in cj_nodes] + wc_node.set_process_state('finished') + wc_node_sub.set_process_state('finished') + + # Need to store so that outputs are being dumped + if store_all: + wc_node.store() + wc_node_sub.store() + [cj_node.store() for cj_node in cj_nodes] + + return wc_node + + return _generate_workchain_node_io diff --git a/tests/tools/dumping/test_processes.py 
b/tests/tools/dumping/test_processes.py index accfbd17d2..88dad0323e 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -10,13 +10,11 @@ from __future__ import annotations -import io import shutil from pathlib import Path import pytest -from aiida.common.links import LinkType from aiida.tools.dumping.processes import ProcessDumper # Non-AiiDA variables @@ -38,82 +36,6 @@ node_metadata_file = '.aiida_node_metadata.yaml' -# Helper functions to generate the actual `WorkflowNode`s and `CalculationNode`s used for testing -@pytest.fixture -def generate_calculation_node_io(generate_calculation_node, tmp_path): - def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): - import numpy as np - - from aiida.orm import ArrayData, FolderData, SinglefileData - - singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) - # ? Use instance for folderdata - folderdata = FolderData() - folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=str(folderdata_relpath / filename)) # type: ignore[arg-type] - arraydata_input = ArrayData(arrays=np.ones(3)) - - # Create calculation inputs, outputs - calculation_node_inputs = { - singlefiledata_linklabel: singlefiledata_input, - folderdata_linklabel: folderdata, - arraydata_linklabel: arraydata_input, - } - - singlefiledata_output = singlefiledata_input.clone() - folderdata_output = folderdata.clone() - - if attach_outputs: - calculation_outputs = { - folderdata_linklabel: folderdata_output, - singlefiledata_linklabel: singlefiledata_output, - } - else: - calculation_outputs = None - - # Actually write repository file and then read it in when generating calculation_node - (tmp_path / filename).write_text(filecontent) - - calculation_node = generate_calculation_node( - repository=tmp_path, - inputs=calculation_node_inputs, - outputs=calculation_outputs, - entry_point=entry_point, - ) - return calculation_node - - return _generate_calculation_node_io - - -@pytest.fixture -def generate_workchain_node_io(): - def _generate_workchain_node_io(cj_nodes, store_all: bool = True): - """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `Calculation` with file io.""" - from aiida.orm import WorkflowNode - - wc_node = WorkflowNode() - wc_node_sub = WorkflowNode() - - # Add sub-workchain that calls a calculation - wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workflow') - for cj_node in cj_nodes: - cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') - - # Set process_state so that tests don't throw exception for build_call_graph of README generation - [cj_node.set_process_state('finished') for cj_node in cj_nodes] - wc_node.set_process_state('finished') - wc_node_sub.set_process_state('finished') - - # Need to store so that outputs are being dumped - if store_all: - wc_node.store() - wc_node_sub.store() - [cj_node.store() for cj_node in cj_nodes] - - return wc_node - - return _generate_workchain_node_io - - # Only test top-level actions, like path and README creation # Other things tested via `_dump_workflow` and `_dump_calculation` def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): From 98ea050a929a42df265a67db8857d860c976a391 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 23 Jan 2025 09:52:56 +0100 Subject: [PATCH 02/27] First working version with `DataDumper` and `CollectionDumper` Take 
it from here --- src/aiida/cmdline/commands/cmd_process.py | 124 ++++--- src/aiida/cmdline/commands/cmd_profile.py | 282 +++++++++++++++ src/aiida/cmdline/params/options/main.py | 277 ++++++++++++++ src/aiida/tools/dumping/__init__.py | 6 +- src/aiida/tools/dumping/collection.py | 338 ++++++++++++++++++ src/aiida/tools/dumping/data.py | 292 +++++++++++++++ src/aiida/tools/dumping/parser.py | 56 +++ src/aiida/tools/dumping/processes.py | 280 ++++++++++++--- src/aiida/tools/dumping/rich.py | 86 +++++ src/aiida/tools/dumping/test-config-file.yaml | 23 ++ src/aiida/tools/dumping/utils.py | 94 +++++ tests/tools/dumping/test_processes.py | 1 + 12 files changed, 1756 insertions(+), 103 deletions(-) create mode 100644 src/aiida/tools/dumping/collection.py create mode 100644 src/aiida/tools/dumping/data.py create mode 100644 src/aiida/tools/dumping/parser.py create mode 100644 src/aiida/tools/dumping/rich.py create mode 100644 src/aiida/tools/dumping/test-config-file.yaml diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 5ad7c5d53c..ed0b3ccdd0 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -558,42 +558,19 @@ def process_repair(manager, broker, dry_run): echo.echo_report(f'Revived process `{pid}`') -@verdi_process.command('dump') +@verdi_process.command("dump") @arguments.PROCESS() @options.PATH() @options.OVERWRITE() -@click.option( - '--include-inputs/--exclude-inputs', - default=True, - show_default=True, - help='Include the linked input nodes of the `CalculationNode`(s).', -) -@click.option( - '--include-outputs/--exclude-outputs', - default=False, - show_default=True, - help='Include the linked output nodes of the `CalculationNode`(s).', -) -@click.option( - '--include-attributes/--exclude-attributes', - default=True, - show_default=True, - help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) -@click.option( - '--include-extras/--exclude-extras', - default=True, - show_default=True, - help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) -@click.option( - '-f', - '--flat', - is_flag=True, - default=False, - show_default=True, - help='Dump files in a flat directory for every step of the workflow.', -) +@options.FLAT() +@options.INCLUDE_INPUTS() +@options.INCLUDE_OUTPUTS() +@options.INCLUDE_ATTRIBUTES() +@options.INCLUDE_EXTRAS() +@options.ALSO_RAW() +@options.ALSO_RICH() +@options.RICH_SPEC() +@options.RICH_DUMP_ALL() @click.option( '--dump-unsealed', is_flag=True, @@ -602,17 +579,23 @@ def process_repair(manager, broker, dry_run): help='Also allow the dumping of unsealed process nodes.', ) @options.INCREMENTAL() +# TODO: Also add CONFIG_FILE option here +# TODO: Currently, setting rich options is not supported here directly def process_dump( process, path, overwrite, + flat, include_inputs, include_outputs, include_attributes, include_extras, - flat, dump_unsealed, incremental, + also_raw, + also_rich, + rich_spec, + rich_dump_all, ) -> None: """Dump process input and output files to disk. @@ -630,29 +613,74 @@ def process_dump( node data for further inspection. 
""" - from aiida.tools.archive.exceptions import ExportValidationError + from aiida.tools.dumping.data import DataDumper from aiida.tools.dumping.processes import ProcessDumper + from aiida.tools.archive.exceptions import ExportValidationError + + # from aiida.tools.dumping.utils import validate_rich_options + from aiida.tools.dumping.rich import rich_from_cli + + processdumper_kwargs = { + "include_inputs": include_inputs, + "include_outputs": include_outputs, + "include_attributes": include_attributes, + "include_extras": include_extras, + "flat": flat, + "dump_unsealed": dump_unsealed, + "incremental": incremental, + } + + rich_kwargs = { + "rich_dump_all": rich_dump_all, + } + + datadumper_kwargs = { + "also_raw": also_raw, + "also_rich": also_rich, + } + + # if also_rich: + # try: + # validate_rich_options( + # rich_options=rich_options, rich_config_file=rich_config_file + # ) + # except ValueError as exc: + # echo.echo_critical(f"{exc!s}") + + if rich_spec is not None: + rich_spec_dict = rich_from_cli(rich_spec=rich_spec, **rich_kwargs) + else: + rich_spec_dict = {} + + data_dumper = DataDumper( + overwrite=overwrite, + rich_spec_dict=rich_spec_dict, + **datadumper_kwargs, + **rich_kwargs, + ) process_dumper = ProcessDumper( - include_inputs=include_inputs, - include_outputs=include_outputs, - include_attributes=include_attributes, - include_extras=include_extras, overwrite=overwrite, - flat=flat, - dump_unsealed=dump_unsealed, - incremental=incremental, + **processdumper_kwargs, + **rich_kwargs, + data_dumper=data_dumper, ) try: - dump_path = process_dumper.dump(process_node=process, output_path=path) + dump_path = process_dumper.dump( + process_node=process, + output_path=path, + ) + echo.echo_success( + f"Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`." + ) except FileExistsError: echo.echo_critical( - 'Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually.' + "Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually." ) except ExportValidationError as e: - echo.echo_critical(f'{e!s}') + echo.echo_critical(f"{e!s}") except Exception as e: - echo.echo_critical(f'Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s}).') - - echo.echo_success(f'Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`.') + echo.echo_critical( + f"Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s})." + ) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 7cb0e018ae..f035977f2c 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -19,6 +19,7 @@ from aiida.cmdline.utils import defaults, echo from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config +from aiida.tools.dumping import CollectionDumper, DataDumper, ProcessDumper @verdi.group('profile') @@ -269,3 +270,284 @@ def profile_delete(force, delete_data, profiles): get_config().delete_profile(profile.name, delete_storage=delete_data) echo.echo_success(f'Profile `{profile.name}` was deleted.') + + +# ? Specify groups via giving the groups, or just enabling "groups" and then all are dumped? +# ? Provide some mechanism to allow for both, e.g. 
if no argument is provided, all groups are dumped +@verdi_profile.command('mirror') +@options.PATH() +@options.OVERWRITE() +@options.INCREMENTAL() +@options.ORGANIZE_BY_GROUPS() +@options.DRY_RUN() +@options.DUMP_PROCESSES() +@options.ONLY_TOP_LEVEL_WORKFLOWS() +@options.DUMP_DATA() +@options.DEDUPLICATE() +@options.DATA_HIDDEN() +@options.ALSO_RAW() +@options.ALSO_RICH() +@options.INCLUDE_INPUTS() +@options.INCLUDE_OUTPUTS() +@options.INCLUDE_ATTRIBUTES() +@options.INCLUDE_EXTRAS() +@options.FLAT() +@options.RICH_SPEC() +@options.RICH_DUMP_ALL() +@options.DUMP_CONFIG_FILE() +@options.NODES() +@options.GROUPS() +@click.pass_context +def storage_mirror( + ctx, + path, + overwrite, + incremental, + organize_by_groups, + dry_run, + dump_processes, + only_top_level_workflows, + dump_data, + deduplicate, + data_hidden, + also_raw, + also_rich, + include_inputs, + include_outputs, + include_attributes, + include_extras, + flat, + rich_spec, + rich_dump_all, + dump_config_file, + nodes, + groups, +): + """Dump all data in an AiiDA profile's storage to disk.""" + + + from aiida import orm + from aiida.tools.dumping.parser import DumpConfigParser + from aiida.tools.dumping.rich import ( + DEFAULT_CORE_EXPORT_MAPPING, + rich_from_cli, + rich_from_config, + ) + from aiida.tools.dumping.utils import prepare_dump_path + + profile = ctx.obj['profile'] + + # from aiida.manage.manager import get_manager + + # manager = get_manager() + # storage = manager.get_profile_storage() + + # with spinner(): + # data = storage.get_info(detailed=True) + + # echo.echo_dictionary(data, sort_keys=False, fmt='yaml') + + # print(f"Profile name: {profile_name}") + + # # TODO: export computers alone, and groups + # t1 = time.time() + # qb = orm.QueryBuilder().append(orm.Node, tag='node', project=['uuid']) + # all_uuids = qb.all(flat=True) + # print(f"All UUIDs retrieved in {time.time() - t1:6.3f} s.") + + # t1 = time.time() + # with open('all-source-uuids.json', 'w') as fhandle: + # json.dump({'profile_name': profile_name, 'uuids': all_uuids}, fhandle) + # print(f"{len(all_uuids)} UUIDs written in {time.time() - t1:6.3f} s.") + + if nodes and groups: + echo.echo_critical('`nodes` and `groups` specified. Set only one.') + # if all_entries and groups: + # echo.echo_critical('`all_entries` and `groups` specified. Set only one.') + + if dump_config_file is None: + general_kwargs = { + 'path': path, + 'overwrite': overwrite, + 'incremental': incremental, + 'dry_run': dry_run, + } + + processdumper_kwargs = { + 'include_inputs': include_inputs, + 'include_outputs': include_outputs, + 'include_attributes': include_attributes, + 'include_extras': include_extras, + 'flat': flat, + # "calculations_hidden": calculations_hidden + } + + datadumper_kwargs = { + 'also_raw': also_raw, + 'also_rich': also_rich, + 'data_hidden': data_hidden, + } + + collection_kwargs = { + 'should_dump_processes': dump_processes, + 'should_dump_data': dump_data, + 'only_top_level_workflows': only_top_level_workflows, + } + + rich_kwargs = { + 'rich_dump_all': rich_dump_all, + } + + if rich_spec is not None: + rich_spec_dict = rich_from_cli(rich_spec=rich_spec, **rich_kwargs) + else: + rich_spec_dict = DEFAULT_CORE_EXPORT_MAPPING + + # TODO: Also allow for mixing. 
Currently one can _only_ specify either the config file, or the arguments on the + # TODO: command line + else: + kwarg_dicts_from_config = DumpConfigParser.parse_config_file(dump_config_file) + + general_kwargs = kwarg_dicts_from_config['general_kwargs'] + processdumper_kwargs = kwarg_dicts_from_config['processdumper_kwargs'] + datadumper_kwargs = kwarg_dicts_from_config['datadumper_kwargs'] + collection_kwargs = kwarg_dicts_from_config['collection_kwargs'] + rich_kwargs = kwarg_dicts_from_config['rich_kwargs'] + + rich_spec_dict = rich_from_config(kwarg_dicts_from_config['rich_spec'], **rich_kwargs) + + # Obtain these specifically for easy access and modifications + path = general_kwargs['path'] + overwrite = general_kwargs['overwrite'] + dry_run = general_kwargs['dry_run'] + incremental = general_kwargs['incremental'] + + if not overwrite and incremental: + echo.echo_report('Overwrite set to false, but incremental dumping selected. Will keep existing directories.') + + if not str(path).endswith(profile.name): + path /= profile.name + + # TODO: Implement proper dry-run feature + dry_run_message = f"Dry run for dumping of profile `{profile.name}`'s data at path: `{path}`.\n" + dry_run_message += 'Only directories will be created.' + + if dry_run or (not collection_kwargs['should_dump_processes'] and not collection_kwargs['should_dump_data']): + echo.echo_report(dry_run_message) + return + + else: + echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") + + SAFEGUARD_FILE = '.verdi_storage_dump' # noqa: N806 + + try: + prepare_dump_path( + path_to_validate=path, + overwrite=overwrite, + incremental=incremental, + safeguard_file=SAFEGUARD_FILE, + ) + except FileExistsError as exc: + echo.echo_critical(str(exc)) + + (path / SAFEGUARD_FILE).touch() + + data_dumper = DataDumper( + dump_parent_path=path, + overwrite=overwrite, + incremental=incremental, + rich_spec_dict=rich_spec_dict, + **datadumper_kwargs, + ) + # dumper_pretty_print(data_dumper) + + process_dumper = ProcessDumper( + dump_parent_path=path, + overwrite=overwrite, + incremental=incremental, + data_dumper=data_dumper, + **processdumper_kwargs, + ) + # dumper_pretty_print(process_dumper) + + from aiida.tools.dumping.incremental import DumpNodeCollector + + dumpnodecollector = DumpNodeCollector(dump_parent_path=path) + + dumpnodecollector.update_uuids_before_dump() + dumpnodecollector.create_organized_uuid_dicts() + # dumpnodecollector.populate_uuid_dict() + + # raise SystemExit() + + # TODO: Possibly implement specifying specific computers + # TODO: Although, users could just specify the relevant nodes + # TODO: Also add option to specify node types via entry points + + # === Dump the data that is not associated with any group === + if not groups: + collection_dumper = CollectionDumper( + dump_parent_path=path, + output_path=path, + overwrite=overwrite, + incremental=incremental, + nodes=nodes, + **collection_kwargs, + **rich_kwargs, + data_dumper=data_dumper, + process_dumper=process_dumper, + deduplicate=deduplicate, + ) + collection_dumper.create_entity_counter() + # dumper_pretty_print(collection_dumper, include_private_and_dunder=False) + + if dump_processes and collection_dumper._should_dump_processes(): + echo.echo_report(f'Dumping processes not in any group for profile `{profile.name}`...') + collection_dumper.dump_processes() + if dump_data: + if not also_rich and not also_raw: + echo.echo_critical('`--dump-data was given, but neither --also-raw or --also-rich specified.') + 
echo.echo_report(f'Dumping data not in any group for profile {profile.name}...') + + collection_dumper.dump_data_rich() + # collection_dumper.dump_plugin_data() + + # === Dump data per-group if Groups exist in profile or are selected === + # TODO: Invert default behavior here, as I typically want to dump all entries + # TODO: Possibly define a new click option instead + # all_entries = not all_entries + if not groups: # and all_entries: + groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + + if groups is not None and not nodes: + for group in groups: + if organize_by_groups: + group_subdir = Path(*group.type_string.split('.')) + group_path = path / 'groups' / group_subdir / group.label + else: + group_path = path + + collection_dumper = CollectionDumper( + dump_parent_path=path, + output_path=group_path, + overwrite=overwrite, + incremental=incremental, + group=group, + **collection_kwargs, + **rich_kwargs, + process_dumper=process_dumper, + data_dumper=data_dumper, + ) + collection_dumper.create_entity_counter() + if dump_processes: + # The additional `_should_dump_processes` check here ensures that no reporting like + # "Dumping processes for group `SSSP/1.3/PBE/efficiency`" is printed for groups that + # don't contain processes + if collection_dumper._should_dump_processes(): + echo.echo_report(f'Dumping processes for group `{group.label}`...') + collection_dumper.dump_processes() + if dump_data: + echo.echo_report(f'Dumping data for group `{group.label}`...') + collection_dumper.dump_data_rich() + # collection_dumper.dump_plugin_data() \ No newline at end of file diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index c2ce719375..6f19a3c465 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -27,6 +27,8 @@ 'ALL', 'ALL_STATES', 'ALL_USERS', + 'ALSO_RAW', + 'ALSO_RICH', 'APPEND_TEXT', 'ARCHIVE_FORMAT', 'BROKER_HOST', @@ -44,6 +46,7 @@ 'COMPUTERS', 'CONFIG_FILE', 'DATA', + 'DATA_HIDDEN', 'DATUM', 'DB_BACKEND', 'DB_ENGINE', @@ -53,13 +56,18 @@ 'DB_PORT', 'DB_USERNAME', 'DEBUG', + 'DEDUPLICATE', 'DESCRIPTION', 'DICT_FORMAT', 'DICT_KEYS', 'DRY_RUN', + 'DUMP_CONFIG_FILE', + 'DUMP_DATA', + 'DUMP_PROCESSES', 'EXIT_STATUS', 'EXPORT_FORMAT', 'FAILED', + 'FLAT', 'FORCE', 'FORMULA_MODE', 'FREQUENCY', @@ -68,6 +76,10 @@ 'GROUP_CLEAR', 'HOSTNAME', 'IDENTIFIER', + 'INCLUDE_ATTRIBUTES', + 'INCLUDE_EXTRAS', + 'INCLUDE_INPUTS', + 'INCLUDE_OUTPUTS', 'INCREMENTAL', 'INPUT_FORMAT', 'INPUT_PLUGIN', @@ -78,8 +90,10 @@ 'NODES', 'NON_INTERACTIVE', 'OLDER_THAN', + 'ONLY_TOP_LEVEL_WORKFLOWS', 'ORDER_BY', 'ORDER_DIRECTION', + 'ORGANIZE_BY_GROUPS', 'OVERWRITE', 'PAST_DAYS', 'PATH', @@ -95,6 +109,8 @@ 'PROJECT', 'RAW', 'REPOSITORY_PATH', + 'RICH_DUMP_ALL', + 'RICH_SPEC', 'SCHEDULER', 'SILENT', 'SORT', @@ -783,6 +799,182 @@ def set_log_level(ctx, _param, value): show_default=True, ) +DEDUPLICATE = OverridableOption( + '--deduplicate/--no-deduplicate', + is_flag=True, + default=False, + show_default=True, + help='', +) + +DUMP_PROCESSES = OverridableOption( + '--dump-processes/--no-dump-processes', + is_flag=True, + default=True, + show_default=True, + help='Dump process data.', +) + +DUMP_DATA = OverridableOption( + '--dump-data/--no-dump-data', + is_flag=True, + default=True, + type=bool, + show_default=True, + help='Dump data nodes in a dedicated directory.', +) + +DATA_HIDDEN = OverridableOption( + '--data-hidden/--data-non-hidden', + is_flag=True, + default=True, + show_default=True, + help='Dump all 
`orm.Data` in the hidden directory and link to there.', +) + +ALSO_RAW = OverridableOption( + '--also-raw/--not-also-raw', + is_flag=True, + default=False, + show_default=True, + help='Dump the `attributes` of all nodes related to the Process.', +) + +ALSO_RICH = OverridableOption( + '--also-rich/--not-also-rich', + is_flag=True, + default=True, + show_default=True, + help='Dump also nicely prepared outputs, e.g. CIF for structures or PDF image for bands.', +) + +RICH_SPEC = OverridableOption( + '--rich-spec', + default=None, + type=str, + help='Specifications for rich data dumping.', +) + +DUMP_CONFIG_FILE = OverridableOption( + '--dump-config-file', + default=None, + type=types.FileOrUrl(), + help='Provide dumping options via a config file in YAML format.', +) + +RICH_DUMP_ALL = OverridableOption( + '--rich-dump-all/--no-rich-dump-all', + default=True, + is_flag=True, + type=bool, + show_default=True, + help='If a rich specification is provided, this triggers if all other Data nodes should also be dumped or not.', +) + +ORGANIZE_BY_GROUPS = OverridableOption( + '--organize-by-groups/--no-organize-by-groups', + default=True, + is_flag=True, + type=bool, + show_default=True, + help='If the collection of nodes to be dumped is organized in groups, reproduce its hierarchy.', +) + +INCLUDE_INPUTS = OverridableOption( + '--include-inputs/--exclude-inputs', + default=True, + show_default=True, + help='Include the linked input nodes of the `CalculationNode`(s).', +) + +INCLUDE_OUTPUTS = OverridableOption( + '--include-outputs/--exclude-outputs', + default=False, + show_default=True, + help='Include the linked output nodes of the `CalculationNode`(s).', +) + +INCLUDE_ATTRIBUTES = OverridableOption( + '--include-attributes/--exclude-attributes', + default=True, + show_default=True, + help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +INCLUDE_EXTRAS = OverridableOption( + '--include-extras/--exclude-extras', + default=True, + show_default=True, + help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +FLAT = OverridableOption( + '-f', + '--flat', + is_flag=True, + default=False, + help='Dump files in a flat directory for every step of a workflow.', +) + +ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( + '--only-top-level-workflows/--not-only-top-level-workflows', + is_flag=True, + default=True, + type=bool, + show_default=True, + help='Dump only the top-level workflows in their own dedicated directories.', +) + +DUMP_PROCESSES = OverridableOption( + '--dump-processes/--no-dump-processes', + is_flag=True, + default=True, + show_default=True, + help='Dump process data.', +) + +DUMP_DATA = OverridableOption( + '--dump-data/--no-dump-data', + is_flag=True, + default=True, + type=bool, + show_default=True, + help='Dump data nodes in a dedicated directory.', +) + +CALCULATIONS_HIDDEN = OverridableOption( + '--calculations-hidden/--calculations-non-hidden', + is_flag=True, + default=True, + type=bool, + show_default=True, + help='Dump all `orm.CalculationNode`s in the hidden directory and link to there.', +) + +DATA_HIDDEN = OverridableOption( + '--data-hidden/--data-non-hidden', + is_flag=True, + default=True, + show_default=True, + help='Dump all `orm.Data` in the hidden directory and link to there.', +) + +ALSO_RAW = OverridableOption( + '--also-raw/--not-also-raw', + is_flag=True, + default=False, + show_default=True, + help='Dump the `attributes` of all nodes related to the Process.', +) + +ALSO_RICH = 
OverridableOption( + '--also-rich/--not-also-rich', + is_flag=True, + default=True, + show_default=True, + help='Dump also nicely prepared outputs, e.g. CIF for structures or PDF image for bands.', +) + INCREMENTAL = OverridableOption( '--incremental/--no-incremental', is_flag=True, @@ -790,3 +982,88 @@ def set_log_level(ctx, _param, value): show_default=True, help="Incremental dumping of data to disk. Doesn't require using overwrite to clean previous directories.", ) + +RICH_OPTIONS = OverridableOption( + '--rich-options', + default=None, + type=str, + help='Specifications for rich data dumping.', +) + +DUMP_CONFIG_FILE = OverridableOption( + '--dump-config-file', + default=None, + type=types.FileOrUrl(), + help='Provide dumping options via a config file in YAML format.', +) + +RICH_DUMP_ALL = OverridableOption( + '--rich-dump-all/--no-rich-dump-all', + default=True, + is_flag=True, + type=bool, + show_default=True, + help='If a rich specification is provided, this triggers if all other Data nodes should also be dumped or not.', +) + +ORGANIZE_BY_GROUPS = OverridableOption( + '--organize-by-groups/--no-organize-by-groups', + default=True, + is_flag=True, + type=bool, + show_default=True, + help='If the collection of nodes to be dumped is organized in groups, reproduce its hierarchy.', +) + +INCLUDE_INPUTS = OverridableOption( + '--include-inputs/--exclude-inputs', + default=True, + show_default=True, + help='Include the linked input nodes of the `CalculationNode`(s).', +) + +INCLUDE_OUTPUTS = OverridableOption( + '--include-outputs/--exclude-outputs', + default=False, + show_default=True, + help='Include the linked output nodes of the `CalculationNode`(s).', +) + +INCLUDE_ATTRIBUTES = OverridableOption( + '--include-attributes/--exclude-attributes', + default=True, + show_default=True, + help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +INCLUDE_EXTRAS = OverridableOption( + '--include-extras/--exclude-extras', + default=True, + show_default=True, + help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +FLAT = OverridableOption( + '-f', + '--flat', + is_flag=True, + default=False, + help='Dump files in a flat directory for every step of a workflow.', +) + +ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( + '--only-top-level-workflows/--not-only-top-level-workflows', + is_flag=True, + default=True, + type=bool, + show_default=True, + help='Dump only the top-level workflows in their own dedicated directories.', +) + +INCREMENTAL = OverridableOption( + '--incremental/--non-incremental', + is_flag=True, + default=True, + show_default=True, + help='Dump files incrementally when dumping collections of data to disk.', +) diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index a746fa171e..49713c9b8a 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -8,4 +8,8 @@ ########################################################################### """Modules related to the dumping of AiiDA data.""" -__all__ = ('processes',) +from .collection import CollectionDumper +from .data import DataDumper +from .processes import ProcessDumper + +__all__ = ('CollectionDumper', 'DataDumper', 'ProcessDumper') diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py new file mode 100644 index 0000000000..169f5b3862 --- /dev/null +++ b/src/aiida/tools/dumping/collection.py @@ -0,0 +1,338 @@ 
+########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Functionality for dumping of a Collection of AiiDA ORM entities.""" + +from __future__ import annotations + +import contextlib +import itertools as it +import logging +import os +from collections import Counter +from pathlib import Path + +from aiida import orm +from aiida.tools.dumping.data import DataDumper +from aiida.tools.dumping.processes import ProcessDumper +from aiida.tools.dumping.utils import sanitize_file_extension + +logger = logging.getLogger(__name__) + +DEFAULT_PROCESSES_TO_DUMP = [orm.CalculationNode, orm.WorkflowNode] +DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] +# DEFAULT_COLLECTIONS_TO_DUMP ?? +DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP + DEFAULT_DATA_TO_DUMP + + +# ! This class is instantiated once for every group, or once for the full profile +class CollectionDumper: + def __init__( + self, + *args, + dump_parent_path: Path = Path().cwd(), + output_path: Path = Path().cwd(), + overwrite: bool = False, + incremental: bool = True, + should_dump_processes: bool = False, + should_dump_data: bool = False, + only_top_level_workflows: bool = True, + group: orm.Group | None = None, + nodes: set = {}, + process_dumper: ProcessDumper | None = None, + data_dumper: DataDumper | None = None, + **kwargs, + ): + self.args = args + self.dump_parent_path = dump_parent_path + self.output_path = output_path + self.overwrite = overwrite + self.incremental = incremental + self.should_dump_processes = should_dump_processes + self.should_dump_data = should_dump_data + self.only_top_level_workflows = only_top_level_workflows + self.nodes = nodes + self.process_dumper = process_dumper + self.data_dumper = data_dumper + self.kwargs = kwargs + + self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' + + # Allow passing of group via label + if isinstance(group, str): + group = orm.Group.get(self.group) + self.group = group + + self.output_path = output_path + + if not hasattr(self, 'entity_counter'): + self.create_entity_counter() + + def create_entity_counter(self) -> Counter: + entity_counter = Counter() + if self.group is not None: + # If the group only has one WorkChain assigned to it, this will only return a count of 1 for the + # WorkChainNode, nothing more, that is, it doesn't work recursively. 
+ nodes = self.group.nodes + elif self.nodes is not None: + nodes = self.nodes + else: + nodes = orm.QueryBuilder().append(orm.Node).all(flat=True) + + # Iterate over all the entities in the group + for node in nodes: + # Count the type string of each entity + entity_counter[node.__class__] += 1 + + # Convert the Counter to a dictionary (optional) + self.entity_counter = entity_counter + + return entity_counter + + def get_collection_nodes(self): + if self.nodes: + self.collection_nodes = self.nodes + + # if hasattr(self, 'collection_nodes'): + # return self.collection_nodes + + # Get all nodes that are in the group + if self.group is not None: + nodes = list(self.group.nodes) + + # Get all nodes that are _not_ in any group + else: + groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + nodes_in_groups = [node.pk for group in groups for node in group.nodes] + # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called + # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice + sub_nodes_in_groups = list( + it.chain( + *[ + orm.load_node(node).called_descendants + for node in nodes_in_groups + if isinstance(orm.load_node(node), orm.WorkflowNode) + ] + ) + ) + sub_nodes_in_groups = [node.pk for node in sub_nodes_in_groups] + nodes_in_groups = nodes_in_groups + sub_nodes_in_groups + + profile_nodes = orm.QueryBuilder().append(orm.Node, project=['pk']).all(flat=True) + nodes = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] + nodes = [orm.load_node(node) for node in nodes] + + self.collection_nodes = nodes + + return nodes + + def _should_dump_processes(self) -> bool: + if not self.nodes: + return ( + sum( + self.entity_counter.get(orm_process_class, 0) + for orm_process_class in [ + orm.CalcJobNode, + orm.CalcFunctionNode, + orm.WorkChainNode, + orm.WorkFunctionNode, + orm.ProcessNode, + ] + ) + > 0 + ) + else: + return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 + + def _dump_calculations_hidden(self, calculations): + # ? 
Dump only top-level workchains, as that includes sub-workchains already + + for calculation in calculations: + calculation_dumper = self.process_dumper + + calculation_dump_path = self.hidden_aiida_path / 'calculations' / calculation.uuid + + # if not self.dry_run: + # with contextlib.suppress(FileExistsError): + try: + calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) + except: + raise + + # # To make development quicker + # if iworkflow_ > 1: + # break + + def _dump_link_workflows(self, workflows, link_calculations: bool = True): + # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) + for workflow in workflows: + workflow_dumper = self.process_dumper + + link_calculations_dir = self.hidden_aiida_path / 'calculations' + # TODO: If the GroupDumper is called from somewhere else outside, prefix the path with `groups/core` etc + workflow_dump_path = ( + self.output_path + / 'workflows' + / workflow_dumper._generate_default_dump_path(process_node=workflow, prefix=None) + ) + # logger.report(f'WORKFLOW_DUMP_PATH: {workflow_dump_path}') + + workflow_dumper._dump_workflow( + workflow_node=workflow, + output_path=workflow_dump_path, + link_calculations=link_calculations, + link_calculations_dir=link_calculations_dir, + ) + + def _link_calculations_hidden(self, calculations): + # calculation_nodes = get_nodes_from_db(aiida_node_type=orm.CalculationNode, with_group=self.group, flat=True) + for calculation_node in calculations: + calculation_dumper = self.process_dumper + + link_calculations_dir = self.hidden_aiida_path / 'calculations' + + calculation_dump_path = self.output_path / 'calculations' + calculation_dump_path.mkdir(parents=True, exist_ok=True) + calculation_dump_path = calculation_dump_path / calculation_dumper._generate_default_dump_path( + process_node=calculation_node + ) + + with contextlib.suppress(FileExistsError): + os.symlink(link_calculations_dir / calculation_node.uuid, calculation_dump_path) + + def dump_processes(self): + # ? 
Here, these could be all kinds of entities that could be grouped in AiiDA + # if len(self.entities_to_dump) > 0: + # pass + # # nodes = self.entities_to_dump + # else: + nodes = self.get_collection_nodes() + workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] + + if self.only_top_level_workflows: + workflows = [workflow for workflow in workflows if workflow.caller is None] + + # Also need to obtain sub-calculations that were called by workflows of the group + # These are not contained in the group.nodes directly + called_calculations = [] + for workflow in workflows: + called_calculations += [ + node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) + ] + + calculations = set([node for node in nodes if isinstance(node, orm.CalculationNode)] + called_calculations) + + if len(workflows) + len(calculations) == 0: + return + + self.output_path.mkdir(exist_ok=True, parents=True) + + print(f'self.process_dumper.calculations_hidden: {self.process_dumper.calculations_hidden}') + print(f'self.output_path: {self.output_path}') + if self.process_dumper.calculations_hidden: + print('dump hidden') + self._dump_calculations_hidden(calculations=calculations) + self._dump_link_workflows(workflows=workflows) + self._link_calculations_hidden(calculations=calculations) + else: + print('dump non-hidden') + for workflow in workflows: + workflow_path = ( + self.output_path + / 'workflows' + / self.process_dumper._generate_default_dump_path(process_node=workflow) + ) + self.process_dumper.dump(process_node=workflow, output_path=workflow_path) + + # TODO: Add `dump_data_raw` here, as well + def dump_data_rich(self): + nodes = self.get_collection_nodes() + nodes = [node for node in nodes if isinstance(node, (orm.Data, orm.Computer))] + # Here, when providing logic to set the exporters and fileformat via the rich-options, don't have to filter + # anymore for `core` + nodes = [node for node in nodes if node.entry_point.name.startswith('core')] + if len(nodes) == 0: + return + + self.output_path.mkdir(exist_ok=True, parents=True) + data_dumper = self.data_dumper + + for data_node in nodes: + node_entry_point_name = data_node.entry_point.name + + # Get the fileformat and exporter for the data node + try: + fileformat = data_dumper.rich_spec_dict[node_entry_point_name]['export_format'] + exporter = data_dumper.rich_spec_dict[node_entry_point_name]['exporter'] + + # If options for the rich dumping are specified and not all the other defaults are being used + # Some entry_points might not be inside the `rich_spec_dict` + except KeyError: + continue + + except: + # Raise all exceptions here during development + raise + + # Don't go further if no importer implemented for a data type anyway + if exporter is None: + continue + + try: + # Generate a nice filename and sanitize it + nice_output_path = self.output_path / 'data' / data_node.__class__.__name__.lower() + nice_fname = data_dumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat).replace( + '__', '_' + ) + nice_fname = sanitize_file_extension(nice_fname) + + if data_dumper.data_hidden: + # Define paths for hidden dump and linking + hidden_output_path = self.hidden_aiida_path / 'data' / data_node.__class__.__name__.lower() + uuid_fname = sanitize_file_extension(f'{data_node.uuid}.{fileformat}') + + # Dump the data in the hidden directory + data_dumper.dump_core_data_node_rich(data_node, hidden_output_path, uuid_fname) + + # Link the hidden file to the expected output path + (nice_output_path / 
nice_fname).parent.mkdir(exist_ok=True, parents=True) + os.symlink(hidden_output_path / uuid_fname, nice_output_path / nice_fname) + + else: + # Dump the data in the non-hidden directory + data_dumper.dump_core_data_node_rich(data_node, nice_output_path, nice_fname) + + except TypeError: + # Handle case when no exporter is implemented for a given data_node type + raise + except OSError: + # A Data node, e.g. a Code might already be existent, so don't worry about this exception + continue + except Exception: + raise + + def dump_plugin_data(self): + return + # from importlib.metadata import entry_points + + # plugin_data_entry_points = [entry_point.name for entry_point in entry_points(group='aiida.data')] + # # print(plugin_data_entry_points) + # # print(self.entity_counter) + # from aiida.manage.manager import get_manager + + # manager = get_manager() + # storage = manager.get_profile_storage() + # orm_entities = storage.get_orm_entities(detailed=True)['Nodes']['node_types'] + # non_core_data_entities = [ + # orm_entity + # for orm_entity in orm_entities + # if orm_entity.startswith('data') and not orm_entity.startswith('data.core') + # ] + # # TODO: Implement dumping here. Stashed for now, as both `HubbardStructureData` and `UpfData` I wanted to use + # # TODO: for testing don't implement `export` either way + # # print(non_core_data_entities) diff --git a/src/aiida/tools/dumping/data.py b/src/aiida/tools/dumping/data.py new file mode 100644 index 0000000000..3a75d8d743 --- /dev/null +++ b/src/aiida/tools/dumping/data.py @@ -0,0 +1,292 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Functionality for dumping of Data nodes.""" + +from __future__ import annotations + +import logging +from functools import singledispatchmethod +from pathlib import Path + +import yaml + +from aiida import orm + +logger = logging.getLogger(__name__) + + +class DataDumper: + def __init__( + self, + *args, + dump_parent_path: Path = Path.cwd(), + overwrite: bool = False, + incremental: bool = True, + data_hidden: bool = False, + also_raw: bool = False, + also_rich: bool = False, + rich_spec_dict: dict | None = None, + **kwargs, + ) -> None: + self.args = args + self.dump_parent_path = dump_parent_path + self.overwrite = overwrite + self.incremental = incremental + self.data_hidden = data_hidden + self.also_raw = also_raw + self.also_rich = also_rich + self.kwargs = kwargs + + self.rich_spec_dict = rich_spec_dict + + self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' + + @singledispatchmethod + def dump_core_data_node_rich(self, data_node, output_path, output_fname): + # raise NotImplementedError(f'Dumping not implemented for type {type(data_node)}') + # print(f'No specific handler found for type <{type(data_node)}> <{data_node}>, doing nothing.') + # output_path /= 'general' + # This is effectively the `rich` dumping + data_node_entry_point_name = data_node.entry_point.name + export_settings = self.rich_spec_dict[data_node_entry_point_name] + exporter = export_settings['exporter'] + fileformat = export_settings['export_format'] + if exporter is not None: + 
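+            # An exporter is registered for this entry point: ensure the target directory exists,
+            # then delegate the export with the file format configured in `rich_spec_dict`.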
output_path.mkdir(exist_ok=True, parents=True) + exporter( + node=data_node, + output_fname=output_path / output_fname, + fileformat=fileformat, + overwrite=self.overwrite, + ) + # This is for orm.Data types for which no default dumping is implemented, e.g. Bool or Float + # except ValueError: + # pass + # This is for orm.Data types for whose entry_point names no entry exists in the DEFAULT_CORE_EXPORT_MAPPING + # This is now captured outside in the `CollectionDumper`, so should not be relevant anymore + # except TypeError: + # raise + + @dump_core_data_node_rich.register + def _( + self, + data_node: orm.StructureData, + output_path: str | Path | None = None, + output_fname: str | None = None, + ): + if type(data_node) is orm.StructureData: + self._dump_structuredata(data_node, output_path=output_path, output_fname=output_fname) + else: + # Handle the case where data_node is a subclass of orm.StructureData + # Just use the default dispatch function implementation + self.dump_core_data_node_rich.dispatch(object)(self, data_node, output_path, output_fname) + + @dump_core_data_node_rich.register + def _( + self, + data_node: orm.Code, + output_path: str | Path | None = None, + output_fname: str | None = None, + ): + self._dump_code(data_node=data_node, output_path=output_path, output_fname=output_fname) + + @dump_core_data_node_rich.register + def _( + self, + data_node: orm.Computer, + output_path: str | Path | None = None, + output_fname: str | None = None, + ): + self._dump_computer_setup(data_node=data_node, output_path=output_path, output_fname=output_fname) + self._dump_computer_config(data_node=data_node, output_path=output_path, output_fname=output_fname) + + @dump_core_data_node_rich.register + def _( + self, + data_node: orm.BandsData, + output_path: str | Path | None = None, + output_fname: str | None = None, + ): + self._dump_bandsdata(data_node=data_node, output_path=output_path, output_fname=output_fname) + + # These are the rich dumping implementations that actually differ from the default dispatch + def _dump_structuredata( + self, + data_node: orm.StructureData, + output_path: Path | None = None, + output_fname: str | None = None, + ): + from aiida.common.exceptions import UnsupportedSpeciesError + + node_entry_point_name = data_node.entry_point.name + exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] + fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] + + if output_fname is None: + output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) + + # ? There also exists a CifData file type + # output_path /= 'structures' + output_path.mkdir(exist_ok=True, parents=True) + try: + exporter( + node=data_node, + output_fname=output_path / output_fname, + fileformat=fileformat, + overwrite=self.overwrite, + ) + except UnsupportedSpeciesError: + # This is the case for, e.g. HubbardStructureData that has species like `Mn0` + # Not sure how to resolve this. Wouldn't add a singledispatch for data types defined in plugins. Currently, + # do strict type check. HubbardStructureData doesn't implement an export method itself, though. 
+ pass + + def _dump_code( + self, + data_node: orm.Code, + output_path: Path | None = None, + output_fname: str | None = None, + ): + # output_path /= 'codes' + + node_entry_point_name = data_node.entry_point.name + exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] + fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] + + if fileformat != 'yaml': + raise NotImplementedError('No other fileformats supported so far apart from YAML.') + output_path.mkdir(exist_ok=True, parents=True) + if output_fname is None: + output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) + + exporter( + node=data_node, + output_fname=output_path / output_fname, + fileformat=fileformat, + overwrite=self.overwrite, + ) + + def _dump_computer_setup( + self, + data_node: orm.Computer, + output_path: Path | None = None, + output_fname: str | None = None, + ): + node_entry_point_name = data_node.entry_point.name + # TODO: Don't use the `exporter` here, as `Computer` doesn't derive from Data, so custom implementation + fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] + + if fileformat != 'yaml': + raise NotImplementedError('No other fileformats supported so far apart from YAML.') + + output_path.mkdir(exist_ok=True, parents=True) + + # This is a bit of a hack. Should split this up into two different functions. + if output_fname is None: + output_fname = output_path / f'{data_node.full_label}-setup-{data_node.pk}.{fileformat}' + + # ? Copied over from `cmd_computer` as importing `computer_export_setup` led to click Context error: + # TypeError: Context.__init__() got an unexpected keyword argument 'computer' + computer_setup = { + 'label': data_node.label, + 'hostname': data_node.hostname, + 'description': data_node.description, + 'transport': data_node.transport_type, + 'scheduler': data_node.scheduler_type, + 'shebang': data_node.get_shebang(), + 'work_dir': data_node.get_workdir(), + 'mpirun_command': ' '.join(data_node.get_mpirun_command()), + 'mpiprocs_per_machine': data_node.get_default_mpiprocs_per_machine(), + 'default_memory_per_machine': data_node.get_default_memory_per_machine(), + 'use_double_quotes': data_node.get_use_double_quotes(), + 'prepend_text': data_node.get_prepend_text(), + 'append_text': data_node.get_append_text(), + } + + if not output_fname.is_file(): + output_fname.write_text(yaml.dump(computer_setup, sort_keys=False), 'utf-8') + + def _dump_computer_config( + self, + data_node: orm.Computer, + output_path: Path | None = None, + output_fname: str | None = None, + ): + from aiida.orm import User + + node_entry_point_name = data_node.entry_point.name + # TODO: Don't use the `exporter` here, as `Computer` doesn't derive from Data, so custom implementation + fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] + + # output_path /= 'computers' + if fileformat != 'yaml': + raise NotImplementedError('No other fileformats supported so far apart from YAML.') + + output_path.mkdir(exist_ok=True, parents=True) + + # This is a bit of a hack. Should split this up into two different functions. 
+ if output_fname is None: + output_fname = output_path / f'{data_node.full_label}-config-{data_node.pk}.{fileformat}' + + users = User.collection.all() + for user in users: + computer_configuration = data_node.get_configuration(user) + if not output_fname.is_file(): + output_fname.write_text(yaml.dump(computer_configuration, sort_keys=False), 'utf-8') + + def _dump_bandsdata( + self, + data_node: orm.BandsData, + output_path: Path | None = None, + output_fname: str | None = None, + ): + node_entry_point_name = data_node.entry_point.name + exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] + fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] + + from aiida.tools.dumping.utils import sanitize_file_extension + + output_path.mkdir(exist_ok=True, parents=True) + + if output_fname is None: + output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) + + output_fname = sanitize_file_extension(output_fname) + + exporter( + node=data_node, + output_fname=output_path / output_fname, + fileformat=fileformat, + overwrite=self.overwrite, + ) + + def _dump_user_info(self): ... + + def dump_core_data_node_raw(self, data_node: orm.Data, output_path: Path, output_fname: str | None = None): + output_path.mkdir(exist_ok=True, parents=True) + + if output_fname is None: + output_fname = DataDumper.generate_output_fname_raw(data_node=data_node) + + with open(output_path.resolve() / output_fname, 'w') as handle: + yaml.dump(data_node.attributes, handle) + + @staticmethod + def generate_output_fname_raw(data_node, prefix: str | None = None): + if prefix is None: + return f'{data_node.__class__.__name__}-{data_node.pk}_attrs.yaml' + else: + return f'{prefix}-{data_node.__class__.__name__}-{data_node.pk}_attrs.yaml' + + @staticmethod + def generate_output_fname_rich(data_node, fileformat, prefix: str | None = None): + if prefix is None: + return f'{data_node.__class__.__name__}-{data_node.pk}.{fileformat}' + else: + return f'{prefix}-{data_node.__class__.__name__}-{data_node.pk}.{fileformat}' diff --git a/src/aiida/tools/dumping/parser.py b/src/aiida/tools/dumping/parser.py new file mode 100644 index 0000000000..cc19b0f141 --- /dev/null +++ b/src/aiida/tools/dumping/parser.py @@ -0,0 +1,56 @@ +from pathlib import Path + +import yaml + + +class DumpConfigParser: + @staticmethod + def parse_config_file(config_file: str | Path | None) -> dict: + if isinstance(config_file, (str, Path)): + with open(config_file, 'r') as file: + config = yaml.safe_load(file) + else: + config = yaml.safe_load(config_file) + + general_kwargs = { + 'path': Path(config.get('path', Path.cwd())), + 'overwrite': config.get('overwrite', False), + 'incremental': config.get('incremental', True), + 'dry_run': config.get('dry_run', False), + } + + processdumper_kwargs = { + 'include_inputs': config.get('include_inputs', True), + 'include_outputs': config.get('include_outputs', True), + 'include_attributes': config.get('include_attributes', True), + 'include_extras': config.get('include_extras', False), + 'flat': config.get('flat', False), + 'calculations_hidden': config.get('calculations_hidden', True), + } + + datadumper_kwargs = { + 'also_raw': config.get('also_raw', False), + 'also_rich': config.get('also_rich', True), + 'data_hidden': config.get('data_hidden', True), + } + + collection_kwargs = { + 'should_dump_processes': config.get('dump_processes', True), + 'should_dump_data': config.get('dump_data', True), + 'only_top_level_workflows': 
config.get('only_top_level_workflows', True), + } + + rich_kwargs = { + 'rich_dump_all': config.get('rich_dump_all', True), + } + + rich_spec = config.get('rich_spec', None) + + return { + 'general_kwargs': general_kwargs, + 'processdumper_kwargs': processdumper_kwargs, + 'datadumper_kwargs': datadumper_kwargs, + 'collection_kwargs': collection_kwargs, + 'rich_kwargs': rich_kwargs, + 'rich_spec': rich_spec, + } diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 794b1fcab2..29fbef07c9 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -8,9 +8,23 @@ ########################################################################### """Functionality for dumping of ProcessNodes.""" +# ? Possibly add dry_run option here +# TODO: Add symlinking feature +# -> This would be for calculations which are subprocesses of the workflow +# -> But also PPs +# -> Could define a symlink-mapping based on a dict in the form: +# { +# CalculationNode: , +# PPs: +# } +# Based on this, I could check the linked directory for the entity based on its UUID +# TODO: Or, could add a `programmatic` option that doesn't create the README.md, and does a few other things, as well + from __future__ import annotations +import contextlib import logging +import os from pathlib import Path from types import SimpleNamespace from typing import List @@ -23,7 +37,6 @@ CalcFunctionNode, CalcJobNode, CalculationNode, - LinkManager, ProcessNode, WorkChainNode, WorkflowNode, @@ -31,6 +44,7 @@ ) from aiida.orm.utils import LinkTriple from aiida.tools.archive.exceptions import ExportValidationError +from aiida.tools.dumping.data import DataDumper from aiida.tools.dumping.utils import prepare_dump_path LOGGER = logging.getLogger(__name__) @@ -39,26 +53,43 @@ class ProcessDumper: def __init__( self, + *args, + dump_parent_path: Path = Path.cwd(), + overwrite: bool = False, + incremental: bool = True, + flat: bool = False, + calculations_hidden: bool = True, include_inputs: bool = True, include_outputs: bool = False, include_attributes: bool = True, include_extras: bool = True, - overwrite: bool = False, - flat: bool = False, + rich_options: str = '', + rich_config_file: Path | None = None, + rich_dump_all: bool = True, + data_dumper: DataDumper | None = DataDumper(), dump_unsealed: bool = False, - incremental: bool = True, + **kwargs, ) -> None: + self.args = args + self.dump_parent_path = dump_parent_path + self.overwrite = overwrite + self.incremental = incremental + self.flat = flat self.include_inputs = include_inputs self.include_outputs = include_outputs self.include_attributes = include_attributes self.include_extras = include_extras - self.overwrite = overwrite - self.flat = flat + self.rich_options = rich_options + self.rich_config_file = rich_config_file + self.rich_dump_all = rich_dump_all + self.data_dumper = data_dumper + self.kwargs = kwargs self.dump_unsealed = dump_unsealed - self.incremental = incremental + + self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' @staticmethod - def _generate_default_dump_path(process_node: ProcessNode) -> Path: + def _generate_default_dump_path(process_node: ProcessNode, prefix: str = 'dump') -> Path: """Simple helper function to generate the default parent-dumping directory if none given. 
This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default @@ -69,11 +100,12 @@ def _generate_default_dump_path(process_node: ProcessNode) -> Path: """ pk = process_node.pk + # TODO: Use UUID[:8] here try: - return Path(f'dump-{process_node.process_label}-{pk}') + return Path(f'{prefix}-{process_node.process_label}-{pk}') except AttributeError: # This case came up during testing, not sure how relevant it actually is - return Path(f'dump-{process_node.process_type}-{pk}') + return Path(f'{prefix}-{process_node.process_type}-{pk}') @staticmethod def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: @@ -165,15 +197,15 @@ def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str: node_label = '-'.join(label_list) # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove node_label = node_label.replace('CALL-', '') - node_label = node_label.replace('None-', '') - - return node_label + return node_label.replace('None-', '') def dump( self, process_node: ProcessNode, output_path: Path | None, io_dump_paths: List[str | Path] | None = None, + *args, + **kwargs, ) -> Path: """Dumps all data involved in a `ProcessNode`, including its outgoing links. @@ -192,6 +224,13 @@ def dump( f'Process `{process_node.pk} must be sealed before it can be dumped, or `dump_unsealed` set to True.' ) + # This here is mainly for `include_attributes` and `include_extras`. + # I don't want to include them in the general class `__init__`, as they don't really fit there. + # But the `_dump_node_yaml` function is private, so it's never called outside by the user. + # Setting the class attributes here dynamically is probably not a good solution, but it works for now. + for key, value in kwargs.items(): + setattr(self, key, value) + if output_path is None: output_path = self._generate_default_dump_path(process_node=process_node) @@ -216,7 +255,12 @@ def dump( return output_path def _dump_workflow( - self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: List[str | Path] | None = None + self, + workflow_node: WorkflowNode, + output_path: Path, + io_dump_paths: List[str | Path] | None = None, + link_calculations: bool = False, + link_calculations_dir: str | None = None, ) -> None: """Recursive function to traverse a `WorkflowNode` and dump its `CalculationNode` s. @@ -242,15 +286,25 @@ def _dump_workflow( workflow_node=child_node, output_path=child_output_path, io_dump_paths=io_dump_paths, + # TODO: Always need to pass this stuff through due to the recursive nature of the function call... 
+ # TODO: Maybe one can make a separate method that only does the linking + link_calculations=link_calculations, + link_calculations_dir=link_calculations_dir, ) # Once a `CalculationNode` as child reached, dump it elif isinstance(child_node, CalculationNode): - self._dump_calculation( - calculation_node=child_node, - output_path=child_output_path, - io_dump_paths=io_dump_paths, - ) + if not link_calculations: + self._dump_calculation( + calculation_node=child_node, + output_path=child_output_path, + io_dump_paths=io_dump_paths, + ) + else: + try: + os.symlink(link_calculations_dir / child_node.uuid, child_output_path) + except FileExistsError: + pass def _dump_calculation( self, @@ -275,30 +329,75 @@ def _dump_calculation( calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_mapping.repository) # Dump the repository contents of `outputs.retrieved` - try: + with contextlib.suppress(NotExistentAttributeError): calculation_node.outputs.retrieved.base.repository.copy_tree( output_path.resolve() / io_dump_mapping.retrieved ) - except NotExistentAttributeError: - pass + + if self.data_dumper.also_raw: + # TODO: Replace with attached self.data_dumper attribute + self.data_dumper.dump_core_data_node_raw(data_node=calculation_node, output_path=output_path) # Dump the node_inputs if self.include_inputs: input_links = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) - self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) + # Need to create the path before, otherwise getting Exception + input_path = output_path / io_dump_mapping.inputs + input_path.mkdir(parents=True, exist_ok=True) + + self._dump_calculation_io_files(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) + + if self.data_dumper.also_raw: + # Always dump the `raw` data inside the calculation directories + # I don't see a reason why one would want all the node attribute files in a centralized location + self._dump_calculation_io_files_raw( + output_path=output_path / io_dump_mapping.inputs, link_triples=input_links + ) + + if self.data_dumper.also_rich: + rich_data_output_path = output_path / io_dump_mapping.inputs + # if not self.data_dumper.data_hidden: + # rich_data_output_path = output_path / io_dump_mapping.inputs + # else: + # # TODO: Currently, when dumping only one selected workflow, if rich dumping is activated, but + # # TODO: `data-hidden` is set, no data nodes were actually being dumped + # # TODO: With the current implementation below, they are dumped, but not in the same structure as for the + # # TODO: `dump_rich_core` function. Quick fix for now + # pass + + # Only dump the rich data output files in the process directories if data_hidden is False + self._dump_calculation_io_files_rich( + output_path=rich_data_output_path, link_triples=input_links + ) # Dump the node_outputs apart from `retrieved` if self.include_outputs: output_links = list(calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)) output_links = [output_link for output_link in output_links if output_link.link_label != 'retrieved'] - self._dump_calculation_io( + self._dump_calculation_io_files( parent_path=output_path / io_dump_mapping.outputs, link_triples=output_links, ) - def _dump_calculation_io(self, parent_path: Path, link_triples: LinkManager | List[LinkTriple]): - """Small helper function to dump linked input/output nodes of a `CalculationNode`. 
+ if self.data_dumper.also_raw: + self._dump_calculation_io_files_raw( + output_path=output_path / io_dump_mapping.outputs, + link_triples=output_links, + ) + + if self.data_dumper.also_rich: + self._dump_calculation_io_files_rich( + output_path=output_path / io_dump_mapping.outputs, + link_triples=output_links, + ) + + def _dump_calculation_io_files( + self, + parent_path: Path, + link_triples: orm.LinkManager | List[orm.LinkTriple], + ): + """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. :param parent_path: Parent directory for dumping the linked node contents. :param link_triples: List of link triples. @@ -315,6 +414,92 @@ def _dump_calculation_io(self, parent_path: Path, link_triples: LinkManager | Li link_triple.node.base.repository.copy_tree(linked_node_path.resolve()) + def _dump_calculation_io_files_raw( + self, + output_path: Path, + link_triples: orm.LinkManager | List[orm.LinkTriple], + ): + """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. + + :param parent_path: Parent directory for dumping the linked node contents. + :param link_triples: List of link triples. + """ + + output_path /= 'raw' + + for link_triple in link_triples: + link_label = link_triple.link_label + data_node = link_triple.node + + # linked_node_path.parent.mkdir(parents=True, exist_ok=True) + output_path.mkdir(parents=True, exist_ok=True) + + # Then dump the node attributes for each node + output_fname = DataDumper.generate_output_fname_raw(prefix=link_label, data_node=data_node) + output_fname = output_fname.replace('__', '_') + + if self.data_dumper.data_hidden: + self.data_dumper.dump_core_data_node_raw( + data_node=data_node, output_path=output_path, output_fname=output_fname + ) + self.data_dumper.dump_core_data_node_raw( + data_node=data_node, output_path=output_path, output_fname=output_fname + ) + + def _dump_calculation_io_files_rich( + self, + output_path: Path, + link_triples: orm.LinkManager | List[orm.LinkTriple], + ): + """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. + + :param parent_path: Parent directory for dumping the linked node contents. + :param link_triples: List of link triples. 
+ """ + + # Set up the rich parsing functions + + # Extend (at least the keys) by the dynamic entry points + rich_spec_dict = self.data_dumper.rich_spec_dict + + for link_triple in link_triples: + link_label = link_triple.link_label + data_node = link_triple.node + + node = link_triple.node + node_entry_point = node.entry_point + node_entry_point_name = node_entry_point.name + + # TODO: Somehow obtain sensible filenames -> Should this be done here, or by the export function that is + # TODO: possibly written by the plugin developer + if node_entry_point_name.startswith('core'): + # Obtain settings from the export dict + # TODO: -> This might break when plugin is missing + try: + exporter = rich_spec_dict[node_entry_point_name]['exporter'] + fileformat = rich_spec_dict[node_entry_point_name]['export_format'] + output_fname = self.data_dumper.generate_output_fname_rich( + prefix=link_label, data_node=data_node, fileformat=fileformat + ) + output_fname = output_fname.replace('__', '_') + except KeyError: + continue + + # No exporter set + if exporter is None: + continue + + # Only create subdirectory if `Data` node has an exporter + rich_output_path = output_path / 'rich' / node.__class__.__name__.lower() + rich_output_path.mkdir(parents=True, exist_ok=True) + + # TODO: Here, if data_hidden is True, dump in hidden directory, else in output_path + self.data_dumper.dump_core_data_node_rich( + node, + output_path=rich_output_path, + output_fname=output_fname, + ) + def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | None = None) -> SimpleNamespace: """Helper function to generate mapping for entities dumped for each `CalculationNode`. @@ -328,12 +513,12 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs'] default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] - empty_calculation_io_dump_paths = [''] * 4 - if self.flat and io_dump_paths is None: LOGGER.info( 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' ) + empty_calculation_io_dump_paths = [''] * 4 + return SimpleNamespace(**dict(zip(aiida_entities_to_dump, empty_calculation_io_dump_paths))) elif not self.flat and io_dump_paths is None: @@ -343,7 +528,7 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non ) return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths))) - elif self.flat and io_dump_paths is not None: + elif self.flat: LOGGER.info('Flat set to True but `io_dump_paths` provided. 
These will be used, but `inputs` not nested.') return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) else: @@ -381,44 +566,31 @@ def _dump_node_yaml( computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') - node_dict = {} - metadata_dict = {} - - # Add actual node `@property`s to dictionary - for metadata_property in node_properties: - metadata_dict[metadata_property] = getattr(process_node, metadata_property) - - node_dict['Node data'] = metadata_dict - + metadata_dict = { + metadata_property: getattr(process_node, metadata_property) for metadata_property in node_properties + } + node_dict = {'Node data': metadata_dict} # Add user data - try: + with contextlib.suppress(AttributeError): node_dbuser = process_node.user - user_dict = {} - for user_property in user_properties: - user_dict[user_property] = getattr(node_dbuser, user_property) + user_dict = {user_property: getattr(node_dbuser, user_property) for user_property in user_properties} node_dict['User data'] = user_dict - except AttributeError: - pass # Add computer data - try: + with contextlib.suppress(AttributeError): node_dbcomputer = process_node.computer - computer_dict = {} - for computer_property in computer_properties: - computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) + computer_dict = { + computer_property: getattr(node_dbcomputer, computer_property) + for computer_property in computer_properties + } node_dict['Computer data'] = computer_dict - except AttributeError: - pass - # Add node attributes if self.include_attributes: node_attributes = process_node.base.attributes.all node_dict['Node attributes'] = node_attributes - # Add node extras if self.include_extras: - node_extras = process_node.base.extras.all - if node_extras: + if node_extras := process_node.base.extras.all: node_dict['Node extras'] = node_extras output_file = output_path.resolve() / output_filename diff --git a/src/aiida/tools/dumping/rich.py b/src/aiida/tools/dumping/rich.py new file mode 100644 index 0000000000..bbc89ef072 --- /dev/null +++ b/src/aiida/tools/dumping/rich.py @@ -0,0 +1,86 @@ +from aiida.cmdline.commands.cmd_data.cmd_export import data_export + +__all__ = ('DEFAULT_CORE_EXPORT_MAPPING', 'rich_from_cli', 'rich_from_config') + +DEFAULT_CORE_EXPORT_MAPPING = { + 'core.array': {'exporter': data_export, 'export_format': 'json'}, + 'core.array.bands': {'exporter': data_export, 'export_format': 'mpl_pdf'}, + 'core.array.kpoints': {'exporter': data_export, 'export_format': 'json'}, + 'core.array.projection': {'exporter': data_export, 'export_format': 'json'}, + 'core.array.trajectory': {'exporter': data_export, 'export_format': 'cif'}, + 'core.array.xy': {'exporter': data_export, 'export_format': 'json'}, + 'core.base': {'exporter': None, 'export_format': None}, + 'core.bool': {'exporter': None, 'export_format': None}, + 'core.cif': {'exporter': data_export, 'export_format': 'cif'}, + 'core.code': {'exporter': data_export, 'export_format': 'yaml'}, + 'core.code.containerized': {'exporter': data_export, 'export_format': 'yaml'}, + 'core.code.installed': {'exporter': data_export, 'export_format': 'yaml'}, + 'core.code.portable': {'exporter': data_export, 'export_format': 'yaml'}, + 'core.dict': {'exporter': None, 'export_format': None}, + 'core.enum': {'exporter': None, 'export_format': None}, + 'core.float': {'exporter': None, 'export_format': None}, + # TODO: Just use copy-tree + 'core.folder': {'exporter': None, 'export_format': None}, + 'core.int': {'exporter': None, 
'export_format': None}, + 'core.jsonable': { + 'exporter': data_export, + 'export_format': 'json', # duh + }, + 'core.list': {'exporter': None, 'export_format': None}, + 'core.numeric': {'exporter': None, 'export_format': None}, + 'core.orbital': {'exporter': None, 'export_format': None}, + # TODO: Here, try-except existance on remote and if so, dump it here locally + 'core.remote': {'exporter': None, 'export_format': None}, + 'core.remote.stash': {'exporter': None, 'export_format': None}, + 'core.remote.stash.folder': {'exporter': None, 'export_format': None}, + 'core.singlefile': {'exporter': None, 'export_format': None}, + 'core.str': {'exporter': None, 'export_format': None}, + 'core.structure': {'exporter': data_export, 'export_format': 'cif'}, + 'core.upf': {'exporter': data_export, 'export_format': 'upf'}, +} + + +def rich_from_cli(rich_spec, rich_dump_all): + # If all, also non-specified data types should be exported, then set the default exporter dict here + if rich_dump_all: + options_dict = DEFAULT_CORE_EXPORT_MAPPING + else: + options_dict = {} + + if rich_spec: + entries = rich_spec.split(',') + # print(f'ENTRIES: {entries}') + for entry in entries: + entry_list = entry.split(':') + entry_point = entry_list[0] + + # This is the case if no exporter explicitly provided, then we set it to the default exporter + exporter = entry_list[1] or DEFAULT_CORE_EXPORT_MAPPING[entry_point]['exporter'] + + # This is the case if no fileformat explicitly provided, then we set it to the default fileformat + export_format = entry_list[2] or DEFAULT_CORE_EXPORT_MAPPING[entry_point]['export_format'] + + # If it is provided, then the assignment is done with an equal sign and we resolve it + if '=' in export_format: + export_format = export_format.split('=')[1] + + # print(f'ENTRY_LIST: {entry_list}') + + options_dict[entry_point] = {'exporter': exporter, 'export_format': export_format} + + return options_dict + + +def rich_from_config(rich_spec, rich_dump_all): + if rich_dump_all: + options_dict = DEFAULT_CORE_EXPORT_MAPPING + else: + options_dict = {} + + for entry_point, spec in rich_spec.items(): + export_format = spec.get('format') or DEFAULT_CORE_EXPORT_MAPPING[entry_point]['export_format'] + exporter = spec.get('exporter') or DEFAULT_CORE_EXPORT_MAPPING[entry_point]['exporter'] + + options_dict[entry_point] = {'exporter': exporter, 'export_format': export_format} + + return options_dict diff --git a/src/aiida/tools/dumping/test-config-file.yaml b/src/aiida/tools/dumping/test-config-file.yaml new file mode 100644 index 0000000000..b868f8afcb --- /dev/null +++ b/src/aiida/tools/dumping/test-config-file.yaml @@ -0,0 +1,23 @@ +path: /home/geiger_j/aiida_projects/verdi-profile-dump/dev-dumps/storage-mirror +overwrite: true +incremental: true +dry_run: false +organize_by_groups: true +dump_processes: true +only_top_level_workflows: true +dump_data: true +calculations_hidden: true +data_hidden: true +also_raw: false +also_rich: true +include_inputs: true +include_outputs: true +include_attributes: true +include_extras: false +flat: false +rich_spec: + core.array.bands: + format: mpl_png + core.structure: + format: xsf +rich_dump_all: false diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index a631ac25e5..c4c1ac0fc1 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -14,6 +14,9 @@ import shutil from pathlib import Path +from rich.console import Console +from rich.table import Table + __all__ = ['prepare_dump_path'] logger = 
logging.getLogger(__name__) @@ -73,3 +76,93 @@ def prepare_dump_path( # Create directory if it doesn't exist or was removed path_to_validate.mkdir(exist_ok=True, parents=True) (path_to_validate / safeguard_file).touch() + + +def get_nodes_from_db(qb_instance, qb_filters: t.List | None = None, flat=False): + # Computers cannot be associated via `with_group` + # for qb_filter in qb_filters: + # qb.add_filter(**qb_filter) + + return_iterable = qb_instance.iterall() if qb_instance.count() > 10**3 else qb_instance.all() + + # Manual flattening as `iterall` doesn't have `flat` option unlike `all` + if flat: + return_iterable = [_[0] for _ in return_iterable] + + return return_iterable + + +# def validate_rich_options(rich_options, rich_config_file): +# if rich_options is not None and rich_config_file is not None: +# raise ValueError('Specify rich options either via CLI or config file, not both.') + +# else: +# logger.report('Neither `--rich-options` nor `--rich-config` set, using defaults.') + + +def dumper_pretty_print(dumper_instance, include_private_and_dunder: bool = False): + console = Console() + table = Table(title=f'Attributes and Methods of {dumper_instance.__class__.__name__}') + + # Adding columns to the table + table.add_column('Name', justify='left') + table.add_column('Type', justify='left') + table.add_column('Value', justify='left') + + # Lists to store attributes and methods + entries = [] + + # Iterate over the class attributes and methods + for attr_name in dir(dumper_instance): + # Exclude private attributes and dunder methods + attr_value = getattr(dumper_instance, attr_name) + entry_type = 'Attribute' if not callable(attr_value) else 'Method' + + if attr_name.startswith('_'): + if include_private_and_dunder: + entries.append((attr_name, entry_type, str(attr_value))) + else: + pass + else: + entries.append((attr_name, entry_type, str(attr_value))) + + # Sort entries: attributes first, then methods + entries.sort(key=lambda x: (x[1] == 'Method', x[0])) + + # Add sorted entries to the table + for name, entry_type, value in entries: + table.add_row(name, entry_type, value) + + # Print the formatted table + console.print(table) + + +# def check_storage_size_user(): +# from aiida.manage.manager import get_manager + +# manager = get_manager() +# storage = manager.get_profile_storage() + +# data = storage.get_info(detailed=True) +# repository_data = data['repository']['Size (MB)'] +# total_size_gb = sum(repository_data.values()) / 1024 +# if total_size_gb > 10: +# user_input = ( +# input('Repository size larger than 10gb. Do you still want to dump the profile data?
(y/N): ') +# .strip() +# .lower() +# ) + +# if user_input != 'y': +# sys.exit() + + +def sanitize_file_extension(filename: str | Path): + if isinstance(filename, Path): + filename = str(filename) + if filename.endswith('.mpl_pdf'): + filename = filename.replace('.mpl_pdf', '.pdf') + if filename.endswith('.mpl_png'): + filename = filename.replace('.mpl_png', '.png') + + return Path(filename) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 88dad0323e..c409a438dc 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -404,6 +404,7 @@ def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workcha process_dumper = ProcessDumper(include_attributes=False, include_extras=False) + (tmp_path / node_metadata_file).unlink() process_dumper._dump_node_yaml(process_node=wc_node, output_path=tmp_path) # Open the dumped YAML file and read its contents From 65214e8e323f19a5a6c36af72c460f98fca82e2e Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 23 Jan 2025 17:06:44 +0100 Subject: [PATCH 03/27] Mirroring of workflows and calculations works Either in groups, or not associated with any group. Either sorted by groups, or in a top-level flat hierarchy. "De-duplication" works by symlinking calculations if they are part of a workflow. Next, check what happens if a workflow is part of two groups -> Here, de-deplucation should actually make more sense. --- docs/source/reference/command_line.rst | 1 + src/aiida/cmdline/commands/cmd_profile.py | 132 +++------ src/aiida/cmdline/params/options/main.py | 142 +--------- src/aiida/tools/dumping/collection.py | 253 +++++++----------- src/aiida/tools/dumping/parser.py | 2 - src/aiida/tools/dumping/processes.py | 54 ++-- src/aiida/tools/dumping/test-config-file.yaml | 1 - 7 files changed, 179 insertions(+), 406 deletions(-) diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst index 1e8370e5e8..283993fac9 100644 --- a/docs/source/reference/command_line.rst +++ b/docs/source/reference/command_line.rst @@ -398,6 +398,7 @@ Below is a list with all available subcommands. configure-rabbitmq Configure RabbitMQ for a profile. delete Delete one or more profiles. list Display a list of all available profiles. + mirror Dump all data in an AiiDA profile's storage to disk. set-default Set a profile as the default profile. setdefault (Deprecated) Set a profile as the default profile. setup Set up a new profile. 
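As an illustration of the `--rich-spec` option used by the `verdi profile mirror` command below, here is a minimal sketch of how `rich_from_cli` (added in `src/aiida/tools/dumping/rich.py` above) resolves a spec string. The spec values are hypothetical examples, and the ':'-separated syntax is inferred from the parsing code; empty fields fall back to `DEFAULT_CORE_EXPORT_MAPPING`:

    from aiida.tools.dumping.rich import rich_from_cli

    # Hypothetical spec string; each comma-separated entry is '<entry_point>:<exporter>:<export_format>'.
    # Empty fields fall back to the defaults, and the format may also be given as a '<key>=<value>' assignment.
    spec_dict = rich_from_cli(rich_spec='core.structure::cif,core.array.bands::export_format=mpl_png', rich_dump_all=False)
    assert spec_dict['core.structure']['export_format'] == 'cif'
    assert spec_dict['core.array.bands']['export_format'] == 'mpl_png'

With `rich_dump_all=True`, the remaining `core.*` entry points would keep their default exporters and file formats from `DEFAULT_CORE_EXPORT_MAPPING`.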
diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index f035977f2c..48fc6e98a3 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -277,39 +277,36 @@ def profile_delete(force, delete_data, profiles): @verdi_profile.command('mirror') @options.PATH() @options.OVERWRITE() -@options.INCREMENTAL() -@options.ORGANIZE_BY_GROUPS() -@options.DRY_RUN() +# @options.INCREMENTAL() @options.DUMP_PROCESSES() -@options.ONLY_TOP_LEVEL_WORKFLOWS() @options.DUMP_DATA() @options.DEDUPLICATE() -@options.DATA_HIDDEN() -@options.ALSO_RAW() -@options.ALSO_RICH() @options.INCLUDE_INPUTS() @options.INCLUDE_OUTPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() @options.FLAT() +@options.ALSO_RAW() +@options.ALSO_RICH() @options.RICH_SPEC() @options.RICH_DUMP_ALL() @options.DUMP_CONFIG_FILE() @options.NODES() @options.GROUPS() +@options.ORGANIZE_BY_GROUPS() +@options.ONLY_TOP_LEVEL_WORKFLOWS() +@options.DRY_RUN() @click.pass_context -def storage_mirror( +def profile_mirror( ctx, path, overwrite, - incremental, organize_by_groups, dry_run, dump_processes, only_top_level_workflows, dump_data, deduplicate, - data_hidden, also_raw, also_rich, include_inputs, @@ -325,6 +322,7 @@ def storage_mirror( ): """Dump all data in an AiiDA profile's storage to disk.""" + from pathlib import Path from aiida import orm from aiida.tools.dumping.parser import DumpConfigParser @@ -337,39 +335,14 @@ def storage_mirror( profile = ctx.obj['profile'] - # from aiida.manage.manager import get_manager - - # manager = get_manager() - # storage = manager.get_profile_storage() - - # with spinner(): - # data = storage.get_info(detailed=True) - - # echo.echo_dictionary(data, sort_keys=False, fmt='yaml') - - # print(f"Profile name: {profile_name}") - - # # TODO: export computers alone, and groups - # t1 = time.time() - # qb = orm.QueryBuilder().append(orm.Node, tag='node', project=['uuid']) - # all_uuids = qb.all(flat=True) - # print(f"All UUIDs retrieved in {time.time() - t1:6.3f} s.") - - # t1 = time.time() - # with open('all-source-uuids.json', 'w') as fhandle: - # json.dump({'profile_name': profile_name, 'uuids': all_uuids}, fhandle) - # print(f"{len(all_uuids)} UUIDs written in {time.time() - t1:6.3f} s.") - if nodes and groups: echo.echo_critical('`nodes` and `groups` specified. Set only one.') - # if all_entries and groups: - # echo.echo_critical('`all_entries` and `groups` specified. Set only one.') if dump_config_file is None: general_kwargs = { 'path': path, 'overwrite': overwrite, - 'incremental': incremental, + # 'incremental': incremental, 'dry_run': dry_run, } @@ -379,19 +352,18 @@ def storage_mirror( 'include_attributes': include_attributes, 'include_extras': include_extras, 'flat': flat, - # "calculations_hidden": calculations_hidden } datadumper_kwargs = { 'also_raw': also_raw, 'also_rich': also_rich, - 'data_hidden': data_hidden, } collection_kwargs = { 'should_dump_processes': dump_processes, 'should_dump_data': dump_data, 'only_top_level_workflows': only_top_level_workflows, + 'organize_by_groups': organize_by_groups, } rich_kwargs = { @@ -420,13 +392,10 @@ def storage_mirror( path = general_kwargs['path'] overwrite = general_kwargs['overwrite'] dry_run = general_kwargs['dry_run'] - incremental = general_kwargs['incremental'] - - if not overwrite and incremental: - echo.echo_report('Overwrite set to false, but incremental dumping selected. 
Will keep existing directories.') + incremental = not overwrite - if not str(path).endswith(profile.name): - path /= profile.name + if path is None: + path = Path.cwd() / f'{profile.name}-mirror' # TODO: Implement proper dry-run feature dry_run_message = f"Dry run for dumping of profile `{profile.name}`'s data at path: `{path}`.\n" @@ -439,7 +408,7 @@ def storage_mirror( else: echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") - SAFEGUARD_FILE = '.verdi_storage_dump' # noqa: N806 + SAFEGUARD_FILE = '.verdi_profile_mirror' # noqa: N806 try: prepare_dump_path( @@ -471,25 +440,15 @@ def storage_mirror( ) # dumper_pretty_print(process_dumper) - from aiida.tools.dumping.incremental import DumpNodeCollector - - dumpnodecollector = DumpNodeCollector(dump_parent_path=path) - - dumpnodecollector.update_uuids_before_dump() - dumpnodecollector.create_organized_uuid_dicts() - # dumpnodecollector.populate_uuid_dict() - - # raise SystemExit() - # TODO: Possibly implement specifying specific computers # TODO: Although, users could just specify the relevant nodes # TODO: Also add option to specify node types via entry points + # TODO: Use `batch_iter` from aiida.tools.archive.common # === Dump the data that is not associated with any group === if not groups: collection_dumper = CollectionDumper( dump_parent_path=path, - output_path=path, overwrite=overwrite, incremental=incremental, nodes=nodes, @@ -505,13 +464,13 @@ def storage_mirror( if dump_processes and collection_dumper._should_dump_processes(): echo.echo_report(f'Dumping processes not in any group for profile `{profile.name}`...') collection_dumper.dump_processes() + if dump_data: if not also_rich and not also_raw: echo.echo_critical('`--dump-data was given, but neither --also-raw or --also-rich specified.') echo.echo_report(f'Dumping data not in any group for profile {profile.name}...') - collection_dumper.dump_data_rich() - # collection_dumper.dump_plugin_data() + # collection_dumper.dump_data_rich() # === Dump data per-group if Groups exist in profile or are selected === # TODO: Invert default behavior here, as I typically want to dump all entries @@ -520,34 +479,29 @@ def storage_mirror( if not groups: # and all_entries: groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - if groups is not None and not nodes: - for group in groups: - if organize_by_groups: - group_subdir = Path(*group.type_string.split('.')) - group_path = path / 'groups' / group_subdir / group.label - else: - group_path = path - - collection_dumper = CollectionDumper( - dump_parent_path=path, - output_path=group_path, - overwrite=overwrite, - incremental=incremental, - group=group, - **collection_kwargs, - **rich_kwargs, - process_dumper=process_dumper, - data_dumper=data_dumper, - ) - collection_dumper.create_entity_counter() - if dump_processes: - # The additional `_should_dump_processes` check here ensures that no reporting like - # "Dumping processes for group `SSSP/1.3/PBE/efficiency`" is printed for groups that - # don't contain processes - if collection_dumper._should_dump_processes(): - echo.echo_report(f'Dumping processes for group `{group.label}`...') - collection_dumper.dump_processes() - if dump_data: - echo.echo_report(f'Dumping data for group `{group.label}`...') - collection_dumper.dump_data_rich() - # collection_dumper.dump_plugin_data() \ No newline at end of file + if groups: + if not nodes: + for group in groups: + collection_dumper = CollectionDumper( + dump_parent_path=path, + overwrite=overwrite, + 
incremental=incremental, + group=group, + **collection_kwargs, + **rich_kwargs, + process_dumper=process_dumper, + data_dumper=data_dumper, + deduplicate=deduplicate, + ) + + collection_dumper.create_entity_counter() + if dump_processes: + # The additional `_should_dump_processes` check here ensures that no reporting like + # "Dumping processes for group `SSSP/1.3/PBE/efficiency`" is printed for groups that + # don't contain processes + if collection_dumper._should_dump_processes(): + echo.echo_report(f'Dumping processes for group `{group.label}`...') + collection_dumper.dump_processes() + if dump_data: + echo.echo_report(f'Dumping data for group `{group.label}`...') + collection_dumper.dump_data_rich() diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 6f19a3c465..4d1c308c43 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -818,7 +818,7 @@ def set_log_level(ctx, _param, value): DUMP_DATA = OverridableOption( '--dump-data/--no-dump-data', is_flag=True, - default=True, + default=False, type=bool, show_default=True, help='Dump data nodes in a dedicated directory.', @@ -833,7 +833,7 @@ def set_log_level(ctx, _param, value): ) ALSO_RAW = OverridableOption( - '--also-raw/--not-also-raw', + '--also-raw/--no-also-raw', is_flag=True, default=False, show_default=True, @@ -841,9 +841,9 @@ def set_log_level(ctx, _param, value): ) ALSO_RICH = OverridableOption( - '--also-rich/--not-also-rich', + '--also-rich/--no-also-rich', is_flag=True, - default=True, + default=False, show_default=True, help='Dump also nicely prepared outputs, e.g. CIF for structures or PDF image for bands.', ) @@ -884,14 +884,14 @@ def set_log_level(ctx, _param, value): '--include-inputs/--exclude-inputs', default=True, show_default=True, - help='Include the linked input nodes of the `CalculationNode`(s).', + help='Include linked input nodes of `CalculationNode`(s).', ) INCLUDE_OUTPUTS = OverridableOption( '--include-outputs/--exclude-outputs', default=False, show_default=True, - help='Include the linked output nodes of the `CalculationNode`(s).', + help='Include linked output nodes of `CalculationNode`(s).', ) INCLUDE_ATTRIBUTES = OverridableOption( @@ -917,7 +917,7 @@ def set_log_level(ctx, _param, value): ) ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( - '--only-top-level-workflows/--not-only-top-level-workflows', + '--only-top-level-workflows/--no-only-top-level-workflows', is_flag=True, default=True, type=bool, @@ -925,56 +925,6 @@ def set_log_level(ctx, _param, value): help='Dump only the top-level workflows in their own dedicated directories.', ) -DUMP_PROCESSES = OverridableOption( - '--dump-processes/--no-dump-processes', - is_flag=True, - default=True, - show_default=True, - help='Dump process data.', -) - -DUMP_DATA = OverridableOption( - '--dump-data/--no-dump-data', - is_flag=True, - default=True, - type=bool, - show_default=True, - help='Dump data nodes in a dedicated directory.', -) - -CALCULATIONS_HIDDEN = OverridableOption( - '--calculations-hidden/--calculations-non-hidden', - is_flag=True, - default=True, - type=bool, - show_default=True, - help='Dump all `orm.CalculationNode`s in the hidden directory and link to there.', -) - -DATA_HIDDEN = OverridableOption( - '--data-hidden/--data-non-hidden', - is_flag=True, - default=True, - show_default=True, - help='Dump all `orm.Data` in the hidden directory and link to there.', -) - -ALSO_RAW = OverridableOption( - '--also-raw/--not-also-raw', - is_flag=True, - 
default=False, - show_default=True, - help='Dump the `attributes` of all nodes related to the Process.', -) - -ALSO_RICH = OverridableOption( - '--also-rich/--not-also-rich', - is_flag=True, - default=True, - show_default=True, - help='Dump also nicely prepared outputs, e.g. CIF for structures or PDF image for bands.', -) - INCREMENTAL = OverridableOption( '--incremental/--no-incremental', is_flag=True, @@ -989,81 +939,3 @@ def set_log_level(ctx, _param, value): type=str, help='Specifications for rich data dumping.', ) - -DUMP_CONFIG_FILE = OverridableOption( - '--dump-config-file', - default=None, - type=types.FileOrUrl(), - help='Provide dumping options via a config file in YAML format.', -) - -RICH_DUMP_ALL = OverridableOption( - '--rich-dump-all/--no-rich-dump-all', - default=True, - is_flag=True, - type=bool, - show_default=True, - help='If a rich specification is provided, this triggers if all other Data nodes should also be dumped or not.', -) - -ORGANIZE_BY_GROUPS = OverridableOption( - '--organize-by-groups/--no-organize-by-groups', - default=True, - is_flag=True, - type=bool, - show_default=True, - help='If the collection of nodes to be dumped is organized in groups, reproduce its hierarchy.', -) - -INCLUDE_INPUTS = OverridableOption( - '--include-inputs/--exclude-inputs', - default=True, - show_default=True, - help='Include the linked input nodes of the `CalculationNode`(s).', -) - -INCLUDE_OUTPUTS = OverridableOption( - '--include-outputs/--exclude-outputs', - default=False, - show_default=True, - help='Include the linked output nodes of the `CalculationNode`(s).', -) - -INCLUDE_ATTRIBUTES = OverridableOption( - '--include-attributes/--exclude-attributes', - default=True, - show_default=True, - help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) - -INCLUDE_EXTRAS = OverridableOption( - '--include-extras/--exclude-extras', - default=True, - show_default=True, - help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) - -FLAT = OverridableOption( - '-f', - '--flat', - is_flag=True, - default=False, - help='Dump files in a flat directory for every step of a workflow.', -) - -ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( - '--only-top-level-workflows/--not-only-top-level-workflows', - is_flag=True, - default=True, - type=bool, - show_default=True, - help='Dump only the top-level workflows in their own dedicated directories.', -) - -INCREMENTAL = OverridableOption( - '--incremental/--non-incremental', - is_flag=True, - default=True, - show_default=True, - help='Dump files incrementally when dumping collections of data to disk.', -) diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 169f5b3862..9cf0dccde6 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -10,17 +10,14 @@ from __future__ import annotations -import contextlib import itertools as it import logging -import os from collections import Counter from pathlib import Path from aiida import orm from aiida.tools.dumping.data import DataDumper from aiida.tools.dumping.processes import ProcessDumper -from aiida.tools.dumping.utils import sanitize_file_extension logger = logging.getLogger(__name__) @@ -34,41 +31,46 @@ class CollectionDumper: def __init__( self, - *args, dump_parent_path: Path = Path().cwd(), - output_path: Path = Path().cwd(), + nodes: set = {}, + group: orm.Group | str | None = None, overwrite: bool = False, incremental: bool = True, 
should_dump_processes: bool = False, should_dump_data: bool = False, only_top_level_workflows: bool = True, - group: orm.Group | None = None, - nodes: set = {}, + rich_dump_all: bool = True, + deduplicate: bool = True, + organize_by_groups: bool = True, process_dumper: ProcessDumper | None = None, data_dumper: DataDumper | None = None, - **kwargs, ): - self.args = args self.dump_parent_path = dump_parent_path - self.output_path = output_path self.overwrite = overwrite self.incremental = incremental self.should_dump_processes = should_dump_processes self.should_dump_data = should_dump_data self.only_top_level_workflows = only_top_level_workflows self.nodes = nodes + self.deduplicate = deduplicate + self.process_dumper = process_dumper self.data_dumper = data_dumper - self.kwargs = kwargs - - self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' # Allow passing of group via label if isinstance(group, str): - group = orm.Group.get(self.group) + group = orm.Group.get(group) + self.group = group - self.output_path = output_path + if organize_by_groups: + if group is not None: + group_subdir = Path(*group.type_string.split('.')) + self.output_path = self.dump_parent_path / 'groups' / group_subdir / self.group.label + else: + self.output_path = self.dump_parent_path / 'no-group' + else: + self.output_path = self.dump_parent_path if not hasattr(self, 'entity_counter'): self.create_entity_counter() @@ -149,31 +151,27 @@ def _should_dump_processes(self) -> bool: else: return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 - def _dump_calculations_hidden(self, calculations): - # ? Dump only top-level workchains, as that includes sub-workchains already - + def _dump_calculations(self, calculations): for calculation in calculations: calculation_dumper = self.process_dumper - calculation_dump_path = self.hidden_aiida_path / 'calculations' / calculation.uuid + calculation_dump_path = ( + self.output_path + / 'calculations' + / calculation_dumper._generate_default_dump_path(process_node=calculation, prefix='') + ) - # if not self.dry_run: - # with contextlib.suppress(FileExistsError): - try: + if calculation.caller is None or (calculation.caller is not None and self.deduplicate): calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - except: - raise - - # # To make development quicker - # if iworkflow_ > 1: - # break - def _dump_link_workflows(self, workflows, link_calculations: bool = True): + def _dump_workflows(self, workflows): # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) for workflow in workflows: + # if workflow.pk == 47: + # breakpoint() + workflow_dumper = self.process_dumper - link_calculations_dir = self.hidden_aiida_path / 'calculations' # TODO: If the GroupDumper is called from somewhere else outside, prefix the path with `groups/core` etc workflow_dump_path = ( self.output_path @@ -181,36 +179,14 @@ def _dump_link_workflows(self, workflows, link_calculations: bool = True): / workflow_dumper._generate_default_dump_path(process_node=workflow, prefix=None) ) # logger.report(f'WORKFLOW_DUMP_PATH: {workflow_dump_path}') - workflow_dumper._dump_workflow( workflow_node=workflow, output_path=workflow_dump_path, - link_calculations=link_calculations, - link_calculations_dir=link_calculations_dir, + link_calculations=self.deduplicate, + link_calculations_dir=self.output_path / 'calculations', ) - def _link_calculations_hidden(self, calculations): - # 
calculation_nodes = get_nodes_from_db(aiida_node_type=orm.CalculationNode, with_group=self.group, flat=True) - for calculation_node in calculations: - calculation_dumper = self.process_dumper - - link_calculations_dir = self.hidden_aiida_path / 'calculations' - - calculation_dump_path = self.output_path / 'calculations' - calculation_dump_path.mkdir(parents=True, exist_ok=True) - calculation_dump_path = calculation_dump_path / calculation_dumper._generate_default_dump_path( - process_node=calculation_node - ) - - with contextlib.suppress(FileExistsError): - os.symlink(link_calculations_dir / calculation_node.uuid, calculation_dump_path) - def dump_processes(self): - # ? Here, these could be all kinds of entities that could be grouped in AiiDA - # if len(self.entities_to_dump) > 0: - # pass - # # nodes = self.entities_to_dump - # else: nodes = self.get_collection_nodes() workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] @@ -232,107 +208,72 @@ def dump_processes(self): self.output_path.mkdir(exist_ok=True, parents=True) - print(f'self.process_dumper.calculations_hidden: {self.process_dumper.calculations_hidden}') - print(f'self.output_path: {self.output_path}') - if self.process_dumper.calculations_hidden: - print('dump hidden') - self._dump_calculations_hidden(calculations=calculations) - self._dump_link_workflows(workflows=workflows) - self._link_calculations_hidden(calculations=calculations) - else: - print('dump non-hidden') - for workflow in workflows: - workflow_path = ( - self.output_path - / 'workflows' - / self.process_dumper._generate_default_dump_path(process_node=workflow) - ) - self.process_dumper.dump(process_node=workflow, output_path=workflow_path) + self._dump_calculations(calculations=calculations) + self._dump_workflows(workflows=workflows) # TODO: Add `dump_data_raw` here, as well - def dump_data_rich(self): - nodes = self.get_collection_nodes() - nodes = [node for node in nodes if isinstance(node, (orm.Data, orm.Computer))] - # Here, when providing logic to set the exporters and fileformat via the rich-options, don't have to filter - # anymore for `core` - nodes = [node for node in nodes if node.entry_point.name.startswith('core')] - if len(nodes) == 0: - return - - self.output_path.mkdir(exist_ok=True, parents=True) - data_dumper = self.data_dumper - - for data_node in nodes: - node_entry_point_name = data_node.entry_point.name - - # Get the fileformat and exporter for the data node - try: - fileformat = data_dumper.rich_spec_dict[node_entry_point_name]['export_format'] - exporter = data_dumper.rich_spec_dict[node_entry_point_name]['exporter'] - - # If options for the rich dumping are specified and not all the other defaults are being used - # Some entry_points might not be inside the `rich_spec_dict` - except KeyError: - continue - - except: - # Raise all exceptions here during development - raise - - # Don't go further if no importer implemented for a data type anyway - if exporter is None: - continue - - try: - # Generate a nice filename and sanitize it - nice_output_path = self.output_path / 'data' / data_node.__class__.__name__.lower() - nice_fname = data_dumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat).replace( - '__', '_' - ) - nice_fname = sanitize_file_extension(nice_fname) - - if data_dumper.data_hidden: - # Define paths for hidden dump and linking - hidden_output_path = self.hidden_aiida_path / 'data' / data_node.__class__.__name__.lower() - uuid_fname = 
sanitize_file_extension(f'{data_node.uuid}.{fileformat}') - - # Dump the data in the hidden directory - data_dumper.dump_core_data_node_rich(data_node, hidden_output_path, uuid_fname) - - # Link the hidden file to the expected output path - (nice_output_path / nice_fname).parent.mkdir(exist_ok=True, parents=True) - os.symlink(hidden_output_path / uuid_fname, nice_output_path / nice_fname) - - else: - # Dump the data in the non-hidden directory - data_dumper.dump_core_data_node_rich(data_node, nice_output_path, nice_fname) - - except TypeError: - # Handle case when no exporter is implemented for a given data_node type - raise - except OSError: - # A Data node, e.g. a Code might already be existent, so don't worry about this exception - continue - except Exception: - raise - - def dump_plugin_data(self): - return - # from importlib.metadata import entry_points - - # plugin_data_entry_points = [entry_point.name for entry_point in entry_points(group='aiida.data')] - # # print(plugin_data_entry_points) - # # print(self.entity_counter) - # from aiida.manage.manager import get_manager - - # manager = get_manager() - # storage = manager.get_profile_storage() - # orm_entities = storage.get_orm_entities(detailed=True)['Nodes']['node_types'] - # non_core_data_entities = [ - # orm_entity - # for orm_entity in orm_entities - # if orm_entity.startswith('data') and not orm_entity.startswith('data.core') - # ] - # # TODO: Implement dumping here. Stashed for now, as both `HubbardStructureData` and `UpfData` I wanted to use - # # TODO: for testing don't implement `export` either way - # # print(non_core_data_entities) + # def dump_data_rich(self): + # nodes = self.get_collection_nodes() + # nodes = [node for node in nodes if isinstance(node, (orm.Data, orm.Computer))] + # # Here, when providing logic to set the exporters and fileformat via the rich-options, don't have to filter + # # anymore for `core` + # nodes = [node for node in nodes if node.entry_point.name.startswith('core')] + # if len(nodes) == 0: + # return + + # self.output_path.mkdir(exist_ok=True, parents=True) + # data_dumper = self.data_dumper + + # for data_node in nodes: + # node_entry_point_name = data_node.entry_point.name + + # # Get the fileformat and exporter for the data node + # try: + # fileformat = data_dumper.rich_spec_dict[node_entry_point_name]['export_format'] + # exporter = data_dumper.rich_spec_dict[node_entry_point_name]['exporter'] + + # # If options for the rich dumping are specified and not all the other defaults are being used + # # Some entry_points might not be inside the `rich_spec_dict` + # except KeyError: + # continue + + # except: + # # Raise all exceptions here during development + # raise + + # # Don't go further if no importer implemented for a data type anyway + # if exporter is None: + # continue + + # try: + # # Generate a nice filename and sanitize it + # nice_output_path = self.output_path / 'data' / data_node.__class__.__name__.lower() + # nice_fname = data_dumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat).replace( + # '__', '_' + # ) + # nice_fname = sanitize_file_extension(nice_fname) + + # if data_dumper.data_hidden: + # # Define paths for hidden dump and linking + # hidden_output_path = self.hidden_aiida_path / 'data' / data_node.__class__.__name__.lower() + # uuid_fname = sanitize_file_extension(f'{data_node.uuid}.{fileformat}') + + # # Dump the data in the hidden directory + # data_dumper.dump_core_data_node_rich(data_node, hidden_output_path, uuid_fname) + + # # Link the 
hidden file to the expected output path + # (nice_output_path / nice_fname).parent.mkdir(exist_ok=True, parents=True) + # os.symlink(hidden_output_path / uuid_fname, nice_output_path / nice_fname) + + # else: + # # Dump the data in the non-hidden directory + # data_dumper.dump_core_data_node_rich(data_node, nice_output_path, nice_fname) + + # except TypeError: + # # Handle case when no exporter is implemented for a given data_node type + # raise + # except OSError: + # # A Data node, e.g. a Code might already be existent, so don't worry about this exception + # continue + # except Exception: + # raise diff --git a/src/aiida/tools/dumping/parser.py b/src/aiida/tools/dumping/parser.py index cc19b0f141..39288929e6 100644 --- a/src/aiida/tools/dumping/parser.py +++ b/src/aiida/tools/dumping/parser.py @@ -25,13 +25,11 @@ def parse_config_file(config_file: str | Path | None) -> dict: 'include_attributes': config.get('include_attributes', True), 'include_extras': config.get('include_extras', False), 'flat': config.get('flat', False), - 'calculations_hidden': config.get('calculations_hidden', True), } datadumper_kwargs = { 'also_raw': config.get('also_raw', False), 'also_rich': config.get('also_rich', True), - 'data_hidden': config.get('data_hidden', True), } collection_kwargs = { diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 29fbef07c9..fcc8671ff6 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -58,7 +58,6 @@ def __init__( overwrite: bool = False, incremental: bool = True, flat: bool = False, - calculations_hidden: bool = True, include_inputs: bool = True, include_outputs: bool = False, include_attributes: bool = True, @@ -66,7 +65,7 @@ def __init__( rich_options: str = '', rich_config_file: Path | None = None, rich_dump_all: bool = True, - data_dumper: DataDumper | None = DataDumper(), + data_dumper: DataDumper = DataDumper(), dump_unsealed: bool = False, **kwargs, ) -> None: @@ -86,8 +85,6 @@ def __init__( self.kwargs = kwargs self.dump_unsealed = dump_unsealed - self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' - @staticmethod def _generate_default_dump_path(process_node: ProcessNode, prefix: str = 'dump') -> Path: """Simple helper function to generate the default parent-dumping directory if none given. @@ -99,13 +96,21 @@ def _generate_default_dump_path(process_node: ProcessNode, prefix: str = 'dump') :return: The absolute default parent dump path. 
""" - pk = process_node.pk - # TODO: Use UUID[:8] here + entities_to_dump = [] + + if prefix: + # No '' and None + entities_to_dump += [prefix] + try: - return Path(f'{prefix}-{process_node.process_label}-{pk}') + entities_to_dump += [process_node.process_label] except AttributeError: # This case came up during testing, not sure how relevant it actually is - return Path(f'{prefix}-{process_node.process_type}-{pk}') + entities_to_dump += [process_node.process_type] + + entities_to_dump += [str(process_node.pk)] + + return Path('-'.join(entities_to_dump)) @staticmethod def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: @@ -171,7 +176,7 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: (output_path / 'README.md').write_text(_readme_string) @staticmethod - def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str: + def _generate_child_node_label(index: int, link_triple: LinkTriple, append_pk: bool = True) -> str: """Small helper function to generate and clean directory label for child nodes during recursion. :param index: Index assigned to step at current level of recursion. @@ -194,6 +199,9 @@ def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str: if process_type is not None and process_type != link_label: label_list += [process_type] + if append_pk: + label_list += [str(node.pk)] + node_label = '-'.join(label_list) # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove node_label = node_label.replace('CALL-', '') @@ -260,7 +268,7 @@ def _dump_workflow( output_path: Path, io_dump_paths: List[str | Path] | None = None, link_calculations: bool = False, - link_calculations_dir: str | None = None, + link_calculations_dir: Path | None = None, ) -> None: """Recursive function to traverse a `WorkflowNode` and dump its `CalculationNode` s. @@ -302,7 +310,10 @@ def _dump_workflow( ) else: try: - os.symlink(link_calculations_dir / child_node.uuid, child_output_path) + calculation_dump_path = link_calculations_dir / ProcessDumper._generate_default_dump_path( + process_node=child_node, prefix='' + ) + os.symlink(calculation_dump_path, child_output_path) except FileExistsError: pass @@ -354,22 +365,19 @@ def _dump_calculation( output_path=output_path / io_dump_mapping.inputs, link_triples=input_links ) - if self.data_dumper.also_rich: rich_data_output_path = output_path / io_dump_mapping.inputs - # if not self.data_dumper.data_hidden: - # rich_data_output_path = output_path / io_dump_mapping.inputs - # else: - # # TODO: Currently, when dumping only one selected workflow, if rich dumping is activated, but - # # TODO: `data-hidden` is set, no data nodes were actually being dumped - # # TODO: With the current implementation below, they are dumped, but not in the same structure as for the - # # TODO: `dump_rich_core` function. Quick fix for now - # pass + # if not self.data_dumper.data_hidden: + # rich_data_output_path = output_path / io_dump_mapping.inputs + # else: + # # TODO: Currently, when dumping only one selected workflow, if rich dumping is activated, but + # # TODO: `data-hidden` is set, no data nodes were actually being dumped + # # TODO: With the current implementation below, they are dumped, but not in the same structure as for the + # # TODO: `dump_rich_core` function. 
Quick fix for now + # pass # Only dump the rich data output files in the process directories if data_hidden is False - self._dump_calculation_io_files_rich( - output_path=rich_data_output_path, link_triples=input_links - ) + self._dump_calculation_io_files_rich(output_path=rich_data_output_path, link_triples=input_links) # Dump the node_outputs apart from `retrieved` if self.include_outputs: output_links = list(calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)) diff --git a/src/aiida/tools/dumping/test-config-file.yaml b/src/aiida/tools/dumping/test-config-file.yaml index b868f8afcb..63bbe38180 100644 --- a/src/aiida/tools/dumping/test-config-file.yaml +++ b/src/aiida/tools/dumping/test-config-file.yaml @@ -6,7 +6,6 @@ organize_by_groups: true dump_processes: true only_top_level_workflows: true dump_data: true -calculations_hidden: true data_hidden: true also_raw: false also_rich: true From 17f2730ec87030896f3bf3fcb12708f68381a2d7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:08:52 +0000 Subject: [PATCH 04/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/cmdline/commands/cmd_process.py | 34 +++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index ed0b3ccdd0..842232fc8e 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -558,7 +558,7 @@ def process_repair(manager, broker, dry_run): echo.echo_report(f'Revived process `{pid}`') -@verdi_process.command("dump") +@verdi_process.command('dump') @arguments.PROCESS() @options.PATH() @options.OVERWRITE() @@ -613,30 +613,30 @@ def process_dump( node data for further inspection. """ + from aiida.tools.archive.exceptions import ExportValidationError from aiida.tools.dumping.data import DataDumper from aiida.tools.dumping.processes import ProcessDumper - from aiida.tools.archive.exceptions import ExportValidationError # from aiida.tools.dumping.utils import validate_rich_options from aiida.tools.dumping.rich import rich_from_cli processdumper_kwargs = { - "include_inputs": include_inputs, - "include_outputs": include_outputs, - "include_attributes": include_attributes, - "include_extras": include_extras, - "flat": flat, - "dump_unsealed": dump_unsealed, - "incremental": incremental, + 'include_inputs': include_inputs, + 'include_outputs': include_outputs, + 'include_attributes': include_attributes, + 'include_extras': include_extras, + 'flat': flat, + 'dump_unsealed': dump_unsealed, + 'incremental': incremental, } rich_kwargs = { - "rich_dump_all": rich_dump_all, + 'rich_dump_all': rich_dump_all, } datadumper_kwargs = { - "also_raw": also_raw, - "also_rich": also_rich, + 'also_raw': also_raw, + 'also_rich': also_rich, } # if also_rich: @@ -672,15 +672,13 @@ def process_dump( output_path=path, ) echo.echo_success( - f"Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`." + f'Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`.' ) except FileExistsError: echo.echo_critical( - "Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually." + 'Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually.' 
) except ExportValidationError as e: - echo.echo_critical(f"{e!s}") + echo.echo_critical(f'{e!s}') except Exception as e: - echo.echo_critical( - f"Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s})." - ) + echo.echo_critical(f'Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s}).') From 0d37e59ec4d0b68890d5a0215de1fd82ddcce85d Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 27 Jan 2025 13:31:28 +0100 Subject: [PATCH 05/27] Major code refactor Add `BaseDumper`, `ProfileDumper` and `CollecionDumper` -> `GroupDumper` Remove code related to data and rich dumping --- src/aiida/cmdline/commands/cmd_process.py | 37 +-- src/aiida/cmdline/commands/cmd_profile.py | 196 +++--------- src/aiida/cmdline/params/options/main.py | 10 - src/aiida/tools/dumping/__init__.py | 10 +- src/aiida/tools/dumping/base.py | 25 ++ src/aiida/tools/dumping/data.py | 292 ------------------ .../tools/dumping/{collection.py => group.py} | 159 ++-------- src/aiida/tools/dumping/parser.py | 1 - .../dumping/{processes.py => process.py} | 202 ++---------- src/aiida/tools/dumping/profile.py | 102 ++++++ src/aiida/tools/dumping/test-config-file.yaml | 1 - tests/tools/dumping/test_processes.py | 2 +- 12 files changed, 240 insertions(+), 797 deletions(-) create mode 100644 src/aiida/tools/dumping/base.py delete mode 100644 src/aiida/tools/dumping/data.py rename src/aiida/tools/dumping/{collection.py => group.py} (52%) rename src/aiida/tools/dumping/{processes.py => process.py} (71%) create mode 100644 src/aiida/tools/dumping/profile.py diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 842232fc8e..30e75a4295 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -614,11 +614,7 @@ def process_dump( """ from aiida.tools.archive.exceptions import ExportValidationError - from aiida.tools.dumping.data import DataDumper - from aiida.tools.dumping.processes import ProcessDumper - - # from aiida.tools.dumping.utils import validate_rich_options - from aiida.tools.dumping.rich import rich_from_cli + from aiida.tools.dumping.process import ProcessDumper processdumper_kwargs = { 'include_inputs': include_inputs, @@ -630,40 +626,9 @@ def process_dump( 'incremental': incremental, } - rich_kwargs = { - 'rich_dump_all': rich_dump_all, - } - - datadumper_kwargs = { - 'also_raw': also_raw, - 'also_rich': also_rich, - } - - # if also_rich: - # try: - # validate_rich_options( - # rich_options=rich_options, rich_config_file=rich_config_file - # ) - # except ValueError as exc: - # echo.echo_critical(f"{exc!s}") - - if rich_spec is not None: - rich_spec_dict = rich_from_cli(rich_spec=rich_spec, **rich_kwargs) - else: - rich_spec_dict = {} - - data_dumper = DataDumper( - overwrite=overwrite, - rich_spec_dict=rich_spec_dict, - **datadumper_kwargs, - **rich_kwargs, - ) - process_dumper = ProcessDumper( overwrite=overwrite, **processdumper_kwargs, - **rich_kwargs, - data_dumper=data_dumper, ) try: diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 48fc6e98a3..2c16566672 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -19,7 +19,7 @@ from aiida.cmdline.utils import defaults, echo from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config -from aiida.tools.dumping import CollectionDumper, DataDumper, ProcessDumper +from 
aiida.tools.dumping import GroupDumper, ProfileDumper, ProcessDumper @verdi.group('profile') @@ -272,29 +272,20 @@ def profile_delete(force, delete_data, profiles): echo.echo_success(f'Profile `{profile.name}` was deleted.') -# ? Specify groups via giving the groups, or just enabling "groups" and then all are dumped? -# ? Provide some mechanism to allow for both, e.g. if no argument is provided, all groups are dumped @verdi_profile.command('mirror') @options.PATH() @options.OVERWRITE() # @options.INCREMENTAL() @options.DUMP_PROCESSES() -@options.DUMP_DATA() @options.DEDUPLICATE() @options.INCLUDE_INPUTS() @options.INCLUDE_OUTPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() @options.FLAT() -@options.ALSO_RAW() -@options.ALSO_RICH() -@options.RICH_SPEC() -@options.RICH_DUMP_ALL() @options.DUMP_CONFIG_FILE() -@options.NODES() @options.GROUPS() @options.ORGANIZE_BY_GROUPS() -@options.ONLY_TOP_LEVEL_WORKFLOWS() @options.DRY_RUN() @click.pass_context def profile_mirror( @@ -304,94 +295,41 @@ def profile_mirror( organize_by_groups, dry_run, dump_processes, - only_top_level_workflows, - dump_data, deduplicate, - also_raw, - also_rich, include_inputs, include_outputs, include_attributes, include_extras, flat, - rich_spec, - rich_dump_all, dump_config_file, - nodes, groups, ): """Dump all data in an AiiDA profile's storage to disk.""" from pathlib import Path + from datetime import datetime from aiida import orm from aiida.tools.dumping.parser import DumpConfigParser - from aiida.tools.dumping.rich import ( - DEFAULT_CORE_EXPORT_MAPPING, - rich_from_cli, - rich_from_config, - ) from aiida.tools.dumping.utils import prepare_dump_path + from aiida.tools.dumping.base import BaseDumper profile = ctx.obj['profile'] - if nodes and groups: - echo.echo_critical('`nodes` and `groups` specified. Set only one.') - - if dump_config_file is None: - general_kwargs = { - 'path': path, - 'overwrite': overwrite, - # 'incremental': incremental, - 'dry_run': dry_run, - } - - processdumper_kwargs = { - 'include_inputs': include_inputs, - 'include_outputs': include_outputs, - 'include_attributes': include_attributes, - 'include_extras': include_extras, - 'flat': flat, - } - - datadumper_kwargs = { - 'also_raw': also_raw, - 'also_rich': also_rich, - } - - collection_kwargs = { - 'should_dump_processes': dump_processes, - 'should_dump_data': dump_data, - 'only_top_level_workflows': only_top_level_workflows, - 'organize_by_groups': organize_by_groups, - } - - rich_kwargs = { - 'rich_dump_all': rich_dump_all, - } - - if rich_spec is not None: - rich_spec_dict = rich_from_cli(rich_spec=rich_spec, **rich_kwargs) - else: - rich_spec_dict = DEFAULT_CORE_EXPORT_MAPPING + # if nodes and groups: + # echo.echo_critical('`nodes` and `groups` specified. Set only one.') - # TODO: Also allow for mixing. Currently one can _only_ specify either the config file, or the arguments on the - # TODO: command line - else: - kwarg_dicts_from_config = DumpConfigParser.parse_config_file(dump_config_file) + # if dump_config_file is None: - general_kwargs = kwarg_dicts_from_config['general_kwargs'] - processdumper_kwargs = kwarg_dicts_from_config['processdumper_kwargs'] - datadumper_kwargs = kwarg_dicts_from_config['datadumper_kwargs'] - collection_kwargs = kwarg_dicts_from_config['collection_kwargs'] - rich_kwargs = kwarg_dicts_from_config['rich_kwargs'] + # # TODO: Also allow for mixing. 
Currently one can _only_ specify either the config file, or the arguments on the + # # TODO: command line + # else: + # kwarg_dicts_from_config = DumpConfigParser.parse_config_file(dump_config_file) - rich_spec_dict = rich_from_config(kwarg_dicts_from_config['rich_spec'], **rich_kwargs) + # general_kwargs = kwarg_dicts_from_config['general_kwargs'] + # processdumper_kwargs = kwarg_dicts_from_config['processdumper_kwargs'] + # datadumper_kwargs = kwarg_dicts_from_config['datadumper_kwargs'] - # Obtain these specifically for easy access and modifications - path = general_kwargs['path'] - overwrite = general_kwargs['overwrite'] - dry_run = general_kwargs['dry_run'] incremental = not overwrite if path is None: @@ -401,14 +339,15 @@ def profile_mirror( dry_run_message = f"Dry run for dumping of profile `{profile.name}`'s data at path: `{path}`.\n" dry_run_message += 'Only directories will be created.' - if dry_run or (not collection_kwargs['should_dump_processes'] and not collection_kwargs['should_dump_data']): + if dry_run: echo.echo_report(dry_run_message) return else: echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") - SAFEGUARD_FILE = '.verdi_profile_mirror' # noqa: N806 + SAFEGUARD_FILE: str = '.verdi_profile_mirror' + safeguard_file_path: Path = path / SAFEGUARD_FILE try: prepare_dump_path( @@ -420,88 +359,41 @@ def profile_mirror( except FileExistsError as exc: echo.echo_critical(str(exc)) - (path / SAFEGUARD_FILE).touch() + try: + with safeguard_file_path.open("r") as fhandle: + last_dump_time = datetime.fromisoformat(fhandle.readlines()[-1].strip().split()[-1]).astimezone() + except IndexError: + last_dump_time = None - data_dumper = DataDumper( + base_dumper = BaseDumper( dump_parent_path=path, overwrite=overwrite, incremental=incremental, - rich_spec_dict=rich_spec_dict, - **datadumper_kwargs, + last_dump_time=last_dump_time, ) - # dumper_pretty_print(data_dumper) process_dumper = ProcessDumper( - dump_parent_path=path, - overwrite=overwrite, - incremental=incremental, - data_dumper=data_dumper, - **processdumper_kwargs, + base=base_dumper, + include_inputs= include_inputs, + include_outputs= include_outputs, + include_attributes= include_attributes, + include_extras=include_extras, + flat=flat, ) - # dumper_pretty_print(process_dumper) - # TODO: Possibly implement specifying specific computers - # TODO: Although, users could just specify the relevant nodes - # TODO: Also add option to specify node types via entry points - # TODO: Use `batch_iter` from aiida.tools.archive.common + profile_dumper = ProfileDumper( + base_dumper=base_dumper, + process_dumper=process_dumper, + groups=groups, + organize_by_groups=organize_by_groups, + deduplicate=deduplicate, + profile=profile, + dump_processes=dump_processes, + ) - # === Dump the data that is not associated with any group === - if not groups: - collection_dumper = CollectionDumper( - dump_parent_path=path, - overwrite=overwrite, - incremental=incremental, - nodes=nodes, - **collection_kwargs, - **rich_kwargs, - data_dumper=data_dumper, - process_dumper=process_dumper, - deduplicate=deduplicate, - ) - collection_dumper.create_entity_counter() - # dumper_pretty_print(collection_dumper, include_private_and_dunder=False) - - if dump_processes and collection_dumper._should_dump_processes(): - echo.echo_report(f'Dumping processes not in any group for profile `{profile.name}`...') - collection_dumper.dump_processes() - - if dump_data: - if not also_rich and not also_raw: - echo.echo_critical('`--dump-data was 
given, but neither --also-raw or --also-rich specified.') - echo.echo_report(f'Dumping data not in any group for profile {profile.name}...') - - # collection_dumper.dump_data_rich() - - # === Dump data per-group if Groups exist in profile or are selected === - # TODO: Invert default behavior here, as I typically want to dump all entries - # TODO: Possibly define a new click option instead - # all_entries = not all_entries - if not groups: # and all_entries: - groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - - if groups: - if not nodes: - for group in groups: - collection_dumper = CollectionDumper( - dump_parent_path=path, - overwrite=overwrite, - incremental=incremental, - group=group, - **collection_kwargs, - **rich_kwargs, - process_dumper=process_dumper, - data_dumper=data_dumper, - deduplicate=deduplicate, - ) - - collection_dumper.create_entity_counter() - if dump_processes: - # The additional `_should_dump_processes` check here ensures that no reporting like - # "Dumping processes for group `SSSP/1.3/PBE/efficiency`" is printed for groups that - # don't contain processes - if collection_dumper._should_dump_processes(): - echo.echo_report(f'Dumping processes for group `{group.label}`...') - collection_dumper.dump_processes() - if dump_data: - echo.echo_report(f'Dumping data for group `{group.label}`...') - collection_dumper.dump_data_rich() + profile_dumper.dump() + + # Append the current time to the file + last_dump_time = datetime.now().astimezone().isoformat() + with safeguard_file_path.open("a") as fhandle: + fhandle.write(f"Last profile mirror time: {last_dump_time}\n") diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 4d1c308c43..e7a18eedc1 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -90,7 +90,6 @@ 'NODES', 'NON_INTERACTIVE', 'OLDER_THAN', - 'ONLY_TOP_LEVEL_WORKFLOWS', 'ORDER_BY', 'ORDER_DIRECTION', 'ORGANIZE_BY_GROUPS', @@ -916,15 +915,6 @@ def set_log_level(ctx, _param, value): help='Dump files in a flat directory for every step of a workflow.', ) -ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( - '--only-top-level-workflows/--no-only-top-level-workflows', - is_flag=True, - default=True, - type=bool, - show_default=True, - help='Dump only the top-level workflows in their own dedicated directories.', -) - INCREMENTAL = OverridableOption( '--incremental/--no-incremental', is_flag=True, diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index 49713c9b8a..c6031fc35a 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -8,8 +8,10 @@ ########################################################################### """Modules related to the dumping of AiiDA data.""" -from .collection import CollectionDumper -from .data import DataDumper -from .processes import ProcessDumper +from .base import BaseDumper +from .profile import ProfileDumper +from .group import GroupDumper +from .process import ProcessDumper +# from .collection import CollectionDumper -__all__ = ('CollectionDumper', 'DataDumper', 'ProcessDumper') +__all__ = ('BaseDumper', 'ProfileDumper', 'GroupDumper', 'ProcessDumper') #, 'CollectionDumper') diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py new file mode 100644 index 0000000000..03d72c6f72 --- /dev/null +++ b/src/aiida/tools/dumping/base.py @@ -0,0 +1,25 @@ +########################################################################### +# Copyright 
(c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +from pathlib import Path +from datetime import datetime + + +class BaseDumper: + def __init__( + self, + dump_parent_path: Path = Path.cwd(), + overwrite: bool = False, + incremental: bool = True, + last_dump_time: datetime | None = None, + ): + self.dump_parent_path = dump_parent_path + self.overwrite = overwrite + self.incremental = incremental + self.last_dump_time = last_dump_time \ No newline at end of file diff --git a/src/aiida/tools/dumping/data.py b/src/aiida/tools/dumping/data.py deleted file mode 100644 index 3a75d8d743..0000000000 --- a/src/aiida/tools/dumping/data.py +++ /dev/null @@ -1,292 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Functionality for dumping of Data nodes.""" - -from __future__ import annotations - -import logging -from functools import singledispatchmethod -from pathlib import Path - -import yaml - -from aiida import orm - -logger = logging.getLogger(__name__) - - -class DataDumper: - def __init__( - self, - *args, - dump_parent_path: Path = Path.cwd(), - overwrite: bool = False, - incremental: bool = True, - data_hidden: bool = False, - also_raw: bool = False, - also_rich: bool = False, - rich_spec_dict: dict | None = None, - **kwargs, - ) -> None: - self.args = args - self.dump_parent_path = dump_parent_path - self.overwrite = overwrite - self.incremental = incremental - self.data_hidden = data_hidden - self.also_raw = also_raw - self.also_rich = also_rich - self.kwargs = kwargs - - self.rich_spec_dict = rich_spec_dict - - self.hidden_aiida_path = dump_parent_path / '.aiida-raw-data' - - @singledispatchmethod - def dump_core_data_node_rich(self, data_node, output_path, output_fname): - # raise NotImplementedError(f'Dumping not implemented for type {type(data_node)}') - # print(f'No specific handler found for type <{type(data_node)}> <{data_node}>, doing nothing.') - # output_path /= 'general' - # This is effectively the `rich` dumping - data_node_entry_point_name = data_node.entry_point.name - export_settings = self.rich_spec_dict[data_node_entry_point_name] - exporter = export_settings['exporter'] - fileformat = export_settings['export_format'] - if exporter is not None: - output_path.mkdir(exist_ok=True, parents=True) - exporter( - node=data_node, - output_fname=output_path / output_fname, - fileformat=fileformat, - overwrite=self.overwrite, - ) - # This is for orm.Data types for which no default dumping is implemented, e.g. 
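The `verdi profile mirror` changes above persist the time of the last dump in a `.verdi_profile_mirror` safeguard file and read it back into `BaseDumper.last_dump_time` on the next invocation, so that only nodes modified since then need to be considered. A minimal stdlib-only sketch of that round trip (file name and line format taken from the diff; the `BaseDumper` call is only indicated as a comment):

    from datetime import datetime
    from pathlib import Path

    safeguard_file_path = Path('profile-mirror') / '.verdi_profile_mirror'
    safeguard_file_path.parent.mkdir(parents=True, exist_ok=True)
    safeguard_file_path.touch()

    # Read the most recent timestamp, if a previous run appended one
    try:
        with safeguard_file_path.open('r') as fhandle:
            last_dump_time = datetime.fromisoformat(fhandle.readlines()[-1].strip().split()[-1]).astimezone()
    except IndexError:
        last_dump_time = None

    # ... construct BaseDumper(dump_parent_path=..., incremental=True, last_dump_time=last_dump_time) and dump ...

    # Append the current time so the next run only picks up nodes modified afterwards
    with safeguard_file_path.open('a') as fhandle:
        fhandle.write(f'Last profile mirror time: {datetime.now().astimezone().isoformat()}\n')
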
Bool or Float - # except ValueError: - # pass - # This is for orm.Data types for whose entry_point names no entry exists in the DEFAULT_CORE_EXPORT_MAPPING - # This is now captured outside in the `CollectionDumper`, so should not be relevant anymore - # except TypeError: - # raise - - @dump_core_data_node_rich.register - def _( - self, - data_node: orm.StructureData, - output_path: str | Path | None = None, - output_fname: str | None = None, - ): - if type(data_node) is orm.StructureData: - self._dump_structuredata(data_node, output_path=output_path, output_fname=output_fname) - else: - # Handle the case where data_node is a subclass of orm.StructureData - # Just use the default dispatch function implementation - self.dump_core_data_node_rich.dispatch(object)(self, data_node, output_path, output_fname) - - @dump_core_data_node_rich.register - def _( - self, - data_node: orm.Code, - output_path: str | Path | None = None, - output_fname: str | None = None, - ): - self._dump_code(data_node=data_node, output_path=output_path, output_fname=output_fname) - - @dump_core_data_node_rich.register - def _( - self, - data_node: orm.Computer, - output_path: str | Path | None = None, - output_fname: str | None = None, - ): - self._dump_computer_setup(data_node=data_node, output_path=output_path, output_fname=output_fname) - self._dump_computer_config(data_node=data_node, output_path=output_path, output_fname=output_fname) - - @dump_core_data_node_rich.register - def _( - self, - data_node: orm.BandsData, - output_path: str | Path | None = None, - output_fname: str | None = None, - ): - self._dump_bandsdata(data_node=data_node, output_path=output_path, output_fname=output_fname) - - # These are the rich dumping implementations that actually differ from the default dispatch - def _dump_structuredata( - self, - data_node: orm.StructureData, - output_path: Path | None = None, - output_fname: str | None = None, - ): - from aiida.common.exceptions import UnsupportedSpeciesError - - node_entry_point_name = data_node.entry_point.name - exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] - fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] - - if output_fname is None: - output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) - - # ? There also exists a CifData file type - # output_path /= 'structures' - output_path.mkdir(exist_ok=True, parents=True) - try: - exporter( - node=data_node, - output_fname=output_path / output_fname, - fileformat=fileformat, - overwrite=self.overwrite, - ) - except UnsupportedSpeciesError: - # This is the case for, e.g. HubbardStructureData that has species like `Mn0` - # Not sure how to resolve this. Wouldn't add a singledispatch for data types defined in plugins. Currently, - # do strict type check. HubbardStructureData doesn't implement an export method itself, though. 
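For readers unfamiliar with the dispatch mechanism of the `DataDumper` being removed here: `functools.singledispatchmethod` selects the registered overload based on the type of the first argument after `self`, with the undecorated method acting as the fallback. A minimal standalone sketch (hypothetical `Exporter` class, unrelated to AiiDA):

    from functools import singledispatchmethod


    class Exporter:
        @singledispatchmethod
        def export(self, node):
            # Fallback when no specific handler is registered for this type
            return f'generic export of {type(node).__name__}'

        @export.register
        def _(self, node: int):
            return f'integer export of {node}'

        @export.register
        def _(self, node: str):
            return f'string export of {node!r}'


    exporter = Exporter()
    print(exporter.export(3))      # integer export of 3
    print(exporter.export('a'))    # string export of 'a'
    print(exporter.export(3.14))   # generic export of float
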
- pass - - def _dump_code( - self, - data_node: orm.Code, - output_path: Path | None = None, - output_fname: str | None = None, - ): - # output_path /= 'codes' - - node_entry_point_name = data_node.entry_point.name - exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] - fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] - - if fileformat != 'yaml': - raise NotImplementedError('No other fileformats supported so far apart from YAML.') - output_path.mkdir(exist_ok=True, parents=True) - if output_fname is None: - output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) - - exporter( - node=data_node, - output_fname=output_path / output_fname, - fileformat=fileformat, - overwrite=self.overwrite, - ) - - def _dump_computer_setup( - self, - data_node: orm.Computer, - output_path: Path | None = None, - output_fname: str | None = None, - ): - node_entry_point_name = data_node.entry_point.name - # TODO: Don't use the `exporter` here, as `Computer` doesn't derive from Data, so custom implementation - fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] - - if fileformat != 'yaml': - raise NotImplementedError('No other fileformats supported so far apart from YAML.') - - output_path.mkdir(exist_ok=True, parents=True) - - # This is a bit of a hack. Should split this up into two different functions. - if output_fname is None: - output_fname = output_path / f'{data_node.full_label}-setup-{data_node.pk}.{fileformat}' - - # ? Copied over from `cmd_computer` as importing `computer_export_setup` led to click Context error: - # TypeError: Context.__init__() got an unexpected keyword argument 'computer' - computer_setup = { - 'label': data_node.label, - 'hostname': data_node.hostname, - 'description': data_node.description, - 'transport': data_node.transport_type, - 'scheduler': data_node.scheduler_type, - 'shebang': data_node.get_shebang(), - 'work_dir': data_node.get_workdir(), - 'mpirun_command': ' '.join(data_node.get_mpirun_command()), - 'mpiprocs_per_machine': data_node.get_default_mpiprocs_per_machine(), - 'default_memory_per_machine': data_node.get_default_memory_per_machine(), - 'use_double_quotes': data_node.get_use_double_quotes(), - 'prepend_text': data_node.get_prepend_text(), - 'append_text': data_node.get_append_text(), - } - - if not output_fname.is_file(): - output_fname.write_text(yaml.dump(computer_setup, sort_keys=False), 'utf-8') - - def _dump_computer_config( - self, - data_node: orm.Computer, - output_path: Path | None = None, - output_fname: str | None = None, - ): - from aiida.orm import User - - node_entry_point_name = data_node.entry_point.name - # TODO: Don't use the `exporter` here, as `Computer` doesn't derive from Data, so custom implementation - fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] - - # output_path /= 'computers' - if fileformat != 'yaml': - raise NotImplementedError('No other fileformats supported so far apart from YAML.') - - output_path.mkdir(exist_ok=True, parents=True) - - # This is a bit of a hack. Should split this up into two different functions. 
- if output_fname is None: - output_fname = output_path / f'{data_node.full_label}-config-{data_node.pk}.{fileformat}' - - users = User.collection.all() - for user in users: - computer_configuration = data_node.get_configuration(user) - if not output_fname.is_file(): - output_fname.write_text(yaml.dump(computer_configuration, sort_keys=False), 'utf-8') - - def _dump_bandsdata( - self, - data_node: orm.BandsData, - output_path: Path | None = None, - output_fname: str | None = None, - ): - node_entry_point_name = data_node.entry_point.name - exporter = self.rich_spec_dict[node_entry_point_name]['exporter'] - fileformat = self.rich_spec_dict[node_entry_point_name]['export_format'] - - from aiida.tools.dumping.utils import sanitize_file_extension - - output_path.mkdir(exist_ok=True, parents=True) - - if output_fname is None: - output_fname = DataDumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat) - - output_fname = sanitize_file_extension(output_fname) - - exporter( - node=data_node, - output_fname=output_path / output_fname, - fileformat=fileformat, - overwrite=self.overwrite, - ) - - def _dump_user_info(self): ... - - def dump_core_data_node_raw(self, data_node: orm.Data, output_path: Path, output_fname: str | None = None): - output_path.mkdir(exist_ok=True, parents=True) - - if output_fname is None: - output_fname = DataDumper.generate_output_fname_raw(data_node=data_node) - - with open(output_path.resolve() / output_fname, 'w') as handle: - yaml.dump(data_node.attributes, handle) - - @staticmethod - def generate_output_fname_raw(data_node, prefix: str | None = None): - if prefix is None: - return f'{data_node.__class__.__name__}-{data_node.pk}_attrs.yaml' - else: - return f'{prefix}-{data_node.__class__.__name__}-{data_node.pk}_attrs.yaml' - - @staticmethod - def generate_output_fname_rich(data_node, fileformat, prefix: str | None = None): - if prefix is None: - return f'{data_node.__class__.__name__}-{data_node.pk}.{fileformat}' - else: - return f'{prefix}-{data_node.__class__.__name__}-{data_node.pk}.{fileformat}' diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/group.py similarity index 52% rename from src/aiida/tools/dumping/collection.py rename to src/aiida/tools/dumping/group.py index 9cf0dccde6..6f6cb7c214 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/group.py @@ -14,63 +14,46 @@ import logging from collections import Counter from pathlib import Path +from datetime import datetime from aiida import orm -from aiida.tools.dumping.data import DataDumper -from aiida.tools.dumping.processes import ProcessDumper +from aiida.tools.dumping.process import ProcessDumper +from aiida.tools.dumping.base import BaseDumper logger = logging.getLogger(__name__) DEFAULT_PROCESSES_TO_DUMP = [orm.CalculationNode, orm.WorkflowNode] -DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] +# DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] # DEFAULT_COLLECTIONS_TO_DUMP ?? -DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP + DEFAULT_DATA_TO_DUMP +DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP # + DEFAULT_DATA_TO_DUMP # ! 
This class is instantiated once for every group, or once for the full profile -class CollectionDumper: +class GroupDumper: def __init__( self, - dump_parent_path: Path = Path().cwd(), - nodes: set = {}, + base_dumper: BaseDumper | None = None, + process_dumper: ProcessDumper | None = None, group: orm.Group | str | None = None, - overwrite: bool = False, - incremental: bool = True, - should_dump_processes: bool = False, - should_dump_data: bool = False, - only_top_level_workflows: bool = True, - rich_dump_all: bool = True, deduplicate: bool = True, - organize_by_groups: bool = True, - process_dumper: ProcessDumper | None = None, - data_dumper: DataDumper | None = None, + output_path: str | Path | None = None ): - self.dump_parent_path = dump_parent_path - self.overwrite = overwrite - self.incremental = incremental - self.should_dump_processes = should_dump_processes - self.should_dump_data = should_dump_data - self.only_top_level_workflows = only_top_level_workflows - self.nodes = nodes self.deduplicate = deduplicate - self.process_dumper = process_dumper - self.data_dumper = data_dumper - # Allow passing of group via label if isinstance(group, str): group = orm.Group.get(group) self.group = group + self.output_path = output_path - if organize_by_groups: - if group is not None: - group_subdir = Path(*group.type_string.split('.')) - self.output_path = self.dump_parent_path / 'groups' / group_subdir / self.group.label - else: - self.output_path = self.dump_parent_path / 'no-group' - else: - self.output_path = self.dump_parent_path + if base_dumper is None: + base_dumper = BaseDumper() + self.base_dumper: BaseDumper = base_dumper + + if process_dumper is None: + process_dumper = ProcessDumper() + self.process_dumper: ProcessDumper = process_dumper if not hasattr(self, 'entity_counter'): self.create_entity_counter() @@ -81,8 +64,8 @@ def create_entity_counter(self) -> Counter: # If the group only has one WorkChain assigned to it, this will only return a count of 1 for the # WorkChainNode, nothing more, that is, it doesn't work recursively. 
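The entity counter referenced above is simply a tally of how many nodes of each ORM class the group (or the whole profile) contains; `_should_dump_processes` then checks whether any process classes appear in it. A standalone sketch with `collections.Counter`, using class-name strings as stand-ins for the real ORM classes:

    from collections import Counter

    # Stand-ins for the classes of the nodes found in a group
    node_classes = ['CalcJobNode', 'CalcJobNode', 'WorkChainNode', 'StructureData']
    entity_counter = Counter(node_classes)

    process_classes = {'CalcJobNode', 'CalcFunctionNode', 'WorkChainNode', 'WorkFunctionNode', 'ProcessNode'}
    should_dump_processes = sum(entity_counter.get(cls, 0) for cls in process_classes) > 0

    print(entity_counter)          # Counter({'CalcJobNode': 2, 'WorkChainNode': 1, 'StructureData': 1})
    print(should_dump_processes)   # True
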
nodes = self.group.nodes - elif self.nodes is not None: - nodes = self.nodes + # elif self.nodes is not None: + # nodes = self.nodes else: nodes = orm.QueryBuilder().append(orm.Node).all(flat=True) @@ -96,9 +79,9 @@ def create_entity_counter(self) -> Counter: return entity_counter - def get_collection_nodes(self): - if self.nodes: - self.collection_nodes = self.nodes + def get_group_nodes(self): + # if self.nodes: + # self.collection_nodes = self.nodes # if hasattr(self, 'collection_nodes'): # return self.collection_nodes @@ -129,27 +112,20 @@ def get_collection_nodes(self): nodes = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] nodes = [orm.load_node(node) for node in nodes] + if self.base_dumper.last_dump_time is not None: + # breakpoint() + nodes = [node for node in nodes if node.mtime > self.base_dumper.last_dump_time] + self.collection_nodes = nodes return nodes def _should_dump_processes(self) -> bool: - if not self.nodes: - return ( - sum( - self.entity_counter.get(orm_process_class, 0) - for orm_process_class in [ - orm.CalcJobNode, - orm.CalcFunctionNode, - orm.WorkChainNode, - orm.WorkFunctionNode, - orm.ProcessNode, - ] - ) - > 0 - ) - else: - return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 + + if not hasattr(self, 'group_nodes'): + self.get_group_nodes() + + return len([node for node in self.collection_nodes if isinstance(node, orm.ProcessNode)]) > 0 def _dump_calculations(self, calculations): for calculation in calculations: @@ -186,11 +162,11 @@ def _dump_workflows(self, workflows): link_calculations_dir=self.output_path / 'calculations', ) - def dump_processes(self): - nodes = self.get_collection_nodes() + def _dump_processes(self): + nodes = self.get_group_nodes() workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] - if self.only_top_level_workflows: + if self.deduplicate: workflows = [workflow for workflow in workflows if workflow.caller is None] # Also need to obtain sub-calculations that were called by workflows of the group @@ -210,70 +186,3 @@ def dump_processes(self): self._dump_calculations(calculations=calculations) self._dump_workflows(workflows=workflows) - - # TODO: Add `dump_data_raw` here, as well - # def dump_data_rich(self): - # nodes = self.get_collection_nodes() - # nodes = [node for node in nodes if isinstance(node, (orm.Data, orm.Computer))] - # # Here, when providing logic to set the exporters and fileformat via the rich-options, don't have to filter - # # anymore for `core` - # nodes = [node for node in nodes if node.entry_point.name.startswith('core')] - # if len(nodes) == 0: - # return - - # self.output_path.mkdir(exist_ok=True, parents=True) - # data_dumper = self.data_dumper - - # for data_node in nodes: - # node_entry_point_name = data_node.entry_point.name - - # # Get the fileformat and exporter for the data node - # try: - # fileformat = data_dumper.rich_spec_dict[node_entry_point_name]['export_format'] - # exporter = data_dumper.rich_spec_dict[node_entry_point_name]['exporter'] - - # # If options for the rich dumping are specified and not all the other defaults are being used - # # Some entry_points might not be inside the `rich_spec_dict` - # except KeyError: - # continue - - # except: - # # Raise all exceptions here during development - # raise - - # # Don't go further if no importer implemented for a data type anyway - # if exporter is None: - # continue - - # try: - # # Generate a nice filename and sanitize it - # nice_output_path = 
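Two filters in the group dumper above decide what actually gets dumped on a given run: an incremental filter that keeps only nodes modified after `BaseDumper.last_dump_time`, and, with `deduplicate=True`, a restriction to top-level workflows (those without a caller), so nested workflows are dumped only inside their parent's directory. A small standalone sketch of both filters, with hypothetical stand-in objects instead of real ORM nodes:

    from dataclasses import dataclass
    from datetime import datetime, timedelta, timezone
    from typing import Optional


    @dataclass
    class FakeWorkflow:  # stand-in for an orm.WorkflowNode in this sketch
        pk: int
        mtime: datetime
        caller: Optional['FakeWorkflow'] = None


    now = datetime.now(timezone.utc)
    parent = FakeWorkflow(pk=10, mtime=now - timedelta(minutes=5))
    child = FakeWorkflow(pk=11, mtime=now - timedelta(minutes=5), caller=parent)
    stale = FakeWorkflow(pk=12, mtime=now - timedelta(days=2))
    workflows = [parent, child, stale]

    last_dump_time = now - timedelta(days=1)  # as read back from the safeguard file
    if last_dump_time is not None:
        workflows = [node for node in workflows if node.mtime > last_dump_time]

    deduplicate = True
    if deduplicate:
        workflows = [node for node in workflows if node.caller is None]

    print([node.pk for node in workflows])  # [10]: recently modified and top-level
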
self.output_path / 'data' / data_node.__class__.__name__.lower() - # nice_fname = data_dumper.generate_output_fname_rich(data_node=data_node, fileformat=fileformat).replace( - # '__', '_' - # ) - # nice_fname = sanitize_file_extension(nice_fname) - - # if data_dumper.data_hidden: - # # Define paths for hidden dump and linking - # hidden_output_path = self.hidden_aiida_path / 'data' / data_node.__class__.__name__.lower() - # uuid_fname = sanitize_file_extension(f'{data_node.uuid}.{fileformat}') - - # # Dump the data in the hidden directory - # data_dumper.dump_core_data_node_rich(data_node, hidden_output_path, uuid_fname) - - # # Link the hidden file to the expected output path - # (nice_output_path / nice_fname).parent.mkdir(exist_ok=True, parents=True) - # os.symlink(hidden_output_path / uuid_fname, nice_output_path / nice_fname) - - # else: - # # Dump the data in the non-hidden directory - # data_dumper.dump_core_data_node_rich(data_node, nice_output_path, nice_fname) - - # except TypeError: - # # Handle case when no exporter is implemented for a given data_node type - # raise - # except OSError: - # # A Data node, e.g. a Code might already be existent, so don't worry about this exception - # continue - # except Exception: - # raise diff --git a/src/aiida/tools/dumping/parser.py b/src/aiida/tools/dumping/parser.py index 39288929e6..96412eb421 100644 --- a/src/aiida/tools/dumping/parser.py +++ b/src/aiida/tools/dumping/parser.py @@ -35,7 +35,6 @@ def parse_config_file(config_file: str | Path | None) -> dict: collection_kwargs = { 'should_dump_processes': config.get('dump_processes', True), 'should_dump_data': config.get('dump_data', True), - 'only_top_level_workflows': config.get('only_top_level_workflows', True), } rich_kwargs = { diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/process.py similarity index 71% rename from src/aiida/tools/dumping/processes.py rename to src/aiida/tools/dumping/process.py index fcc8671ff6..f45c0692e4 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/process.py @@ -33,60 +33,36 @@ from aiida.common import LinkType from aiida.common.exceptions import NotExistentAttributeError -from aiida.orm import ( - CalcFunctionNode, - CalcJobNode, - CalculationNode, - ProcessNode, - WorkChainNode, - WorkflowNode, - WorkFunctionNode, -) +from aiida import orm from aiida.orm.utils import LinkTriple from aiida.tools.archive.exceptions import ExportValidationError -from aiida.tools.dumping.data import DataDumper +from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.utils import prepare_dump_path -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class ProcessDumper: def __init__( self, - *args, - dump_parent_path: Path = Path.cwd(), - overwrite: bool = False, - incremental: bool = True, + base: BaseDumper = BaseDumper(), flat: bool = False, include_inputs: bool = True, include_outputs: bool = False, include_attributes: bool = True, include_extras: bool = True, - rich_options: str = '', - rich_config_file: Path | None = None, - rich_dump_all: bool = True, - data_dumper: DataDumper = DataDumper(), dump_unsealed: bool = False, - **kwargs, ) -> None: - self.args = args - self.dump_parent_path = dump_parent_path - self.overwrite = overwrite - self.incremental = incremental self.flat = flat + self.base = base self.include_inputs = include_inputs self.include_outputs = include_outputs self.include_attributes = include_attributes self.include_extras = include_extras - 
self.rich_options = rich_options - self.rich_config_file = rich_config_file - self.rich_dump_all = rich_dump_all - self.data_dumper = data_dumper - self.kwargs = kwargs self.dump_unsealed = dump_unsealed @staticmethod - def _generate_default_dump_path(process_node: ProcessNode, prefix: str = 'dump') -> Path: + def _generate_default_dump_path(process_node: orm.ProcessNode, prefix: str = 'dump') -> Path: """Simple helper function to generate the default parent-dumping directory if none given. This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default @@ -113,7 +89,7 @@ def _generate_default_dump_path(process_node: ProcessNode, prefix: str = 'dump') return Path('-'.join(entities_to_dump)) @staticmethod - def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: + def _generate_readme(process_node: orm.ProcessNode, output_path: Path) -> None: """Generate README.md file in main dumping directory. :param process_node: `CalculationNode` or `WorkflowNode`. @@ -158,11 +134,11 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: # `verdi process report` # Copied over from `cmd_process` - if isinstance(process_node, CalcJobNode): + if isinstance(process_node, orm.CalcJobNode): process_report = get_calcjob_report(process_node) - elif isinstance(process_node, WorkChainNode): + elif isinstance(process_node, orm.WorkChainNode): process_report = get_workchain_report(process_node, levelname='REPORT', indent_size=2, max_depth=None) - elif isinstance(process_node, (CalcFunctionNode, WorkFunctionNode)): + elif isinstance(process_node, (orm.CalcFunctionNode, orm.WorkFunctionNode)): process_report = get_process_function_report(process_node) else: process_report = f'Nothing to show for node type {process_node.__class__}' @@ -209,11 +185,9 @@ def _generate_child_node_label(index: int, link_triple: LinkTriple, append_pk: b def dump( self, - process_node: ProcessNode, + process_node: orm.ProcessNode, output_path: Path | None, io_dump_paths: List[str | Path] | None = None, - *args, - **kwargs, ) -> Path: """Dumps all data involved in a `ProcessNode`, including its outgoing links. @@ -236,22 +210,22 @@ def dump( # I don't want to include them in the general class `__init__`, as they don't really fit there. # But the `_dump_node_yaml` function is private, so it's never called outside by the user. # Setting the class attributes here dynamically is probably not a good solution, but it works for now. 
- for key, value in kwargs.items(): - setattr(self, key, value) + # for key, value in kwargs.items(): + # setattr(self, key, value) if output_path is None: output_path = self._generate_default_dump_path(process_node=process_node) - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) + prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) - if isinstance(process_node, CalculationNode): + if isinstance(process_node, orm.CalculationNode): self._dump_calculation( calculation_node=process_node, output_path=output_path, io_dump_paths=io_dump_paths, ) - elif isinstance(process_node, WorkflowNode): + elif isinstance(process_node, orm.WorkflowNode): self._dump_workflow( workflow_node=process_node, output_path=output_path, @@ -264,7 +238,7 @@ def dump( def _dump_workflow( self, - workflow_node: WorkflowNode, + workflow_node: orm.WorkflowNode, output_path: Path, io_dump_paths: List[str | Path] | None = None, link_calculations: bool = False, @@ -277,7 +251,7 @@ def _dump_workflow( :param io_dump_paths: Custom subdirectories for `CalculationNode` s, defaults to None """ - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) + prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) called_links = workflow_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() @@ -289,7 +263,7 @@ def _dump_workflow( child_output_path = output_path.resolve() / child_label # Recursive function call for `WorkFlowNode` - if isinstance(child_node, WorkflowNode): + if isinstance(child_node, orm.WorkflowNode): self._dump_workflow( workflow_node=child_node, output_path=child_output_path, @@ -301,7 +275,7 @@ def _dump_workflow( ) # Once a `CalculationNode` as child reached, dump it - elif isinstance(child_node, CalculationNode): + elif isinstance(child_node, orm.CalculationNode): if not link_calculations: self._dump_calculation( calculation_node=child_node, @@ -319,7 +293,7 @@ def _dump_workflow( def _dump_calculation( self, - calculation_node: CalculationNode, + calculation_node: orm.CalculationNode, output_path: Path, io_dump_paths: List[str | Path] | None = None, ) -> None: @@ -331,7 +305,7 @@ def _dump_calculation( Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] """ - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) + prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) self._dump_node_yaml(process_node=calculation_node, output_path=output_path) io_dump_mapping = self._generate_calculation_io_mapping(io_dump_paths=io_dump_paths) @@ -345,10 +319,6 @@ def _dump_calculation( output_path.resolve() / io_dump_mapping.retrieved ) - if self.data_dumper.also_raw: - # TODO: Replace with attached self.data_dumper attribute - self.data_dumper.dump_core_data_node_raw(data_node=calculation_node, output_path=output_path) - # Dump the node_inputs if self.include_inputs: input_links = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) @@ -358,26 +328,6 @@ def _dump_calculation( self._dump_calculation_io_files(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) - if self.data_dumper.also_raw: - # Always dump the `raw` data inside the calculation 
directories - # I don't see a reason why one would want all the node attribute files in a centralized location - self._dump_calculation_io_files_raw( - output_path=output_path / io_dump_mapping.inputs, link_triples=input_links - ) - - if self.data_dumper.also_rich: - rich_data_output_path = output_path / io_dump_mapping.inputs - # if not self.data_dumper.data_hidden: - # rich_data_output_path = output_path / io_dump_mapping.inputs - # else: - # # TODO: Currently, when dumping only one selected workflow, if rich dumping is activated, but - # # TODO: `data-hidden` is set, no data nodes were actually being dumped - # # TODO: With the current implementation below, they are dumped, but not in the same structure as for the - # # TODO: `dump_rich_core` function. Quick fix for now - # pass - - # Only dump the rich data output files in the process directories if data_hidden is False - self._dump_calculation_io_files_rich(output_path=rich_data_output_path, link_triples=input_links) # Dump the node_outputs apart from `retrieved` if self.include_outputs: output_links = list(calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)) @@ -388,18 +338,6 @@ def _dump_calculation( link_triples=output_links, ) - if self.data_dumper.also_raw: - self._dump_calculation_io_files_raw( - output_path=output_path / io_dump_mapping.outputs, - link_triples=output_links, - ) - - if self.data_dumper.also_rich: - self._dump_calculation_io_files_rich( - output_path=output_path / io_dump_mapping.outputs, - link_triples=output_links, - ) - def _dump_calculation_io_files( self, parent_path: Path, @@ -422,92 +360,6 @@ def _dump_calculation_io_files( link_triple.node.base.repository.copy_tree(linked_node_path.resolve()) - def _dump_calculation_io_files_raw( - self, - output_path: Path, - link_triples: orm.LinkManager | List[orm.LinkTriple], - ): - """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. - - :param parent_path: Parent directory for dumping the linked node contents. - :param link_triples: List of link triples. - """ - - output_path /= 'raw' - - for link_triple in link_triples: - link_label = link_triple.link_label - data_node = link_triple.node - - # linked_node_path.parent.mkdir(parents=True, exist_ok=True) - output_path.mkdir(parents=True, exist_ok=True) - - # Then dump the node attributes for each node - output_fname = DataDumper.generate_output_fname_raw(prefix=link_label, data_node=data_node) - output_fname = output_fname.replace('__', '_') - - if self.data_dumper.data_hidden: - self.data_dumper.dump_core_data_node_raw( - data_node=data_node, output_path=output_path, output_fname=output_fname - ) - self.data_dumper.dump_core_data_node_raw( - data_node=data_node, output_path=output_path, output_fname=output_fname - ) - - def _dump_calculation_io_files_rich( - self, - output_path: Path, - link_triples: orm.LinkManager | List[orm.LinkTriple], - ): - """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. - - :param parent_path: Parent directory for dumping the linked node contents. - :param link_triples: List of link triples. 
- """ - - # Set up the rich parsing functions - - # Extend (at least the keys) by the dynamic entry points - rich_spec_dict = self.data_dumper.rich_spec_dict - - for link_triple in link_triples: - link_label = link_triple.link_label - data_node = link_triple.node - - node = link_triple.node - node_entry_point = node.entry_point - node_entry_point_name = node_entry_point.name - - # TODO: Somehow obtain sensible filenames -> Should this be done here, or by the export function that is - # TODO: possibly written by the plugin developer - if node_entry_point_name.startswith('core'): - # Obtain settings from the export dict - # TODO: -> This might break when plugin is missing - try: - exporter = rich_spec_dict[node_entry_point_name]['exporter'] - fileformat = rich_spec_dict[node_entry_point_name]['export_format'] - output_fname = self.data_dumper.generate_output_fname_rich( - prefix=link_label, data_node=data_node, fileformat=fileformat - ) - output_fname = output_fname.replace('__', '_') - except KeyError: - continue - - # No exporter set - if exporter is None: - continue - - # Only create subdirectory if `Data` node has an exporter - rich_output_path = output_path / 'rich' / node.__class__.__name__.lower() - rich_output_path.mkdir(parents=True, exist_ok=True) - - # TODO: Here, if data_hidden is True, dump in hidden directory, else in output_path - self.data_dumper.dump_core_data_node_rich( - node, - output_path=rich_output_path, - output_fname=output_fname, - ) - def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | None = None) -> SimpleNamespace: """Helper function to generate mapping for entities dumped for each `CalculationNode`. @@ -522,7 +374,7 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs'] default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] if self.flat and io_dump_paths is None: - LOGGER.info( + logger.info( 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' ) empty_calculation_io_dump_paths = [''] * 4 @@ -530,24 +382,24 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non return SimpleNamespace(**dict(zip(aiida_entities_to_dump, empty_calculation_io_dump_paths))) elif not self.flat and io_dump_paths is None: - LOGGER.info( + logger.info( 'Flat set to False but no `io_dump_paths` provided. ' + f'Will use the defaults {default_calculation_io_dump_paths}.' ) return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths))) elif self.flat: - LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `inputs` not nested.') + logger.info('Flat set to True but `io_dump_paths` provided. These will be used, but `inputs` not nested.') return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) else: - LOGGER.info( + logger.info( 'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.' 
) return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) # type: ignore[arg-type] def _dump_node_yaml( self, - process_node: ProcessNode, + process_node: orm.ProcessNode, output_path: Path, output_filename: str = '.aiida_node_metadata.yaml', ) -> None: diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py new file mode 100644 index 0000000000..5b88fe8d55 --- /dev/null +++ b/src/aiida/tools/dumping/profile.py @@ -0,0 +1,102 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +# TODO: Use `batch_iter` from aiida.tools.archive.common + +from __future__ import annotations +from pathlib import Path +import logging +from aiida import orm +from aiida.cmdline.params.options.main import ORGANIZE_BY_GROUPS +from aiida.tools.dumping.base import BaseDumper +from aiida.tools.dumping.process import ProcessDumper +from aiida.tools.dumping.group import GroupDumper +from aiida.manage.configuration.profile import Profile + +logger = logging.getLogger(__name__) + +class ProfileDumper: + def __init__( + self, + profile: str | Profile, + base_dumper: BaseDumper | None = None, + process_dumper: ProcessDumper | None = None, + organize_by_groups: bool = True, + deduplicate: bool = True, + groups: list[str | orm.Group] | None = None, + dump_processes: bool = True, + ): + self.organize_by_groups = organize_by_groups + self.deduplicate = deduplicate + self.profile = profile + self.dump_processes = dump_processes + + if base_dumper is None: + base_dumper = BaseDumper() + self.base_dumper: BaseDumper = base_dumper + + if process_dumper is None: + process_dumper = ProcessDumper() + self.process_dumper: ProcessDumper = process_dumper + + if not groups: + groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + self.groups = groups + + + def dump(self): + + self._dump_processes_not_in_any_group() + self._dump_processes_per_group() + + + def _dump_processes_not_in_any_group(self): + + # === Dump the data that is not associated with any group === + if self.organize_by_groups: + output_path = self.base_dumper.dump_parent_path / 'no-group' + else: + output_path = self.base_dumper.dump_parent_path + + no_group_dumper = GroupDumper( + base_dumper=self.base_dumper, + process_dumper=self.process_dumper, + group=None, + deduplicate=self.deduplicate, + output_path=output_path, + ) + + if self.dump_processes and no_group_dumper._should_dump_processes(): + + logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') + + no_group_dumper._dump_processes() + + def _dump_processes_per_group(self): + # === Dump data per-group if Groups exist in profile or are selected === + + for group in self.groups: + + if self.organize_by_groups: + output_path = self.base_dumper.dump_parent_path / group.label + else: + output_path = self.base_dumper.dump_parent_path + + group_dumper = GroupDumper( + base_dumper=self.base_dumper, + process_dumper=self.process_dumper, + group=group, + deduplicate=self.deduplicate, + output_path=output_path, + ) + + if self.dump_processes and group_dumper._should_dump_processes(): + logger.report(f'Dumping processes in 
group {group.label} for profile `{self.profile.name}`...') + + group_dumper._dump_processes() diff --git a/src/aiida/tools/dumping/test-config-file.yaml b/src/aiida/tools/dumping/test-config-file.yaml index 63bbe38180..6d1db5e967 100644 --- a/src/aiida/tools/dumping/test-config-file.yaml +++ b/src/aiida/tools/dumping/test-config-file.yaml @@ -4,7 +4,6 @@ incremental: true dry_run: false organize_by_groups: true dump_processes: true -only_top_level_workflows: true dump_data: true data_hidden: true also_raw: false diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index c409a438dc..e2f5633939 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -15,7 +15,7 @@ import pytest -from aiida.tools.dumping.processes import ProcessDumper +from aiida.tools.dumping.process import ProcessDumper # Non-AiiDA variables filename = 'file.txt' From 0105c08a9efed85e0950030d3c0a9b779afe4889 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 12:32:03 +0000 Subject: [PATCH 06/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/cmdline/commands/cmd_profile.py | 20 +++++++++----------- src/aiida/tools/dumping/__init__.py | 5 +++-- src/aiida/tools/dumping/base.py | 4 ++-- src/aiida/tools/dumping/group.py | 10 ++++------ src/aiida/tools/dumping/process.py | 14 ++++++++++---- src/aiida/tools/dumping/profile.py | 23 +++++++++-------------- 6 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 2c16566672..4c4926399b 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -19,7 +19,7 @@ from aiida.cmdline.utils import defaults, echo from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config -from aiida.tools.dumping import GroupDumper, ProfileDumper, ProcessDumper +from aiida.tools.dumping import ProcessDumper, ProfileDumper @verdi.group('profile') @@ -306,13 +306,11 @@ def profile_mirror( ): """Dump all data in an AiiDA profile's storage to disk.""" - from pathlib import Path from datetime import datetime + from pathlib import Path - from aiida import orm - from aiida.tools.dumping.parser import DumpConfigParser - from aiida.tools.dumping.utils import prepare_dump_path from aiida.tools.dumping.base import BaseDumper + from aiida.tools.dumping.utils import prepare_dump_path profile = ctx.obj['profile'] @@ -360,7 +358,7 @@ def profile_mirror( echo.echo_critical(str(exc)) try: - with safeguard_file_path.open("r") as fhandle: + with safeguard_file_path.open('r') as fhandle: last_dump_time = datetime.fromisoformat(fhandle.readlines()[-1].strip().split()[-1]).astimezone() except IndexError: last_dump_time = None @@ -374,9 +372,9 @@ def profile_mirror( process_dumper = ProcessDumper( base=base_dumper, - include_inputs= include_inputs, - include_outputs= include_outputs, - include_attributes= include_attributes, + include_inputs=include_inputs, + include_outputs=include_outputs, + include_attributes=include_attributes, include_extras=include_extras, flat=flat, ) @@ -395,5 +393,5 @@ def profile_mirror( # Append the current time to the file last_dump_time = datetime.now().astimezone().isoformat() - with safeguard_file_path.open("a") as fhandle: - fhandle.write(f"Last profile mirror time: 
{last_dump_time}\n") + with safeguard_file_path.open('a') as fhandle: + fhandle.write(f'Last profile mirror time: {last_dump_time}\n') diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index c6031fc35a..48b73eee65 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -9,9 +9,10 @@ """Modules related to the dumping of AiiDA data.""" from .base import BaseDumper -from .profile import ProfileDumper from .group import GroupDumper from .process import ProcessDumper +from .profile import ProfileDumper + # from .collection import CollectionDumper -__all__ = ('BaseDumper', 'ProfileDumper', 'GroupDumper', 'ProcessDumper') #, 'CollectionDumper') +__all__ = ('BaseDumper', 'GroupDumper', 'ProcessDumper', 'ProfileDumper') # , 'CollectionDumper') diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py index 03d72c6f72..8a89e464d2 100644 --- a/src/aiida/tools/dumping/base.py +++ b/src/aiida/tools/dumping/base.py @@ -7,8 +7,8 @@ # For further information please visit http://www.aiida.net # ########################################################################### -from pathlib import Path from datetime import datetime +from pathlib import Path class BaseDumper: @@ -22,4 +22,4 @@ def __init__( self.dump_parent_path = dump_parent_path self.overwrite = overwrite self.incremental = incremental - self.last_dump_time = last_dump_time \ No newline at end of file + self.last_dump_time = last_dump_time diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index 6f6cb7c214..6ea4c960d3 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -14,18 +14,17 @@ import logging from collections import Counter from pathlib import Path -from datetime import datetime from aiida import orm -from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.base import BaseDumper +from aiida.tools.dumping.process import ProcessDumper logger = logging.getLogger(__name__) DEFAULT_PROCESSES_TO_DUMP = [orm.CalculationNode, orm.WorkflowNode] # DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] # DEFAULT_COLLECTIONS_TO_DUMP ?? -DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP # + DEFAULT_DATA_TO_DUMP +DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP # + DEFAULT_DATA_TO_DUMP # ! 
This class is instantiated once for every group, or once for the full profile @@ -36,7 +35,7 @@ def __init__( process_dumper: ProcessDumper | None = None, group: orm.Group | str | None = None, deduplicate: bool = True, - output_path: str | Path | None = None + output_path: str | Path | None = None, ): self.deduplicate = deduplicate @@ -53,7 +52,7 @@ def __init__( if process_dumper is None: process_dumper = ProcessDumper() - self.process_dumper: ProcessDumper = process_dumper + self.process_dumper: ProcessDumper = process_dumper if not hasattr(self, 'entity_counter'): self.create_entity_counter() @@ -121,7 +120,6 @@ def get_group_nodes(self): return nodes def _should_dump_processes(self) -> bool: - if not hasattr(self, 'group_nodes'): self.get_group_nodes() diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index f45c0692e4..00f8173d4b 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -31,9 +31,9 @@ import yaml +from aiida import orm from aiida.common import LinkType from aiida.common.exceptions import NotExistentAttributeError -from aiida import orm from aiida.orm.utils import LinkTriple from aiida.tools.archive.exceptions import ExportValidationError from aiida.tools.dumping.base import BaseDumper @@ -216,7 +216,9 @@ def dump( if output_path is None: output_path = self._generate_default_dump_path(process_node=process_node) - prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) + prepare_dump_path( + path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + ) if isinstance(process_node, orm.CalculationNode): self._dump_calculation( @@ -251,7 +253,9 @@ def _dump_workflow( :param io_dump_paths: Custom subdirectories for `CalculationNode` s, defaults to None """ - prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) + prepare_dump_path( + path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + ) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) called_links = workflow_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() @@ -305,7 +309,9 @@ def _dump_calculation( Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] """ - prepare_dump_path(path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental) + prepare_dump_path( + path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + ) self._dump_node_yaml(process_node=calculation_node, output_path=output_path) io_dump_mapping = self._generate_calculation_io_mapping(io_dump_paths=io_dump_paths) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 5b88fe8d55..c343e1c617 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -10,17 +10,18 @@ # TODO: Use `batch_iter` from aiida.tools.archive.common from __future__ import annotations -from pathlib import Path + import logging + from aiida import orm -from aiida.cmdline.params.options.main import ORGANIZE_BY_GROUPS +from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper -from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.group import GroupDumper -from aiida.manage.configuration.profile import Profile +from aiida.tools.dumping.process import 
ProcessDumper logger = logging.getLogger(__name__) + class ProfileDumper: def __init__( self, @@ -29,7 +30,7 @@ def __init__( process_dumper: ProcessDumper | None = None, organize_by_groups: bool = True, deduplicate: bool = True, - groups: list[str | orm.Group] | None = None, + groups: list[str | orm.Group] | None = None, dump_processes: bool = True, ): self.organize_by_groups = organize_by_groups @@ -43,27 +44,23 @@ def __init__( if process_dumper is None: process_dumper = ProcessDumper() - self.process_dumper: ProcessDumper = process_dumper + self.process_dumper: ProcessDumper = process_dumper if not groups: groups = orm.QueryBuilder().append(orm.Group).all(flat=True) self.groups = groups - def dump(self): - self._dump_processes_not_in_any_group() self._dump_processes_per_group() - def _dump_processes_not_in_any_group(self): - # === Dump the data that is not associated with any group === if self.organize_by_groups: output_path = self.base_dumper.dump_parent_path / 'no-group' else: output_path = self.base_dumper.dump_parent_path - + no_group_dumper = GroupDumper( base_dumper=self.base_dumper, process_dumper=self.process_dumper, @@ -71,9 +68,8 @@ def _dump_processes_not_in_any_group(self): deduplicate=self.deduplicate, output_path=output_path, ) - + if self.dump_processes and no_group_dumper._should_dump_processes(): - logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') no_group_dumper._dump_processes() @@ -82,7 +78,6 @@ def _dump_processes_per_group(self): # === Dump data per-group if Groups exist in profile or are selected === for group in self.groups: - if self.organize_by_groups: output_path = self.base_dumper.dump_parent_path / group.label else: From 64b715eb9ba13c5496a53fd1140acd3a8989453e Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 27 Jan 2025 17:52:14 +0100 Subject: [PATCH 07/27] Symlinking of workflows between groups works. --- src/aiida/cmdline/commands/cmd_process.py | 8 -- src/aiida/cmdline/commands/cmd_profile.py | 14 -- src/aiida/cmdline/params/options/main.py | 57 +------- src/aiida/tools/dumping/group.py | 157 +++++++++++----------- src/aiida/tools/dumping/parser.py | 8 +- src/aiida/tools/dumping/process.py | 5 +- src/aiida/tools/dumping/profile.py | 25 +++- 7 files changed, 107 insertions(+), 167 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 30e75a4295..9984b9ee7b 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -567,10 +567,6 @@ def process_repair(manager, broker, dry_run): @options.INCLUDE_OUTPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() -@options.ALSO_RAW() -@options.ALSO_RICH() -@options.RICH_SPEC() -@options.RICH_DUMP_ALL() @click.option( '--dump-unsealed', is_flag=True, @@ -592,10 +588,6 @@ def process_dump( include_extras, dump_unsealed, incremental, - also_raw, - also_rich, - rich_spec, - rich_dump_all, ) -> None: """Dump process input and output files to disk. diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 4c4926399b..9008ccabe9 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -314,20 +314,6 @@ def profile_mirror( profile = ctx.obj['profile'] - # if nodes and groups: - # echo.echo_critical('`nodes` and `groups` specified. Set only one.') - - # if dump_config_file is None: - - # # TODO: Also allow for mixing. 
Currently one can _only_ specify either the config file, or the arguments on the - # # TODO: command line - # else: - # kwarg_dicts_from_config = DumpConfigParser.parse_config_file(dump_config_file) - - # general_kwargs = kwarg_dicts_from_config['general_kwargs'] - # processdumper_kwargs = kwarg_dicts_from_config['processdumper_kwargs'] - # datadumper_kwargs = kwarg_dicts_from_config['datadumper_kwargs'] - incremental = not overwrite if path is None: diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index e7a18eedc1..8ee982ad1f 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -27,8 +27,6 @@ 'ALL', 'ALL_STATES', 'ALL_USERS', - 'ALSO_RAW', - 'ALSO_RICH', 'APPEND_TEXT', 'ARCHIVE_FORMAT', 'BROKER_HOST', @@ -46,7 +44,6 @@ 'COMPUTERS', 'CONFIG_FILE', 'DATA', - 'DATA_HIDDEN', 'DATUM', 'DB_BACKEND', 'DB_ENGINE', @@ -62,7 +59,6 @@ 'DICT_KEYS', 'DRY_RUN', 'DUMP_CONFIG_FILE', - 'DUMP_DATA', 'DUMP_PROCESSES', 'EXIT_STATUS', 'EXPORT_FORMAT', @@ -108,8 +104,6 @@ 'PROJECT', 'RAW', 'REPOSITORY_PATH', - 'RICH_DUMP_ALL', - 'RICH_SPEC', 'SCHEDULER', 'SILENT', 'SORT', @@ -801,7 +795,7 @@ def set_log_level(ctx, _param, value): DEDUPLICATE = OverridableOption( '--deduplicate/--no-deduplicate', is_flag=True, - default=False, + default=True, show_default=True, help='', ) @@ -814,46 +808,6 @@ def set_log_level(ctx, _param, value): help='Dump process data.', ) -DUMP_DATA = OverridableOption( - '--dump-data/--no-dump-data', - is_flag=True, - default=False, - type=bool, - show_default=True, - help='Dump data nodes in a dedicated directory.', -) - -DATA_HIDDEN = OverridableOption( - '--data-hidden/--data-non-hidden', - is_flag=True, - default=True, - show_default=True, - help='Dump all `orm.Data` in the hidden directory and link to there.', -) - -ALSO_RAW = OverridableOption( - '--also-raw/--no-also-raw', - is_flag=True, - default=False, - show_default=True, - help='Dump the `attributes` of all nodes related to the Process.', -) - -ALSO_RICH = OverridableOption( - '--also-rich/--no-also-rich', - is_flag=True, - default=False, - show_default=True, - help='Dump also nicely prepared outputs, e.g. CIF for structures or PDF image for bands.', -) - -RICH_SPEC = OverridableOption( - '--rich-spec', - default=None, - type=str, - help='Specifications for rich data dumping.', -) - DUMP_CONFIG_FILE = OverridableOption( '--dump-config-file', default=None, @@ -861,15 +815,6 @@ def set_log_level(ctx, _param, value): help='Provide dumping options via a config file in YAML format.', ) -RICH_DUMP_ALL = OverridableOption( - '--rich-dump-all/--no-rich-dump-all', - default=True, - is_flag=True, - type=bool, - show_default=True, - help='If a rich specification is provided, this triggers if all other Data nodes should also be dumped or not.', -) - ORGANIZE_BY_GROUPS = OverridableOption( '--organize-by-groups/--no-organize-by-groups', default=True, diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index 6ea4c960d3..e7032ef823 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -10,9 +10,10 @@ from __future__ import annotations +import os +from collections import defaultdict import itertools as it import logging -from collections import Counter from pathlib import Path from aiida import orm @@ -27,7 +28,6 @@ DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP # + DEFAULT_DATA_TO_DUMP -# ! 
This class is instantiated once for every group, or once for the full profile class GroupDumper: def __init__( self, @@ -36,6 +36,7 @@ def __init__( group: orm.Group | str | None = None, deduplicate: bool = True, output_path: str | Path | None = None, + global_log_dict: dict[str, Path] | None = None ): self.deduplicate = deduplicate @@ -45,6 +46,7 @@ def __init__( self.group = group self.output_path = output_path + self.global_log_dict = global_log_dict if base_dumper is None: base_dumper = BaseDumper() @@ -54,36 +56,14 @@ def __init__( process_dumper = ProcessDumper() self.process_dumper: ProcessDumper = process_dumper - if not hasattr(self, 'entity_counter'): - self.create_entity_counter() + self.nodes = self._get_nodes() + self.log_dict = {} - def create_entity_counter(self) -> Counter: - entity_counter = Counter() - if self.group is not None: - # If the group only has one WorkChain assigned to it, this will only return a count of 1 for the - # WorkChainNode, nothing more, that is, it doesn't work recursively. - nodes = self.group.nodes - # elif self.nodes is not None: - # nodes = self.nodes - else: - nodes = orm.QueryBuilder().append(orm.Node).all(flat=True) - - # Iterate over all the entities in the group - for node in nodes: - # Count the type string of each entity - entity_counter[node.__class__] += 1 - - # Convert the Counter to a dictionary (optional) - self.entity_counter = entity_counter - - return entity_counter + def _should_dump_processes(self) -> bool: - def get_group_nodes(self): - # if self.nodes: - # self.collection_nodes = self.nodes + return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 - # if hasattr(self, 'collection_nodes'): - # return self.collection_nodes + def _get_nodes(self): # Get all nodes that are in the group if self.group is not None: @@ -92,7 +72,7 @@ def get_group_nodes(self): # Get all nodes that are _not_ in any group else: groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - nodes_in_groups = [node.pk for group in groups for node in group.nodes] + nodes_in_groups = [node.uuid for group in groups for node in group.nodes] # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice sub_nodes_in_groups = list( @@ -104,66 +84,24 @@ def get_group_nodes(self): ] ) ) - sub_nodes_in_groups = [node.pk for node in sub_nodes_in_groups] + sub_nodes_in_groups = [node.uuid for node in sub_nodes_in_groups] nodes_in_groups = nodes_in_groups + sub_nodes_in_groups - profile_nodes = orm.QueryBuilder().append(orm.Node, project=['pk']).all(flat=True) + profile_nodes = orm.QueryBuilder().append(orm.Node, project=['uuid']).all(flat=True) nodes = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] nodes = [orm.load_node(node) for node in nodes] if self.base_dumper.last_dump_time is not None: - # breakpoint() nodes = [node for node in nodes if node.mtime > self.base_dumper.last_dump_time] - self.collection_nodes = nodes - return nodes - def _should_dump_processes(self) -> bool: - if not hasattr(self, 'group_nodes'): - self.get_group_nodes() - - return len([node for node in self.collection_nodes if isinstance(node, orm.ProcessNode)]) > 0 - - def _dump_calculations(self, calculations): - for calculation in calculations: - calculation_dumper = self.process_dumper - - calculation_dump_path = ( - self.output_path - / 'calculations' - / 
calculation_dumper._generate_default_dump_path(process_node=calculation, prefix='') - ) + def _get_processes(self): - if calculation.caller is None or (calculation.caller is not None and self.deduplicate): - calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - - def _dump_workflows(self, workflows): - # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) - for workflow in workflows: - # if workflow.pk == 47: - # breakpoint() - - workflow_dumper = self.process_dumper - - # TODO: If the GroupDumper is called from somewhere else outside, prefix the path with `groups/core` etc - workflow_dump_path = ( - self.output_path - / 'workflows' - / workflow_dumper._generate_default_dump_path(process_node=workflow, prefix=None) - ) - # logger.report(f'WORKFLOW_DUMP_PATH: {workflow_dump_path}') - workflow_dumper._dump_workflow( - workflow_node=workflow, - output_path=workflow_dump_path, - link_calculations=self.deduplicate, - link_calculations_dir=self.output_path / 'calculations', - ) - - def _dump_processes(self): - nodes = self.get_group_nodes() + nodes = self.nodes workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] + # Make sure that only top-level workflows are dumped in their own directories when de-duplcation is enabled if self.deduplicate: workflows = [workflow for workflow in workflows if workflow.caller is None] @@ -177,10 +115,69 @@ def _dump_processes(self): calculations = set([node for node in nodes if isinstance(node, orm.CalculationNode)] + called_calculations) - if len(workflows) + len(calculations) == 0: + self.calculations = calculations + self.workflows = workflows + + self.log_dict = { + 'calculations': {}, + # dict.fromkeys([c.uuid for c in self.calculations], None), + 'workflows': dict.fromkeys([w.uuid for w in workflows], None) + } + + def _dump_processes(self): + + self._get_processes() + + if len(self.workflows) + len(self.calculations) == 0: + logger.report("No workflows or calculations to dump in group.") return self.output_path.mkdir(exist_ok=True, parents=True) - self._dump_calculations(calculations=calculations) - self._dump_workflows(workflows=workflows) + self._dump_calculations() + self._dump_workflows() + + def _dump_calculations(self): + + calculations_path = self.output_path / 'calculations' + + for calculation in self.calculations: + calculation_dumper = self.process_dumper + + calculation_dump_path = ( + calculations_path / calculation_dumper._generate_default_dump_path(process_node=calculation, prefix='') + ) + + if calculation.caller is None: + # or (calculation.caller is not None and not self.deduplicate): + calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) + + self.log_dict['calculations'][calculation.uuid] = calculation_dump_path + + def _dump_workflows(self): + # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) + workflow_path = self.output_path / 'workflows' + workflow_path.mkdir(exist_ok=True, parents=True) + + for workflow in self.workflows: + + workflow_dumper = self.process_dumper + + workflow_dump_path = ( + workflow_path / workflow_dumper._generate_default_dump_path(process_node=workflow, prefix=None) + ) + + if self.deduplicate and workflow.uuid in self.global_log_dict["workflows"].keys(): + os.symlink( + src=self.global_log_dict["workflows"][workflow.uuid], + dst=workflow_dump_path, + ) + else: + 
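+                # Workflow not yet dumped for any other group: dump it in full under this group's 'workflows' directory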
workflow_dumper._dump_workflow( + workflow_node=workflow, + output_path=workflow_dump_path, + # link_calculations=not self.deduplicate, + # link_calculations_dir=self.output_path / 'calculations', + ) + + self.log_dict['workflows'][workflow.uuid] = workflow_dump_path diff --git a/src/aiida/tools/dumping/parser.py b/src/aiida/tools/dumping/parser.py index 96412eb421..c895d2717d 100644 --- a/src/aiida/tools/dumping/parser.py +++ b/src/aiida/tools/dumping/parser.py @@ -27,10 +27,10 @@ def parse_config_file(config_file: str | Path | None) -> dict: 'flat': config.get('flat', False), } - datadumper_kwargs = { - 'also_raw': config.get('also_raw', False), - 'also_rich': config.get('also_rich', True), - } + # datadumper_kwargs = { + # 'also_raw': config.get('also_raw', False), + # 'also_rich': config.get('also_rich', True), + # } collection_kwargs = { 'should_dump_processes': config.get('dump_processes', True), diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index 00f8173d4b..92102332fd 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -245,6 +245,7 @@ def _dump_workflow( io_dump_paths: List[str | Path] | None = None, link_calculations: bool = False, link_calculations_dir: Path | None = None, + workflow_symlink: Path | None = None, ) -> None: """Recursive function to traverse a `WorkflowNode` and dump its `CalculationNode` s. @@ -254,7 +255,9 @@ def _dump_workflow( """ prepare_dump_path( - path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + path_to_validate=output_path, + overwrite=self.base.overwrite, + incremental=self.base.incremental, ) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index c343e1c617..5093178297 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -12,7 +12,11 @@ from __future__ import annotations import logging +import itertools as it +from rich.pretty import pprint +from pathlib import Path +from collections import Counter from aiida import orm from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper @@ -37,6 +41,7 @@ def __init__( self.deduplicate = deduplicate self.profile = profile self.dump_processes = dump_processes + self.groups = groups if base_dumper is None: base_dumper = BaseDumper() @@ -46,12 +51,14 @@ def __init__( process_dumper = ProcessDumper() self.process_dumper: ProcessDumper = process_dumper - if not groups: - groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - self.groups = groups + # self.log_dict: dict[dict[str, Path]] = {} + self.log_dict= {'calculations': {}, 'workflows': {}} def dump(self): - self._dump_processes_not_in_any_group() + if not self.groups: + self._dump_processes_not_in_any_group() + self.groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + self._dump_processes_per_group() def _dump_processes_not_in_any_group(self): @@ -67,6 +74,7 @@ def _dump_processes_not_in_any_group(self): group=None, deduplicate=self.deduplicate, output_path=output_path, + global_log_dict=self.log_dict, ) if self.dump_processes and no_group_dumper._should_dump_processes(): @@ -74,6 +82,8 @@ def _dump_processes_not_in_any_group(self): no_group_dumper._dump_processes() + self.log_dict.update(no_group_dumper.log_dict) + def _dump_processes_per_group(self): # === Dump data per-group if Groups exist in profile or are selected === @@ 
-89,9 +99,16 @@ def _dump_processes_per_group(self): group=group, deduplicate=self.deduplicate, output_path=output_path, + global_log_dict=self.log_dict, ) if self.dump_processes and group_dumper._should_dump_processes(): logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') group_dumper._dump_processes() + for entity in ['calculations', 'workflows']: + self.log_dict[entity].update(group_dumper.log_dict[entity]) + + pprint(group_dumper.log_dict) + pprint(self.log_dict) + From a8c5aacd4095bf83eb218c23da6086d502532585 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 16:52:34 +0000 Subject: [PATCH 08/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/tools/dumping/group.py | 27 ++++++++++----------------- src/aiida/tools/dumping/profile.py | 7 ++----- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index e7032ef823..ee7c64f136 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -10,10 +10,9 @@ from __future__ import annotations -import os -from collections import defaultdict import itertools as it import logging +import os from pathlib import Path from aiida import orm @@ -36,7 +35,7 @@ def __init__( group: orm.Group | str | None = None, deduplicate: bool = True, output_path: str | Path | None = None, - global_log_dict: dict[str, Path] | None = None + global_log_dict: dict[str, Path] | None = None, ): self.deduplicate = deduplicate @@ -60,11 +59,9 @@ def __init__( self.log_dict = {} def _should_dump_processes(self) -> bool: - return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 def _get_nodes(self): - # Get all nodes that are in the group if self.group is not None: nodes = list(self.group.nodes) @@ -97,7 +94,6 @@ def _get_nodes(self): return nodes def _get_processes(self): - nodes = self.nodes workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] @@ -121,15 +117,14 @@ def _get_processes(self): self.log_dict = { 'calculations': {}, # dict.fromkeys([c.uuid for c in self.calculations], None), - 'workflows': dict.fromkeys([w.uuid for w in workflows], None) + 'workflows': dict.fromkeys([w.uuid for w in workflows], None), } def _dump_processes(self): - self._get_processes() if len(self.workflows) + len(self.calculations) == 0: - logger.report("No workflows or calculations to dump in group.") + logger.report('No workflows or calculations to dump in group.') return self.output_path.mkdir(exist_ok=True, parents=True) @@ -138,14 +133,13 @@ def _dump_processes(self): self._dump_workflows() def _dump_calculations(self): - calculations_path = self.output_path / 'calculations' for calculation in self.calculations: calculation_dumper = self.process_dumper - calculation_dump_path = ( - calculations_path / calculation_dumper._generate_default_dump_path(process_node=calculation, prefix='') + calculation_dump_path = calculations_path / calculation_dumper._generate_default_dump_path( + process_node=calculation, prefix='' ) if calculation.caller is None: @@ -160,16 +154,15 @@ def _dump_workflows(self): workflow_path.mkdir(exist_ok=True, parents=True) for workflow in self.workflows: - workflow_dumper = self.process_dumper - workflow_dump_path = ( - workflow_path / workflow_dumper._generate_default_dump_path(process_node=workflow, prefix=None) + 
workflow_dump_path = workflow_path / workflow_dumper._generate_default_dump_path( + process_node=workflow, prefix=None ) - if self.deduplicate and workflow.uuid in self.global_log_dict["workflows"].keys(): + if self.deduplicate and workflow.uuid in self.global_log_dict['workflows'].keys(): os.symlink( - src=self.global_log_dict["workflows"][workflow.uuid], + src=self.global_log_dict['workflows'][workflow.uuid], dst=workflow_dump_path, ) else: diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 5093178297..282bad1372 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -12,11 +12,9 @@ from __future__ import annotations import logging -import itertools as it + from rich.pretty import pprint -from pathlib import Path -from collections import Counter from aiida import orm from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper @@ -52,7 +50,7 @@ def __init__( self.process_dumper: ProcessDumper = process_dumper # self.log_dict: dict[dict[str, Path]] = {} - self.log_dict= {'calculations': {}, 'workflows': {}} + self.log_dict = {'calculations': {}, 'workflows': {}} def dump(self): if not self.groups: @@ -111,4 +109,3 @@ def _dump_processes_per_group(self): pprint(group_dumper.log_dict) pprint(self.log_dict) - From fbdf478aa813cd39b5254f4e746a85860a584f72 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 28 Jan 2025 11:14:22 +0100 Subject: [PATCH 09/27] Fix `verdi process dump` tests - Use the `BaseDumper` instead of passing arguments to the `ProcessDumper` - Append PKs to the test output paths and use `aiida_profile_clean` fixture for reproducible results --- src/aiida/cmdline/commands/cmd_process.py | 24 ++++--- src/aiida/cmdline/commands/cmd_profile.py | 8 +-- src/aiida/tools/dumping/process.py | 70 ++++++++++-------- .../{test_processes.py => test_process.py} | 71 ++++++++++++------- 4 files changed, 100 insertions(+), 73 deletions(-) rename tests/tools/dumping/{test_processes.py => test_process.py} (88%) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 9984b9ee7b..395c74de5e 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -606,21 +606,23 @@ def process_dump( """ from aiida.tools.archive.exceptions import ExportValidationError + from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.process import ProcessDumper - processdumper_kwargs = { - 'include_inputs': include_inputs, - 'include_outputs': include_outputs, - 'include_attributes': include_attributes, - 'include_extras': include_extras, - 'flat': flat, - 'dump_unsealed': dump_unsealed, - 'incremental': incremental, - } + base_dumper = BaseDumper( + dump_parent_path=path, + overwrite=overwrite, + incremental=incremental, + ) process_dumper = ProcessDumper( - overwrite=overwrite, - **processdumper_kwargs, + base_dumper=base_dumper, + include_inputs=include_inputs, + include_outputs=include_outputs, + include_attributes=include_attributes, + include_extras=include_extras, + flat=flat, + dump_unsealed=dump_unsealed, ) try: diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 9008ccabe9..4f6fc99b60 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -330,7 +330,7 @@ def profile_mirror( else: echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") - 
SAFEGUARD_FILE: str = '.verdi_profile_mirror' + SAFEGUARD_FILE: str = '.verdi_profile_mirror' # noqa: N806 safeguard_file_path: Path = path / SAFEGUARD_FILE try: @@ -357,7 +357,7 @@ def profile_mirror( ) process_dumper = ProcessDumper( - base=base_dumper, + base_dumper=base_dumper, include_inputs=include_inputs, include_outputs=include_outputs, include_attributes=include_attributes, @@ -378,6 +378,6 @@ def profile_mirror( profile_dumper.dump() # Append the current time to the file - last_dump_time = datetime.now().astimezone().isoformat() + last_dump_time = datetime.now().astimezone() with safeguard_file_path.open('a') as fhandle: - fhandle.write(f'Last profile mirror time: {last_dump_time}\n') + fhandle.write(f'Last profile mirror time: {last_dump_time.isoformat()}\n') diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index 92102332fd..2ed2aa894b 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -27,7 +27,6 @@ import os from pathlib import Path from types import SimpleNamespace -from typing import List import yaml @@ -45,24 +44,29 @@ class ProcessDumper: def __init__( self, - base: BaseDumper = BaseDumper(), - flat: bool = False, + base_dumper: BaseDumper | None = None, include_inputs: bool = True, include_outputs: bool = False, include_attributes: bool = True, include_extras: bool = True, + flat: bool = False, dump_unsealed: bool = False, ) -> None: - self.flat = flat - self.base = base self.include_inputs = include_inputs self.include_outputs = include_outputs self.include_attributes = include_attributes self.include_extras = include_extras + self.flat = flat self.dump_unsealed = dump_unsealed + if base_dumper is None: + base_dumper = BaseDumper() + self.base_dumper: BaseDumper = base_dumper + @staticmethod - def _generate_default_dump_path(process_node: orm.ProcessNode, prefix: str = 'dump') -> Path: + def _generate_default_dump_path( + process_node: orm.ProcessNode, prefix: str | None = 'dump', append_pk: bool = True + ) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default @@ -74,17 +78,20 @@ def _generate_default_dump_path(process_node: orm.ProcessNode, prefix: str = 'du entities_to_dump = [] - if prefix: - # No '' and None + # No '' and None + if prefix is not None: entities_to_dump += [prefix] try: - entities_to_dump += [process_node.process_label] + if process_node.process_label is not None: + entities_to_dump.append(process_node.process_label) except AttributeError: # This case came up during testing, not sure how relevant it actually is - entities_to_dump += [process_node.process_type] + if process_node.process_type is not None: + entities_to_dump.append(process_node.process_type) - entities_to_dump += [str(process_node.pk)] + if append_pk: + entities_to_dump += [str(process_node.pk)] return Path('-'.join(entities_to_dump)) @@ -187,7 +194,7 @@ def dump( self, process_node: orm.ProcessNode, output_path: Path | None, - io_dump_paths: List[str | Path] | None = None, + io_dump_paths: list[str | Path] | None = None, ) -> Path: """Dumps all data involved in a `ProcessNode`, including its outgoing links. 
@@ -217,7 +224,7 @@ def dump( output_path = self._generate_default_dump_path(process_node=process_node) prepare_dump_path( - path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + path_to_validate=output_path, overwrite=self.base_dumper.overwrite, incremental=self.base_dumper.incremental ) if isinstance(process_node, orm.CalculationNode): @@ -242,7 +249,7 @@ def _dump_workflow( self, workflow_node: orm.WorkflowNode, output_path: Path, - io_dump_paths: List[str | Path] | None = None, + io_dump_paths: list[str | Path] | None = None, link_calculations: bool = False, link_calculations_dir: Path | None = None, workflow_symlink: Path | None = None, @@ -256,8 +263,8 @@ def _dump_workflow( prepare_dump_path( path_to_validate=output_path, - overwrite=self.base.overwrite, - incremental=self.base.incremental, + overwrite=self.base_dumper.overwrite, + incremental=self.base_dumper.incremental, ) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) @@ -289,11 +296,11 @@ def _dump_workflow( output_path=child_output_path, io_dump_paths=io_dump_paths, ) - else: + elif link_calculations_dir is not None: + calculation_dump_path = link_calculations_dir / ProcessDumper._generate_default_dump_path( + process_node=child_node, prefix='' + ) try: - calculation_dump_path = link_calculations_dir / ProcessDumper._generate_default_dump_path( - process_node=child_node, prefix='' - ) os.symlink(calculation_dump_path, child_output_path) except FileExistsError: pass @@ -302,7 +309,7 @@ def _dump_calculation( self, calculation_node: orm.CalculationNode, output_path: Path, - io_dump_paths: List[str | Path] | None = None, + io_dump_paths: list[str | Path] | None = None, ) -> None: """Dump the contents of a `CalculationNode` to a specified output path. @@ -313,7 +320,7 @@ def _dump_calculation( """ prepare_dump_path( - path_to_validate=output_path, overwrite=self.base.overwrite, incremental=self.base.incremental + path_to_validate=output_path, overwrite=self.base_dumper.overwrite, incremental=self.base_dumper.incremental ) self._dump_node_yaml(process_node=calculation_node, output_path=output_path) @@ -350,12 +357,12 @@ def _dump_calculation( def _dump_calculation_io_files( self, parent_path: Path, - link_triples: orm.LinkManager | List[orm.LinkTriple], + link_triples: orm.LinkManager | list[orm.LinkTriple], ): """Small helper function to dump linked input/output nodes of a `orm.CalculationNode`. :param parent_path: Parent directory for dumping the linked node contents. - :param link_triples: List of link triples. + :param link_triples: list of link triples. """ for link_triple in link_triples: @@ -369,7 +376,7 @@ def _dump_calculation_io_files( link_triple.node.base.repository.copy_tree(linked_node_path.resolve()) - def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | None = None) -> SimpleNamespace: + def _generate_calculation_io_mapping(self, io_dump_paths: list[str | Path] | None = None) -> SimpleNamespace: """Helper function to generate mapping for entities dumped for each `CalculationNode`. This is to avoid exposing AiiDA terminology, like `repository` to the user, while keeping track of which @@ -380,8 +387,8 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non :return: SimpleNamespace mapping. 
""" - aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs'] - default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] + aiida_entities_to_dump: list[str] = ['repository', 'retrieved', 'inputs', 'outputs'] + default_calculation_io_dump_paths: list[str | Path] = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] if self.flat and io_dump_paths is None: logger.info( 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' @@ -390,21 +397,22 @@ def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | Non return SimpleNamespace(**dict(zip(aiida_entities_to_dump, empty_calculation_io_dump_paths))) - elif not self.flat and io_dump_paths is None: + if not self.flat and io_dump_paths is None: logger.info( 'Flat set to False but no `io_dump_paths` provided. ' + f'Will use the defaults {default_calculation_io_dump_paths}.' ) - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths))) + io_dump_paths = default_calculation_io_dump_paths elif self.flat: logger.info('Flat set to True but `io_dump_paths` provided. These will be used, but `inputs` not nested.') - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) else: logger.info( 'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.' ) - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) # type: ignore[arg-type] + + assert io_dump_paths is not None + return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) def _dump_node_yaml( self, diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_process.py similarity index 88% rename from tests/tools/dumping/test_processes.py rename to tests/tools/dumping/test_process.py index e2f5633939..47d39ba75e 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_process.py @@ -15,6 +15,7 @@ import pytest +from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.process import ProcessDumper # Non-AiiDA variables @@ -38,6 +39,7 @@ # Only test top-level actions, like path and README creation # Other things tested via `_dump_workflow` and `_dump_calculation` +@pytest.mark.usefixtures('aiida_profile_clean') def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): from aiida.tools.archive.exceptions import ExportValidationError @@ -59,6 +61,7 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path assert return_path == dump_parent_path +@pytest.mark.usefixtures('aiida_profile_clean') def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, tmp_path): # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` dump_parent_path = tmp_path / 'wc-workflow_dump-test-io' @@ -68,15 +71,16 @@ def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - input_path = '01-sub_workflow/01-calculation/inputs/file.txt' - singlefiledata_path = '01-sub_workflow/01-calculation/node_inputs/singlefile/file.txt' - folderdata_path = '01-sub_workflow/01-calculation/node_inputs/folderdata/relative_path/file.txt' - arraydata_path = '01-sub_workflow/01-calculation/node_inputs/arraydata/default.npy' + base_path = 
Path('01-sub_workflow-8/01-calculation-9') + input_path = base_path / 'inputs/file.txt' + singlefiledata_path = base_path / 'node_inputs/singlefile/file.txt' + folderdata_path = base_path / 'node_inputs/folderdata/relative_path/file.txt' + arraydata_path = base_path / 'node_inputs/arraydata/default.npy' node_metadata_paths = [ node_metadata_file, - f'01-sub_workflow/{node_metadata_file}', - f'01-sub_workflow/01-calculation/{node_metadata_file}', - f'01-sub_workflow/02-calculation/{node_metadata_file}', + f'01-sub_workflow-8/{node_metadata_file}', + f'{base_path}/{node_metadata_file}', + f'01-sub_workflow-8/02-calculation-10/{node_metadata_file}', ] expected_files = [input_path, singlefiledata_path, folderdata_path, arraydata_path, *node_metadata_paths] @@ -89,14 +93,14 @@ def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, process_dumper = ProcessDumper(flat=True) process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - input_path = '01-sub_workflow/01-calculation/file.txt' - arraydata_path = '01-sub_workflow/01-calculation/default.npy' - folderdata_path = '01-sub_workflow/01-calculation/relative_path/file.txt' + input_path = base_path / 'file.txt' + arraydata_path = base_path / 'default.npy' + folderdata_path = base_path / 'relative_path/file.txt' node_metadata_paths = [ node_metadata_file, - f'01-sub_workflow/{node_metadata_file}', - f'01-sub_workflow/01-calculation/{node_metadata_file}', - f'01-sub_workflow/02-calculation/{node_metadata_file}', + f'01-sub_workflow-8/{node_metadata_file}', + f'{base_path}/{node_metadata_file}', + f'01-sub_workflow-8/02-calculation-10/{node_metadata_file}', ] expected_files = [input_path, folderdata_path, arraydata_path, *node_metadata_paths] @@ -105,21 +109,27 @@ def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, assert all([expected_file.is_file() for expected_file in expected_files]) +@pytest.mark.usefixtures('aiida_profile_clean') def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): dump_parent_path = tmp_path / 'wc-dump-test-multiply-add' process_dumper = ProcessDumper() wc_node = generate_workchain_multiply_add() process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + arithmetic_add_path = dump_parent_path / '02-ArithmeticAddCalculation-8' + multiply_path = dump_parent_path / '01-multiply-6' + input_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / inputs_relpath / input_file for input_file in input_files - ] - input_files += [dump_parent_path / '01-multiply' / inputs_relpath / 'source_file'] - output_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / outputs_relpath / output_file for output_file in output_files + '_aiidasubmit.sh', + 'aiida.in', + '.aiida/job_tmpl.json', + '.aiida/calcinfo.json', ] + output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + + input_files = [arithmetic_add_path / inputs_relpath / input_file for input_file in input_files] + input_files += [multiply_path / inputs_relpath / 'source_file'] + output_files = [arithmetic_add_path / outputs_relpath / output_file for output_file in output_files] # No node_inputs contained in MultiplyAddWorkChain assert all([input_file.is_file() for input_file in input_files]) @@ -130,7 +140,7 @@ def test_dump_multiply_add(tmp_path, 
generate_workchain_multiply_add): process_dumper = ProcessDumper(flat=True) process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - multiply_file = dump_parent_path / '01-multiply' / 'source_file' + multiply_file = dump_parent_path / '01-multiply-6' / 'source_file' arithmetic_add_files = [ '_aiidasubmit.sh', 'aiida.in', @@ -141,7 +151,7 @@ def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): 'aiida.out', ] arithmetic_add_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / arithmetic_add_file + dump_parent_path / '02-ArithmeticAddCalculation-8' / arithmetic_add_file for arithmetic_add_file in arithmetic_add_files ] @@ -202,7 +212,8 @@ def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io): """Tests the ProcessDumper for the overwrite and incremental option.""" dump_parent_path = tmp_path / 'cj-dump-test-overwrite' - process_dumper = ProcessDumper(overwrite=False, incremental=False) + base_dumper = BaseDumper(overwrite=False, incremental=False) + process_dumper = ProcessDumper(base_dumper=base_dumper) calculation_node = generate_calculation_node_io() calculation_node.seal() # Create safeguard file to mock existing dump directory @@ -212,7 +223,8 @@ def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io): with pytest.raises(FileExistsError): process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) # With overwrite option true no error is raised and the dumping can run through. - process_dumper = ProcessDumper(overwrite=True, incremental=False) + base_dumper = BaseDumper(overwrite=True, incremental=False) + process_dumper = ProcessDumper(base_dumper=base_dumper) process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) assert (dump_parent_path / inputs_relpath / filename).is_file() @@ -221,7 +233,8 @@ def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io): # Incremental also does work dump_parent_path.mkdir() (dump_parent_path / '.aiida_node_metadata.yaml').touch() - process_dumper = ProcessDumper(overwrite=False, incremental=True) + base_dumper = BaseDumper(overwrite=False, incremental=True) + process_dumper = ProcessDumper(base_dumper=base_dumper) process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) assert (dump_parent_path / inputs_relpath / filename).is_file() @@ -235,6 +248,7 @@ def test_dump_calculation_no_inputs(tmp_path, generate_calculation_node_io): assert not (dump_parent_path / node_inputs_relpath).is_dir() +@pytest.mark.usefixtures('aiida_profile_clean') def test_dump_calculation_add(tmp_path, generate_calculation_node_add): dump_parent_path = tmp_path / 'cj-dump-test-add' @@ -314,6 +328,7 @@ def test_prepare_dump_path(tmp_path): assert test_file.is_file() +@pytest.mark.usefixtures('aiida_profile_clean') def test_generate_default_dump_path( generate_calculation_node_add, generate_workchain_multiply_add, @@ -343,6 +358,7 @@ def test_generate_calculation_io_mapping(): assert calculation_io_mapping.outputs == 'node_outputs_' +@pytest.mark.usefixtures('aiida_profile_clean') def test_generate_child_node_label( generate_workchain_multiply_add, generate_calculation_node_io, generate_workchain_node_io ): @@ -364,7 +380,7 @@ def test_generate_child_node_label( for index, output_node in enumerate(output_triples) ] ) - assert output_paths == ['00-sub_workflow', '01-calculation'] + 
assert output_paths == ['00-sub_workflow-5', '01-calculation-6'] # Check with multiply_add workchain node multiply_add_node = generate_workchain_multiply_add() @@ -374,7 +390,8 @@ def test_generate_child_node_label( output_paths = sorted( [process_dumper._generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] ) - assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] + print(output_paths) + assert output_paths == ['00-multiply-12', '01-ArithmeticAddCalculation-14', '02-result-17'] def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workchain_multiply_add): From b98c61adbb66618aedae1a6d7446b7b278ed73c2 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 28 Jan 2025 12:58:22 +0100 Subject: [PATCH 10/27] Fix mypy complaints --- src/aiida/cmdline/params/options/main.py | 7 -- src/aiida/tools/dumping/base.py | 4 +- src/aiida/tools/dumping/group.py | 74 +++++++-------- src/aiida/tools/dumping/logger.py | 18 ++++ src/aiida/tools/dumping/process.py | 4 +- src/aiida/tools/dumping/profile.py | 63 ++++++------- src/aiida/tools/dumping/utils.py | 110 ++++++++++------------- tests/tools/dumping/test_process.py | 2 +- 8 files changed, 139 insertions(+), 143 deletions(-) create mode 100644 src/aiida/tools/dumping/logger.py diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 8ee982ad1f..82d4fda8d8 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -867,10 +867,3 @@ def set_log_level(ctx, _param, value): show_default=True, help="Incremental dumping of data to disk. Doesn't require using overwrite to clean previous directories.", ) - -RICH_OPTIONS = OverridableOption( - '--rich-options', - default=None, - type=str, - help='Specifications for rich data dumping.', -) diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py index 8a89e464d2..a2e2c379e8 100644 --- a/src/aiida/tools/dumping/base.py +++ b/src/aiida/tools/dumping/base.py @@ -14,12 +14,12 @@ class BaseDumper: def __init__( self, - dump_parent_path: Path = Path.cwd(), + dump_parent_path: Path | None = None, overwrite: bool = False, incremental: bool = True, last_dump_time: datetime | None = None, ): - self.dump_parent_path = dump_parent_path + self.dump_parent_path = dump_parent_path or Path.cwd() self.overwrite = overwrite self.incremental = incremental self.last_dump_time = last_dump_time diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index ee7c64f136..38bf25c380 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -11,15 +11,16 @@ from __future__ import annotations import itertools as it -import logging import os from pathlib import Path from aiida import orm +from aiida.common.log import AIIDA_LOGGER from aiida.tools.dumping.base import BaseDumper +from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper -logger = logging.getLogger(__name__) +logger = AIIDA_LOGGER.getChild('tools.dumping') DEFAULT_PROCESSES_TO_DUMP = [orm.CalculationNode, orm.WorkflowNode] # DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] @@ -32,31 +33,28 @@ def __init__( self, base_dumper: BaseDumper | None = None, process_dumper: ProcessDumper | None = None, + dump_logger: DumpLogger | None = None, group: orm.Group | str | None = None, deduplicate: bool = True, - output_path: str | Path | None = None, - 
global_log_dict: dict[str, Path] | None = None, + output_path: Path | str | None = None, ): self.deduplicate = deduplicate # Allow passing of group via label if isinstance(group, str): - group = orm.Group.get(group) + group = orm.load_group(group) self.group = group - self.output_path = output_path - self.global_log_dict = global_log_dict - if base_dumper is None: - base_dumper = BaseDumper() - self.base_dumper: BaseDumper = base_dumper + self.base_dumper = base_dumper or BaseDumper() + self.process_dumper = process_dumper or ProcessDumper() + self.dump_logger = dump_logger or DumpLogger() - if process_dumper is None: - process_dumper = ProcessDumper() - self.process_dumper: ProcessDumper = process_dumper + # Properly set the `output_path` attribute + + self.output_path = Path(output_path or self.base_dumper.dump_parent_path) self.nodes = self._get_nodes() - self.log_dict = {} def _should_dump_processes(self) -> bool: return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 @@ -68,21 +66,23 @@ def _get_nodes(self): # Get all nodes that are _not_ in any group else: - groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + groups: list[orm.Group] = orm.QueryBuilder().append(orm.Group).all(flat=True) # type: ignore[assignment] nodes_in_groups = [node.uuid for group in groups for node in group.nodes] + # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice - sub_nodes_in_groups = list( - it.chain( - *[ - orm.load_node(node).called_descendants - for node in nodes_in_groups - if isinstance(orm.load_node(node), orm.WorkflowNode) - ] - ) + # Get the called descendants of WorkflowNodes within the nodes_in_groups list + called_descendants_generator = ( + orm.load_node(node).called_descendants + for node in nodes_in_groups + if isinstance(orm.load_node(node), orm.WorkflowNode) ) + + # Flatten the list of called descendants + sub_nodes_in_groups = list(it.chain(*called_descendants_generator)) + sub_nodes_in_groups = [node.uuid for node in sub_nodes_in_groups] - nodes_in_groups = nodes_in_groups + sub_nodes_in_groups + nodes_in_groups += sub_nodes_in_groups profile_nodes = orm.QueryBuilder().append(orm.Node, project=['uuid']).all(flat=True) nodes = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] @@ -114,11 +114,9 @@ def _get_processes(self): self.calculations = calculations self.workflows = workflows - self.log_dict = { - 'calculations': {}, - # dict.fromkeys([c.uuid for c in self.calculations], None), - 'workflows': dict.fromkeys([w.uuid for w in workflows], None), - } + def dump(self): + self.output_path.mkdir(exist_ok=True, parents=True) + self._dump_processes() def _dump_processes(self): self._get_processes() @@ -127,13 +125,12 @@ def _dump_processes(self): logger.report('No workflows or calculations to dump in group.') return - self.output_path.mkdir(exist_ok=True, parents=True) - self._dump_calculations() self._dump_workflows() def _dump_calculations(self): calculations_path = self.output_path / 'calculations' + dumped_calculations = {} for calculation in self.calculations: calculation_dumper = self.process_dumper @@ -146,12 +143,15 @@ def _dump_calculations(self): # or (calculation.caller is not None and not self.deduplicate): calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - self.log_dict['calculations'][calculation.uuid] = calculation_dump_path + 
dumped_calculations[calculation.uuid] = calculation_dump_path + + self.dump_logger.update_calculations(dumped_calculations) def _dump_workflows(self): # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) workflow_path = self.output_path / 'workflows' workflow_path.mkdir(exist_ok=True, parents=True) + dumped_workflows = {} for workflow in self.workflows: workflow_dumper = self.process_dumper @@ -160,9 +160,11 @@ def _dump_workflows(self): process_node=workflow, prefix=None ) - if self.deduplicate and workflow.uuid in self.global_log_dict['workflows'].keys(): + logged_workflows = self.dump_logger.get_logs()['workflows'] + + if self.deduplicate and workflow.uuid in logged_workflows.keys(): os.symlink( - src=self.global_log_dict['workflows'][workflow.uuid], + src=logged_workflows[workflow.uuid], dst=workflow_dump_path, ) else: @@ -173,4 +175,6 @@ def _dump_workflows(self): # link_calculations_dir=self.output_path / 'calculations', ) - self.log_dict['workflows'][workflow.uuid] = workflow_dump_path + dumped_workflows[workflow.uuid] = workflow_dump_path + + self.dump_logger.update_workflows(dumped_workflows) diff --git a/src/aiida/tools/dumping/logger.py b/src/aiida/tools/dumping/logger.py new file mode 100644 index 0000000000..eecf611911 --- /dev/null +++ b/src/aiida/tools/dumping/logger.py @@ -0,0 +1,18 @@ +from pathlib import Path + + +class DumpLogger: + def __init__(self): + self.log_dict: dict[str, dict[str, Path]] = {'calculations': {}, 'workflows': {}} + + def update_calculations(self, new_calculations: dict[str, Path]): + """Update the log with new calculations.""" + self.log_dict['calculations'].update(new_calculations) + + def update_workflows(self, new_workflows: dict[str, Path]): + """Update the log with new workflows.""" + self.log_dict['workflows'].update(new_workflows) + + def get_logs(self): + """Retrieve the current state of the log.""" + return self.log_dict diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index 2ed2aa894b..f65da5a15e 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -59,9 +59,7 @@ def __init__( self.flat = flat self.dump_unsealed = dump_unsealed - if base_dumper is None: - base_dumper = BaseDumper() - self.base_dumper: BaseDumper = base_dumper + self.base_dumper = base_dumper or BaseDumper() @staticmethod def _generate_default_dump_path( diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 282bad1372..2b2d5294c1 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -11,25 +11,25 @@ from __future__ import annotations -import logging - -from rich.pretty import pprint - from aiida import orm +from aiida.common.log import AIIDA_LOGGER +from aiida.manage import get_manager, load_profile from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.group import GroupDumper +from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper -logger = logging.getLogger(__name__) +logger = AIIDA_LOGGER.getChild('tools.dumping') class ProfileDumper: def __init__( self, - profile: str | Profile, + profile: str | Profile | None = None, base_dumper: BaseDumper | None = None, process_dumper: ProcessDumper | None = None, + dump_logger: DumpLogger | None = None, organize_by_groups: bool = True, deduplicate: bool = True, groups: list[str | orm.Group] | None = None, @@ 
-37,27 +37,37 @@ def __init__( ): self.organize_by_groups = organize_by_groups self.deduplicate = deduplicate - self.profile = profile self.dump_processes = dump_processes self.groups = groups - if base_dumper is None: - base_dumper = BaseDumper() - self.base_dumper: BaseDumper = base_dumper + self.base_dumper = base_dumper or BaseDumper() + self.process_dumper = process_dumper or ProcessDumper() + self.dump_logger = dump_logger or DumpLogger() - if process_dumper is None: - process_dumper = ProcessDumper() - self.process_dumper: ProcessDumper = process_dumper + # Load the profile + if isinstance(profile, str): + profile = load_profile(profile) - # self.log_dict: dict[dict[str, Path]] = {} - self.log_dict = {'calculations': {}, 'workflows': {}} + if profile is None: + manager = get_manager() + profile = manager.get_profile() + + assert profile is not None + self.profile = profile def dump(self): + # No groups selected, dump data which is not part of any group + # If groups selected, however, this data should not also be dumped automatically if not self.groups: self._dump_processes_not_in_any_group() - self.groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - self._dump_processes_per_group() + # Still, even without selecting groups, by default, all profile data should be dumped + # Thus, we obtain all groups in the profile here + profile_groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + self._dump_processes_per_group(groups=profile_groups) + + else: + self._dump_processes_per_group(groups=self.groups) def _dump_processes_not_in_any_group(self): # === Dump the data that is not associated with any group === @@ -71,21 +81,19 @@ def _dump_processes_not_in_any_group(self): process_dumper=self.process_dumper, group=None, deduplicate=self.deduplicate, + dump_logger=self.dump_logger, output_path=output_path, - global_log_dict=self.log_dict, ) if self.dump_processes and no_group_dumper._should_dump_processes(): logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') - no_group_dumper._dump_processes() + no_group_dumper.dump() - self.log_dict.update(no_group_dumper.log_dict) - - def _dump_processes_per_group(self): + def _dump_processes_per_group(self, groups): # === Dump data per-group if Groups exist in profile or are selected === - for group in self.groups: + for group in groups: if self.organize_by_groups: output_path = self.base_dumper.dump_parent_path / group.label else: @@ -94,18 +102,13 @@ def _dump_processes_per_group(self): group_dumper = GroupDumper( base_dumper=self.base_dumper, process_dumper=self.process_dumper, + dump_logger=self.dump_logger, group=group, deduplicate=self.deduplicate, output_path=output_path, - global_log_dict=self.log_dict, ) if self.dump_processes and group_dumper._should_dump_processes(): logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') - group_dumper._dump_processes() - for entity in ['calculations', 'workflows']: - self.log_dict[entity].update(group_dumper.log_dict[entity]) - - pprint(group_dumper.log_dict) - pprint(self.log_dict) + group_dumper.dump() diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index c4c1ac0fc1..438c8a7c6b 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -10,16 +10,14 @@ from __future__ import annotations -import logging import shutil from pathlib import Path -from rich.console import Console -from rich.table import Table +from aiida.common.log import AIIDA_LOGGER 
__all__ = ['prepare_dump_path'] -logger = logging.getLogger(__name__) +logger = AIIDA_LOGGER.getChild('tools.dumping') def prepare_dump_path( @@ -41,10 +39,12 @@ def prepare_dump_path( :raises FileNotFoundError: If no `safeguard_file` is found.""" if overwrite and incremental: - raise ValueError('Both overwrite and incremental set to True. Only specify one.') + msg = 'Both overwrite and incremental set to True. Only specify one.' + raise ValueError(msg) if path_to_validate.is_file(): - raise FileExistsError(f'A file at the given path `{path_to_validate}` already exists.') + msg = f'A file at the given path `{path_to_validate}` already exists.' + raise FileExistsError(msg) # Handle existing directory if path_to_validate.is_dir(): @@ -53,89 +53,69 @@ def prepare_dump_path( # Case 1: Non-empty directory and overwrite is False if not is_empty and not overwrite: if incremental: - logger.info('Incremental dumping selected. Will keep directory.') + msg = f'Incremental dumping selected. Will update directory `{path_to_validate}` with new data.' + logger.report(msg) else: - raise FileExistsError( - f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' - ) + msg = f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' + raise FileExistsError(msg) # Case 2: Non-empty directory, overwrite is True if not is_empty and overwrite: safeguard_exists = (path_to_validate / safeguard_file).is_file() if safeguard_exists: - logger.info(f'Overwriting directory `{path_to_validate}`.') + msg = f'Overwriting directory `{path_to_validate}`.' + logger.report(msg) shutil.rmtree(path_to_validate) else: - raise FileNotFoundError( - f'Path `{path_to_validate}` exists without safeguard file ' - f'`{safeguard_file}`. Not removing because path might be a directory not created by AiiDA.' + msg = ( + f'Path `{path_to_validate}` exists without safeguard file `{safeguard_file}`. ' + f'Not removing because path might be a directory not created by AiiDA.' 
) + raise FileNotFoundError(msg) # Create directory if it doesn't exist or was removed path_to_validate.mkdir(exist_ok=True, parents=True) (path_to_validate / safeguard_file).touch() -def get_nodes_from_db(qb_instance, qb_filters: t.List | None = None, flat=False): - # Computers cannot be associated via `with_group` - # for qb_filter in qb_filters: - # qb.add_filter(**qb_filter) - - return_iterable = qb_instance.iterall() if qb_instance.count() > 10 ^ 3 else qb_instance.all() - - # Manual flattening as `iterall` doesn't have `flat` option unlike `all` - if flat: - return_iterable = [_[0] for _ in return_iterable] - - return return_iterable - +# @staticmethod +# def dumper_pretty_print(dumper_instance, include_private_and_dunder: bool = False): +# console = Console() +# table = Table(title=f'Attributes and Methods of {dumper_instance.__class__.__name__}') -# def validate_rich_options(rich_options, rich_config_file): -# if rich_options is not None and rich_config_file is not None: -# raise ValueError('Specify rich options either via CLI or config file, not both.') +# # Adding columns to the table +# table.add_column('Name', justify='left') +# table.add_column('Type', justify='left') +# table.add_column('Value', justify='left') -# else: -# logger.report('Neither `--rich-options` nor `--rich-config` set, using defaults.') +# # Lists to store attributes and methods +# entries = [] +# # Iterate over the class attributes and methods +# for attr_name in dir(dumper_instance): +# # Exclude private attributes and dunder methods +# attr_value = getattr(dumper_instance, attr_name) +# entry_type = 'Attribute' if not callable(attr_value) else 'Method' -@staticmethod -def dumper_pretty_print(dumper_instance, include_private_and_dunder: bool = False): - console = Console() - table = Table(title=f'Attributes and Methods of {dumper_instance.__class__.__name__}') - - # Adding columns to the table - table.add_column('Name', justify='left') - table.add_column('Type', justify='left') - table.add_column('Value', justify='left') - - # Lists to store attributes and methods - entries = [] - - # Iterate over the class attributes and methods - for attr_name in dir(dumper_instance): - # Exclude private attributes and dunder methods - attr_value = getattr(dumper_instance, attr_name) - entry_type = 'Attribute' if not callable(attr_value) else 'Method' - - if attr_name.startswith('_'): - if include_private_and_dunder: - entries.append((attr_name, entry_type, str(attr_value))) - else: - pass - else: - entries.append((attr_name, entry_type, str(attr_value))) +# if attr_name.startswith('_'): +# if include_private_and_dunder: +# entries.append((attr_name, entry_type, str(attr_value))) +# else: +# pass +# else: +# entries.append((attr_name, entry_type, str(attr_value))) - # Sort entries: attributes first, then methods - entries.sort(key=lambda x: (x[1] == 'Method', x[0])) +# # Sort entries: attributes first, then methods +# entries.sort(key=lambda x: (x[1] == 'Method', x[0])) - # Add sorted entries to the table - for name, entry_type, value in entries: - table.add_row(name, entry_type, value) +# # Add sorted entries to the table +# for name, entry_type, value in entries: +# table.add_row(name, entry_type, value) - # Print the formatted table - console.print(table) +# # Print the formatted table +# console.print(table) # def check_storage_size_user(): diff --git a/tests/tools/dumping/test_process.py b/tests/tools/dumping/test_process.py index 47d39ba75e..683e3c4707 100644 --- a/tests/tools/dumping/test_process.py +++ 
b/tests/tools/dumping/test_process.py @@ -6,7 +6,7 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -"""Tests for the dumping of ProcessNode data to disk.""" +"""Tests for the dumping of process data to disk.""" from __future__ import annotations From 2dfe2ca6930cf5a1e5b478be7c60de306c5181b7 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 28 Jan 2025 17:24:45 +0100 Subject: [PATCH 11/27] Start to work on group testing --- src/aiida/tools/dumping/group.py | 14 ++---- tests/tools/dumping/__init__.py | 0 tests/tools/dumping/test_group.py | 75 +++++++++++++++++++++++++++++ tests/tools/dumping/test_profile.py | 9 ++++ 4 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 tests/tools/dumping/__init__.py create mode 100644 tests/tools/dumping/test_group.py create mode 100644 tests/tools/dumping/test_profile.py diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index 38bf25c380..86d63e69b0 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -22,12 +22,6 @@ logger = AIIDA_LOGGER.getChild('tools.dumping') -DEFAULT_PROCESSES_TO_DUMP = [orm.CalculationNode, orm.WorkflowNode] -# DEFAULT_DATA_TO_DUMP = [orm.StructureData, orm.Code, orm.Computer, orm.BandsData, orm.UpfData] -# DEFAULT_COLLECTIONS_TO_DUMP ?? -DEFAULT_ENTITIES_TO_DUMP = DEFAULT_PROCESSES_TO_DUMP # + DEFAULT_DATA_TO_DUMP - - class GroupDumper: def __init__( self, @@ -114,10 +108,6 @@ def _get_processes(self): self.calculations = calculations self.workflows = workflows - def dump(self): - self.output_path.mkdir(exist_ok=True, parents=True) - self._dump_processes() - def _dump_processes(self): self._get_processes() @@ -178,3 +168,7 @@ def _dump_workflows(self): dumped_workflows[workflow.uuid] = workflow_dump_path self.dump_logger.update_workflows(dumped_workflows) + + def dump(self): + self.output_path.mkdir(exist_ok=True, parents=True) + self._dump_processes() diff --git a/tests/tools/dumping/__init__.py b/tests/tools/dumping/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/tools/dumping/test_group.py b/tests/tools/dumping/test_group.py new file mode 100644 index 0000000000..b008fc2293 --- /dev/null +++ b/tests/tools/dumping/test_group.py @@ -0,0 +1,75 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for the dumping of group data to disk.""" + +# TODO: Test that de-duplication also works for calculations + +import pytest +from pathlib import Path +from aiida import orm + + +@pytest.mark.usefixtures('aiida_profile_clean') +@pytest.fixture(scope='session', autouse=True) +def setup_profile_groups(generate_calculation_node_add, generate_workchain_multiply_add): + # Create nodes for profile storage + int_node = orm.Int(1).store() + _ = generate_calculation_node_add() + _ = generate_workchain_multiply_add() + cj_node = generate_calculation_node_add() + wc_node = generate_workchain_multiply_add() + + # Create the various groups + add_group = orm.Group.collection.get_or_create(label='add')[0] + multiply_add_group = orm.Group.collection.get_or_create(label='multiply-add')[0] + cj_dupl_group = orm.Group.collection.get_or_create(label='cj-dupl')[0] + wc_dupl_group = orm.Group.collection.get_or_create(label='wc-dupl')[0] + no_process_group = orm.Group.collection.get_or_create(label='add')[0] + + # Populate groups + add_group.add_nodes([cj_node]) + multiply_add_group.add_nodes([wc_node]) + cj_dupl_group.add_nodes([cj_node]) + wc_dupl_group.add_nodes([wc_node]) + no_process_group.add_nodes([int_node]) + + # Not sure if this is actually needed? + return { + 'add_group': add_group, + 'multiply_add_group': multiply_add_group, + 'cj_dupl_group': cj_dupl_group, + 'wc_dupl_group': wc_dupl_group, + 'no_process_group': no_process_group, + } + + +class TestGroupDumper: + + def test_should_dump_processes(self): + print(orm.QueryBuilder().append(orm.Group).all(flat=True)) + assert False + # pass + + def test_get_nodes(self): + pass + + def test_get_processes(self): + pass + + def test_dump_processes(self): + pass + + def test_dump_calculations(self): + pass + + def test_dump_workflows(self): + pass + + def test_dump(self): + pass diff --git a/tests/tools/dumping/test_profile.py b/tests/tools/dumping/test_profile.py new file mode 100644 index 0000000000..574688d8ae --- /dev/null +++ b/tests/tools/dumping/test_profile.py @@ -0,0 +1,9 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for the dumping of profile data to disk.""" From 492f87f7ba1b1ec057395001033ba33148d21576 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:27:12 +0000 Subject: [PATCH 12/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/tools/dumping/group.py | 1 + tests/tools/dumping/test_group.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/group.py index 86d63e69b0..6350d413ae 100644 --- a/src/aiida/tools/dumping/group.py +++ b/src/aiida/tools/dumping/group.py @@ -22,6 +22,7 @@ logger = AIIDA_LOGGER.getChild('tools.dumping') + class GroupDumper: def __init__( self, diff --git a/tests/tools/dumping/test_group.py b/tests/tools/dumping/test_group.py index b008fc2293..fa07a879eb 100644 --- a/tests/tools/dumping/test_group.py +++ b/tests/tools/dumping/test_group.py @@ -10,8 +10,9 @@ # TODO: Test that de-duplication also works for calculations + import pytest -from pathlib import Path + from aiida import orm @@ -50,7 +51,6 @@ def setup_profile_groups(generate_calculation_node_add, generate_workchain_multi class TestGroupDumper: - def test_should_dump_processes(self): print(orm.QueryBuilder().append(orm.Group).all(flat=True)) assert False From 3887130a2631d5de2d4c399e589339e24c4c04e6 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 28 Jan 2025 16:39:53 +0100 Subject: [PATCH 13/27] Add ArithmeticAdd CJ Node fixture without `run` --- tests/conftest.py | 153 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 5aa0ef3b89..251b354462 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -852,6 +852,159 @@ def _generate_calculation_node_add(): return _generate_calculation_node_add +@pytest.fixture(scope='class') +def construct_calculation_node_add(tmp_path_factory): + def _construct_calculation_node_add(x: int = 1, y: int = 2): + import textwrap + from aiida.orm import InstalledCode, Int, CalcJobNode, Computer, FolderData + from aiida.common import LinkType + import json + from _pytest.tmpdir import tmp_path_factory + + # Create a minimal computer + # Not using any of the `aiida_localhost` or `aiida_computer_local` fixtures as they are function-scoped + created, computer = Computer.collection.get_or_create( + label='mock_computer', + hostname='localhost', + transport_type='core.local', + scheduler_type='core.direct' + ) + if created: + computer.store() + + # Create the calculation node + calc_node = CalcJobNode(computer=computer) + + # Create input nodes + x_node = Int(x) + y_node = Int(y) + code_node = InstalledCode(computer=computer, filepath_executable='/bin/bash') + + # Store input nodes + x_node.store() + y_node.store() + code_node.store() + + # Input files + input_content = f'echo $(({x} + {y}))\n' + calc_node.base.repository.put_object_from_bytes(input_content.encode(), 'aiida.in') + + # .aiida folder content + calcinfo_dict = { + "codes_info": [ + { + "stdin_name": "aiida.in", + "stdout_name": "aiida.out", + "code_uuid": code_node.uuid + } + ], + "retrieve_list": [ + "aiida.out", + 
"_scheduler-stdout.txt", + "_scheduler-stderr.txt" + ], + "uuid": calc_node.uuid, + "file_copy_operation_order": [2, 0, 1] + } + + job_tmpl_dict = { + "submit_as_hold": False, + "rerunnable": False, + "job_name": "aiida-42", + "sched_output_path": "_scheduler-stdout.txt", + "shebang": "#!/bin/bash", + "sched_error_path": "_scheduler-stderr.txt", + "sched_join_files": False, + "prepend_text": "", + "append_text": "", + "job_resource": { + "num_machines": 1, + "num_mpiprocs_per_machine": 1, + "num_cores_per_machine": None, + "num_cores_per_mpiproc": None, + "tot_num_mpiprocs": 1 + }, + "codes_info": [ + { + "prepend_cmdline_params": [], + "cmdline_params": ["/usr/bin/bash"], + "use_double_quotes": [False, False], + "wrap_cmdline_params": False, + "stdin_name": "aiida.in", + "stdout_name": "aiida.out", + "stderr_name": None, + "join_files": False + } + ], + "codes_run_mode": 0, + "import_sys_environment": True, + "job_environment": {}, + "environment_variables_double_quotes": False, + "max_memory_kb": None, + 'max_wallclock_seconds': 3600, + } + + calc_node.base.repository.put_object_from_bytes( + json.dumps(calcinfo_dict, indent=4).encode(), + '.aiida/calcinfo.json' + ) + calc_node.base.repository.put_object_from_bytes( + json.dumps(job_tmpl_dict, indent=4).encode(), + '.aiida/job_tmpl.json' + ) + + # Submit script + submit_script = textwrap.dedent("""\ + #!/bin/bash + exec > _scheduler-stdout.txt + exec 2> _scheduler-stderr.txt + + '/usr/bin/bash' < 'aiida.in' > 'aiida.out' + """) + + calc_node.base.repository.put_object_from_bytes(submit_script.encode(), '_aiidasubmit.sh') + + # Store CalcInfo in node attributes + calc_node.base.attributes.set('input_filename', 'aiida.in') + calc_node.base.attributes.set('output_filename', 'aiida.out') + + # Add input links + calc_node.base.links.add_incoming(x_node, link_type=LinkType.INPUT_CALC, link_label='x') + calc_node.base.links.add_incoming(y_node, link_type=LinkType.INPUT_CALC, link_label='y') + calc_node.base.links.add_incoming(code_node, link_type=LinkType.INPUT_CALC, link_label='code') + + # Must store CalcjobNode before I can add output files + calc_node.store() + + + # Create FolderData node for retrieved + retrieved_folder = FolderData() + output_content = f'{x+y}\n'.encode() + retrieved_folder.put_object_from_bytes(output_content, 'aiida.out') + + scheduler_stdout = '\n'.encode() + scheduler_stderr = '\n'.encode() + retrieved_folder.base.repository.put_object_from_bytes(scheduler_stdout, '_scheduler-stdout.txt') + retrieved_folder.base.repository.put_object_from_bytes(scheduler_stderr, '_scheduler-stderr.txt') + retrieved_folder.store() + + retrieved_folder.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='retrieved') + + # Create and link output node (sum) + output_node = Int(x+y) + output_node.store() + output_node.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='sum') + + # Set process properties + calc_node.set_process_state('finished') + calc_node.set_process_label('ArithmeticAddCalculation') + calc_node.set_process_type('aiida.calculations:core.arithmetic.add') + calc_node.set_exit_status(0) + + return calc_node + + return _construct_calculation_node_add + @pytest.fixture def generate_workchain_multiply_add(aiida_localhost): def _generate_workchain_multiply_add(): From f452ab2b70f92c17af8c6f4e04d23f95853bb0ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:28:18 +0000 Subject: [PATCH 14/27] 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/conftest.py | 97 +++++++++++++------------------ tests/tools/dumping/test_group.py | 1 - 2 files changed, 41 insertions(+), 57 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 251b354462..2dfad75dfe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -855,19 +855,16 @@ def _generate_calculation_node_add(): @pytest.fixture(scope='class') def construct_calculation_node_add(tmp_path_factory): def _construct_calculation_node_add(x: int = 1, y: int = 2): + import json import textwrap - from aiida.orm import InstalledCode, Int, CalcJobNode, Computer, FolderData + from aiida.common import LinkType - import json - from _pytest.tmpdir import tmp_path_factory + from aiida.orm import CalcJobNode, Computer, FolderData, InstalledCode, Int # Create a minimal computer # Not using any of the `aiida_localhost` or `aiida_computer_local` fixtures as they are function-scoped created, computer = Computer.collection.get_or_create( - label='mock_computer', - hostname='localhost', - transport_type='core.local', - scheduler_type='core.direct' + label='mock_computer', hostname='localhost', transport_type='core.local', scheduler_type='core.direct' ) if created: computer.store() @@ -891,66 +888,54 @@ def _construct_calculation_node_add(x: int = 1, y: int = 2): # .aiida folder content calcinfo_dict = { - "codes_info": [ - { - "stdin_name": "aiida.in", - "stdout_name": "aiida.out", - "code_uuid": code_node.uuid - } - ], - "retrieve_list": [ - "aiida.out", - "_scheduler-stdout.txt", - "_scheduler-stderr.txt" - ], - "uuid": calc_node.uuid, - "file_copy_operation_order": [2, 0, 1] + 'codes_info': [{'stdin_name': 'aiida.in', 'stdout_name': 'aiida.out', 'code_uuid': code_node.uuid}], + 'retrieve_list': ['aiida.out', '_scheduler-stdout.txt', '_scheduler-stderr.txt'], + 'uuid': calc_node.uuid, + 'file_copy_operation_order': [2, 0, 1], } job_tmpl_dict = { - "submit_as_hold": False, - "rerunnable": False, - "job_name": "aiida-42", - "sched_output_path": "_scheduler-stdout.txt", - "shebang": "#!/bin/bash", - "sched_error_path": "_scheduler-stderr.txt", - "sched_join_files": False, - "prepend_text": "", - "append_text": "", - "job_resource": { - "num_machines": 1, - "num_mpiprocs_per_machine": 1, - "num_cores_per_machine": None, - "num_cores_per_mpiproc": None, - "tot_num_mpiprocs": 1 + 'submit_as_hold': False, + 'rerunnable': False, + 'job_name': 'aiida-42', + 'sched_output_path': '_scheduler-stdout.txt', + 'shebang': '#!/bin/bash', + 'sched_error_path': '_scheduler-stderr.txt', + 'sched_join_files': False, + 'prepend_text': '', + 'append_text': '', + 'job_resource': { + 'num_machines': 1, + 'num_mpiprocs_per_machine': 1, + 'num_cores_per_machine': None, + 'num_cores_per_mpiproc': None, + 'tot_num_mpiprocs': 1, }, - "codes_info": [ + 'codes_info': [ { - "prepend_cmdline_params": [], - "cmdline_params": ["/usr/bin/bash"], - "use_double_quotes": [False, False], - "wrap_cmdline_params": False, - "stdin_name": "aiida.in", - "stdout_name": "aiida.out", - "stderr_name": None, - "join_files": False + 'prepend_cmdline_params': [], + 'cmdline_params': ['/usr/bin/bash'], + 'use_double_quotes': [False, False], + 'wrap_cmdline_params': False, + 'stdin_name': 'aiida.in', + 'stdout_name': 'aiida.out', + 'stderr_name': None, + 'join_files': False, } ], - "codes_run_mode": 0, - "import_sys_environment": True, - "job_environment": {}, - "environment_variables_double_quotes": False, - "max_memory_kb": None, + 
'codes_run_mode': 0, + 'import_sys_environment': True, + 'job_environment': {}, + 'environment_variables_double_quotes': False, + 'max_memory_kb': None, 'max_wallclock_seconds': 3600, } calc_node.base.repository.put_object_from_bytes( - json.dumps(calcinfo_dict, indent=4).encode(), - '.aiida/calcinfo.json' + json.dumps(calcinfo_dict, indent=4).encode(), '.aiida/calcinfo.json' ) calc_node.base.repository.put_object_from_bytes( - json.dumps(job_tmpl_dict, indent=4).encode(), - '.aiida/job_tmpl.json' + json.dumps(job_tmpl_dict, indent=4).encode(), '.aiida/job_tmpl.json' ) # Submit script @@ -976,7 +961,6 @@ def _construct_calculation_node_add(x: int = 1, y: int = 2): # Must store CalcjobNode before I can add output files calc_node.store() - # Create FolderData node for retrieved retrieved_folder = FolderData() output_content = f'{x+y}\n'.encode() @@ -991,7 +975,7 @@ def _construct_calculation_node_add(x: int = 1, y: int = 2): retrieved_folder.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='retrieved') # Create and link output node (sum) - output_node = Int(x+y) + output_node = Int(x + y) output_node.store() output_node.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='sum') @@ -1005,6 +989,7 @@ def _construct_calculation_node_add(x: int = 1, y: int = 2): return _construct_calculation_node_add + @pytest.fixture def generate_workchain_multiply_add(aiida_localhost): def _generate_workchain_multiply_add(): diff --git a/tests/tools/dumping/test_group.py b/tests/tools/dumping/test_group.py index fa07a879eb..ed13151f34 100644 --- a/tests/tools/dumping/test_group.py +++ b/tests/tools/dumping/test_group.py @@ -10,7 +10,6 @@ # TODO: Test that de-duplication also works for calculations - import pytest from aiida import orm From abbfaff06c6a0f86aab1877817f799992f80c820 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Fri, 31 Jan 2025 15:56:10 +0100 Subject: [PATCH 15/27] First tests for node collection dumping And back to `CollectionDumper` --- src/aiida/tools/dumping/__init__.py | 4 +- .../tools/dumping/{group.py => collection.py} | 61 +++----- src/aiida/tools/dumping/profile.py | 43 +++++- src/aiida/tools/dumping/utils.py | 68 ++------- tests/tools/dumping/test_collection.py | 138 ++++++++++++++++++ tests/tools/dumping/test_group.py | 74 ---------- 6 files changed, 208 insertions(+), 180 deletions(-) rename src/aiida/tools/dumping/{group.py => collection.py} (71%) create mode 100644 tests/tools/dumping/test_collection.py delete mode 100644 tests/tools/dumping/test_group.py diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index 48b73eee65..6bc7b9c2c0 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -9,10 +9,10 @@ """Modules related to the dumping of AiiDA data.""" from .base import BaseDumper -from .group import GroupDumper +from .collection import CollectionDumper from .process import ProcessDumper from .profile import ProfileDumper # from .collection import CollectionDumper -__all__ = ('BaseDumper', 'GroupDumper', 'ProcessDumper', 'ProfileDumper') # , 'CollectionDumper') +__all__ = ('BaseDumper', 'CollectionDumper', 'ProcessDumper', 'ProfileDumper') # , 'CollectionDumper') diff --git a/src/aiida/tools/dumping/group.py b/src/aiida/tools/dumping/collection.py similarity index 71% rename from src/aiida/tools/dumping/group.py rename to src/aiida/tools/dumping/collection.py index 6350d413ae..89428b4e18 100644 --- a/src/aiida/tools/dumping/group.py +++ 
b/src/aiida/tools/dumping/collection.py @@ -10,8 +10,8 @@ from __future__ import annotations -import itertools as it import os +from functools import cached_property from pathlib import Path from aiida import orm @@ -19,27 +19,31 @@ from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper +from aiida.tools.dumping.utils import filter_by_last_dump_time logger = AIIDA_LOGGER.getChild('tools.dumping') -class GroupDumper: +class CollectionDumper: def __init__( self, base_dumper: BaseDumper | None = None, process_dumper: ProcessDumper | None = None, dump_logger: DumpLogger | None = None, - group: orm.Group | str | None = None, + collection: orm.Group | str | list[str] | None = None, deduplicate: bool = True, output_path: Path | str | None = None, ): self.deduplicate = deduplicate - # Allow passing of group via label - if isinstance(group, str): - group = orm.load_group(group) + # Pass the collection type. Could be Group or just list of nodes + if isinstance(collection, str): + try: + collection = orm.load_group(collection) + except: + raise - self.group = group + self.collection = collection self.base_dumper = base_dumper or BaseDumper() self.process_dumper = process_dumper or ProcessDumper() @@ -49,44 +53,19 @@ def __init__( self.output_path = Path(output_path or self.base_dumper.dump_parent_path) - self.nodes = self._get_nodes() - - def _should_dump_processes(self) -> bool: - return len([node for node in self.nodes if isinstance(node, orm.ProcessNode)]) > 0 + @cached_property + def nodes(self): + return self._get_nodes() def _get_nodes(self): - # Get all nodes that are in the group - if self.group is not None: - nodes = list(self.group.nodes) - - # Get all nodes that are _not_ in any group - else: - groups: list[orm.Group] = orm.QueryBuilder().append(orm.Group).all(flat=True) # type: ignore[assignment] - nodes_in_groups = [node.uuid for group in groups for node in group.nodes] - - # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called - # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice - # Get the called descendants of WorkflowNodes within the nodes_in_groups list - called_descendants_generator = ( - orm.load_node(node).called_descendants - for node in nodes_in_groups - if isinstance(orm.load_node(node), orm.WorkflowNode) - ) - - # Flatten the list of called descendants - sub_nodes_in_groups = list(it.chain(*called_descendants_generator)) - - sub_nodes_in_groups = [node.uuid for node in sub_nodes_in_groups] - nodes_in_groups += sub_nodes_in_groups - - profile_nodes = orm.QueryBuilder().append(orm.Node, project=['uuid']).all(flat=True) - nodes = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] - nodes = [orm.load_node(node) for node in nodes] + if isinstance(self.collection, orm.Group): + nodes: list[str] = list(self.collection.nodes) - if self.base_dumper.last_dump_time is not None: - nodes = [node for node in nodes if node.mtime > self.base_dumper.last_dump_time] + return filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) - return nodes + def _should_dump_processes(self, nodes: list[orm.Node] | None = None) -> bool: + test_nodes = nodes or self.nodes + return len([node for node in test_nodes if isinstance(node, orm.ProcessNode)]) > 0 def _get_processes(self): nodes = self.nodes diff --git a/src/aiida/tools/dumping/profile.py 
b/src/aiida/tools/dumping/profile.py index 2b2d5294c1..17b88ecefc 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -11,14 +11,17 @@ from __future__ import annotations +from typing import cast + from aiida import orm from aiida.common.log import AIIDA_LOGGER from aiida.manage import get_manager, load_profile from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper -from aiida.tools.dumping.group import GroupDumper +from aiida.tools.dumping.collection import CollectionDumper from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper +from aiida.tools.dumping.utils import filter_by_last_dump_time logger = AIIDA_LOGGER.getChild('tools.dumping') @@ -76,10 +79,12 @@ def _dump_processes_not_in_any_group(self): else: output_path = self.base_dumper.dump_parent_path - no_group_dumper = GroupDumper( + no_group_nodes = self._get_no_group_nodes() + + no_group_dumper = CollectionDumper( base_dumper=self.base_dumper, process_dumper=self.process_dumper, - group=None, + collection=no_group_nodes, deduplicate=self.deduplicate, dump_logger=self.dump_logger, output_path=output_path, @@ -99,11 +104,11 @@ def _dump_processes_per_group(self, groups): else: output_path = self.base_dumper.dump_parent_path - group_dumper = GroupDumper( + group_dumper = CollectionDumper( base_dumper=self.base_dumper, process_dumper=self.process_dumper, dump_logger=self.dump_logger, - group=group, + collection=group, deduplicate=self.deduplicate, output_path=output_path, ) @@ -112,3 +117,31 @@ def _dump_processes_per_group(self, groups): logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') group_dumper.dump() + + def _get_no_group_nodes(self) -> list[str]: + # Get all nodes that are _not_ in any group + group_qb = orm.QueryBuilder().append(orm.Group) + profile_groups = cast(list[orm.Group], group_qb.all(flat=True)) + node_qb = orm.QueryBuilder().append(orm.Node, project=['uuid']) + profile_nodes = cast(list[str], node_qb.all(flat=True)) + + nodes_in_groups: list[str] = [node.uuid for group in profile_groups for node in group.nodes] + + # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called + # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice + # Get the called descendants of WorkflowNodes within the nodes_in_groups list + + sub_nodes_in_groups: list[str] = [ + node.uuid + for n in nodes_in_groups + if isinstance((workflow_node := orm.load_node(n)), orm.WorkflowNode) + for node in workflow_node.called_descendants + ] + + # sub_nodes_in_groups: list[str] = [node.uuid for node in sub_nodes_in_groups] + nodes_in_groups += sub_nodes_in_groups + + nodes: list[str] = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] + nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) + + return nodes diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index 438c8a7c6b..0758e81770 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -11,8 +11,10 @@ from __future__ import annotations import shutil +from datetime import datetime from pathlib import Path +from aiida import orm from aiida.common.log import AIIDA_LOGGER __all__ = ['prepare_dump_path'] @@ -80,64 +82,6 @@ def prepare_dump_path( (path_to_validate / safeguard_file).touch() -# @staticmethod -# def 
dumper_pretty_print(dumper_instance, include_private_and_dunder: bool = False): -# console = Console() -# table = Table(title=f'Attributes and Methods of {dumper_instance.__class__.__name__}') - -# # Adding columns to the table -# table.add_column('Name', justify='left') -# table.add_column('Type', justify='left') -# table.add_column('Value', justify='left') - -# # Lists to store attributes and methods -# entries = [] - -# # Iterate over the class attributes and methods -# for attr_name in dir(dumper_instance): -# # Exclude private attributes and dunder methods -# attr_value = getattr(dumper_instance, attr_name) -# entry_type = 'Attribute' if not callable(attr_value) else 'Method' - -# if attr_name.startswith('_'): -# if include_private_and_dunder: -# entries.append((attr_name, entry_type, str(attr_value))) -# else: -# pass -# else: -# entries.append((attr_name, entry_type, str(attr_value))) - -# # Sort entries: attributes first, then methods -# entries.sort(key=lambda x: (x[1] == 'Method', x[0])) - -# # Add sorted entries to the table -# for name, entry_type, value in entries: -# table.add_row(name, entry_type, value) - -# # Print the formatted table -# console.print(table) - - -# def check_storage_size_user(): -# from aiida.manage.manager import get_manager - -# manager = get_manager() -# storage = manager.get_profile_storage() - -# data = storage.get_info(detailed=True) -# repository_data = data['repository']['Size (MB)'] -# total_size_gb = sum(repository_data.values()) / 1024 -# if total_size_gb > 10: -# user_input = ( -# input('Repository size larger than 10gb. Do you still want to dump the profile data? (y/N): ') -# .strip() -# .lower() -# ) - -# if user_input != 'y': -# sys.exit() - - def sanitize_file_extension(filename: str | Path): if isinstance(filename, Path): filename = str(filename) @@ -147,3 +91,11 @@ def sanitize_file_extension(filename: str | Path): filename = filename.replace('.mpl_png', '.png') return Path(filename) + + +def filter_by_last_dump_time(nodes: list[str], last_dump_time: datetime | None = None) -> list[str]: + if last_dump_time is not None: + orm_nodes = [orm.load_node(node) for node in nodes] + return [node.uuid for node in orm_nodes if node.mtime > last_dump_time] + else: + return nodes diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py new file mode 100644 index 0000000000..50a2f357ef --- /dev/null +++ b/tests/tools/dumping/test_collection.py @@ -0,0 +1,138 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for the dumping of group data to disk.""" + +# TODO: Test that de-duplication also works for calculations +# TODO: Test incremental dumping + +import pytest + +from aiida import orm +from aiida.tools.dumping import BaseDumper, CollectionDumper, ProcessDumper + +# Fixture that depends on generate_calculation_node_add_class +# @pytest.fixture(scope="class") +# def setup_calculation_node_add_class(generate_calculation_node_add_class): +# # This will make sure the fixture runs and is available for setup_class +# generate_calculation_node_add_class() # You can also do any additional setup here + + +@pytest.fixture() +def setup_no_process_group(): + no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') + if no_process_group.is_empty: + int_node = orm.Int(1).store() + no_process_group.add_nodes([int_node]) + return no_process_group + + +@pytest.fixture() +def setup_add_group(generate_calculation_node_add): + add_group, _ = orm.Group.collection.get_or_create(label='add') + if add_group.is_empty: + add_node = generate_calculation_node_add() + add_group.add_nodes([add_node]) + return add_group + + +@pytest.fixture() +def setup_multiply_add_group(generate_workchain_multiply_add): + multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') + if multiply_add_group.is_empty: + multiply_add_node = generate_workchain_multiply_add() + multiply_add_group.add_nodes([multiply_add_node]) + return multiply_add_group + + +@pytest.fixture() +def multiply_process_groups(): ... + + +@pytest.mark.usefixtures('aiida_profile_clean_class') +class TestCollectionDumper: + def test_should_dump_processes(self, setup_no_process_group, setup_add_group): + """""" + no_process_group: orm.Group = setup_no_process_group + add_group: orm.Group = setup_add_group + + base_dumper = BaseDumper() + process_dumper = ProcessDumper() + + group_dumper = CollectionDumper(base_dumper=base_dumper, process_dumper=process_dumper, collection=no_process_group) + + assert group_dumper._should_dump_processes() is False + + group_dumper = CollectionDumper(base_dumper=base_dumper, process_dumper=process_dumper, collection=add_group) + + assert group_dumper._should_dump_processes() is True + + # def test_get_nodes(self): + # pass + + # def test_get_processes(self): + # pass + + # def test_dump_processes(self): + # pass + + # def test_dump_calculations(self): + # pass + + # def test_dump_workflows(self): + # pass + + # def test_dump(self): + # pass + + +#######3 + +# def test_setup_profile( +# self, +# generate_calculation_node_add, +# generate_workchain_multiply_add, +# generate_calculation_node_io, +# generate_workchain_node_io, +# ): +# # TODO: This is a hack... 
and not actually a real test +# # TODO: I'm using the `aiida_profile_clean_class` fixture to make sure I have a clean profile for this class +# # TODO: However, this method is not an actual test, but sets up the profile data how I want it for testing +# # TODO: Ideally, I'd create a class-scoped fixture that does the setup +# # TODO: Or define a `setup_class` method +# # TODO: However, as most of AiiDA's fixtures are function-scoped, I didn't manage to get any of these approaches +# # TODO: To work, due to pytest's ScopeMismatch exceptions + +# # Create nodes for profile storage +# ## Not in any group +# int_node = orm.Int(1).store() +# _ = generate_calculation_node_add() +# _ = generate_workchain_multiply_add() +# ## For putting into groups +# add_node = generate_calculation_node_add() +# multiply_add_node = generate_workchain_multiply_add() + +# # Create the various groups +# add_group, _ = orm.Group.collection.get_or_create(label='add') +# multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') +# cj_dupl_group, _ = orm.Group.collection.get_or_create(label='cj-dupl') +# wc_dupl_group, _ = orm.Group.collection.get_or_create(label='wc-dupl') +# no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') + +# # Populate groups +# add_group.add_nodes([add_node]) +# multiply_add_group.add_nodes([multiply_add_node]) +# cj_dupl_group.add_nodes([add_node]) +# wc_dupl_group.add_nodes([multiply_add_node]) +# no_process_group.add_nodes([int_node]) + +# self.add_group = add_group +# self.multiply_add_group = multiply_add_group +# self.cj_dupl_group = cj_dupl_group +# self.wc_dupl_group = wc_dupl_group +# self.no_process_group = no_process_group diff --git a/tests/tools/dumping/test_group.py b/tests/tools/dumping/test_group.py deleted file mode 100644 index ed13151f34..0000000000 --- a/tests/tools/dumping/test_group.py +++ /dev/null @@ -1,74 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code.
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Tests for the dumping of group data to disk.""" - -# TODO: Test that de-duplication also works for calculations - -import pytest - -from aiida import orm - - -@pytest.mark.usefixtures('aiida_profile_clean') -@pytest.fixture(scope='session', autouse=True) -def setup_profile_groups(generate_calculation_node_add, generate_workchain_multiply_add): - # Create nodes for profile storage - int_node = orm.Int(1).store() - _ = generate_calculation_node_add() - _ = generate_workchain_multiply_add() - cj_node = generate_calculation_node_add() - wc_node = generate_workchain_multiply_add() - - # Create the various groups - add_group = orm.Group.collection.get_or_create(label='add')[0] - multiply_add_group = orm.Group.collection.get_or_create(label='multiply-add')[0] - cj_dupl_group = orm.Group.collection.get_or_create(label='cj-dupl')[0] - wc_dupl_group = orm.Group.collection.get_or_create(label='wc-dupl')[0] - no_process_group = orm.Group.collection.get_or_create(label='add')[0] - - # Populate groups - add_group.add_nodes([cj_node]) - multiply_add_group.add_nodes([wc_node]) - cj_dupl_group.add_nodes([cj_node]) - wc_dupl_group.add_nodes([wc_node]) - no_process_group.add_nodes([int_node]) - - # Not sure if this is actually needed? - return { - 'add_group': add_group, - 'multiply_add_group': multiply_add_group, - 'cj_dupl_group': cj_dupl_group, - 'wc_dupl_group': wc_dupl_group, - 'no_process_group': no_process_group, - } - - -class TestGroupDumper: - def test_should_dump_processes(self): - print(orm.QueryBuilder().append(orm.Group).all(flat=True)) - assert False - # pass - - def test_get_nodes(self): - pass - - def test_get_processes(self): - pass - - def test_dump_processes(self): - pass - - def test_dump_calculations(self): - pass - - def test_dump_workflows(self): - pass - - def test_dump(self): - pass From e09e0789fdf55af383d1ac871b5c2271ae2a8a32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 Jan 2025 14:56:57 +0000 Subject: [PATCH 16/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tools/dumping/test_collection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index 50a2f357ef..36a570ee62 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -64,7 +64,9 @@ def test_should_dump_processes(self, setup_no_process_group, setup_add_group): base_dumper = BaseDumper() process_dumper = ProcessDumper() - group_dumper = CollectionDumper(base_dumper=base_dumper, process_dumper=process_dumper, collection=no_process_group) + group_dumper = CollectionDumper( + base_dumper=base_dumper, process_dumper=process_dumper, collection=no_process_group + ) assert group_dumper._should_dump_processes() is False From 48acce7a66b9834b9f416dc89f412c563a4760b0 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Wed, 5 Feb 2025 13:20:35 +0100 Subject: [PATCH 17/27] Improve logging and add dry-run feature. 
--- src/aiida/cmdline/commands/cmd_profile.py | 36 ++++++--- src/aiida/tools/dumping/collection.py | 55 ++++++++----- src/aiida/tools/dumping/logger.py | 98 ++++++++++++++++++++--- src/aiida/tools/dumping/profile.py | 12 ++- src/aiida/tools/dumping/utils.py | 5 +- 5 files changed, 161 insertions(+), 45 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 4f6fc99b60..a69acbba3c 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -20,6 +20,7 @@ from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config from aiida.tools.dumping import ProcessDumper, ProfileDumper +from aiida.tools.dumping.logger import DumpLogger @verdi.group('profile') @@ -306,6 +307,7 @@ def profile_mirror( ): """Dump all data in an AiiDA profile's storage to disk.""" + import json from datetime import datetime from pathlib import Path @@ -319,17 +321,6 @@ def profile_mirror( if path is None: path = Path.cwd() / f'{profile.name}-mirror' - # TODO: Implement proper dry-run feature - dry_run_message = f"Dry run for dumping of profile `{profile.name}`'s data at path: `{path}`.\n" - dry_run_message += 'Only directories will be created.' - - if dry_run: - echo.echo_report(dry_run_message) - return - - else: - echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") - SAFEGUARD_FILE: str = '.verdi_profile_mirror' # noqa: N806 safeguard_file_path: Path = path / SAFEGUARD_FILE @@ -349,6 +340,24 @@ def profile_mirror( except IndexError: last_dump_time = None + if dry_run: + node_counts = ProfileDumper._get_number_of_nodes_to_dump(last_dump_time) + node_counts_str = ' & '.join(f'{count} {node_type}' for node_type, count in node_counts.items()) + dry_run_message = f'Dry run for mirroring of profile `{profile.name}`: {node_counts_str} to dump.\n' + echo.echo_report(dry_run_message) + return + + echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") + + if incremental: + msg = 'Incremental dumping selected. Will update directory.' 
+ echo.echo_report(msg) + + try: + dump_logger = DumpLogger.from_file(dump_parent_path=path) + except (json.JSONDecodeError, OSError): + dump_logger = DumpLogger(dump_parent_path=path) + base_dumper = BaseDumper( dump_parent_path=path, overwrite=overwrite, @@ -368,6 +377,7 @@ def profile_mirror( profile_dumper = ProfileDumper( base_dumper=base_dumper, process_dumper=process_dumper, + dump_logger=dump_logger, groups=groups, organize_by_groups=organize_by_groups, deduplicate=deduplicate, @@ -381,3 +391,7 @@ def profile_mirror( last_dump_time = datetime.now().astimezone() with safeguard_file_path.open('a') as fhandle: fhandle.write(f'Last profile mirror time: {last_dump_time.isoformat()}\n') + + dump_logger.save_log() + + echo.echo_report(f'Dumped {dump_logger.counter} new nodes.') diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 89428b4e18..1852bc2245 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -11,13 +11,14 @@ from __future__ import annotations import os +from datetime import datetime from functools import cached_property from pathlib import Path from aiida import orm from aiida.common.log import AIIDA_LOGGER from aiida.tools.dumping.base import BaseDumper -from aiida.tools.dumping.logger import DumpLogger +from aiida.tools.dumping.logger import DumpLog, DumpLogger from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.utils import filter_by_last_dump_time @@ -47,28 +48,38 @@ def __init__( self.base_dumper = base_dumper or BaseDumper() self.process_dumper = process_dumper or ProcessDumper() - self.dump_logger = dump_logger or DumpLogger() + self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) # Properly set the `output_path` attribute self.output_path = Path(output_path or self.base_dumper.dump_parent_path) @cached_property - def nodes(self): + def nodes(self) -> list[str]: return self._get_nodes() - def _get_nodes(self): + def _get_nodes(self) -> list[str]: + nodes: list[str] | None = None if isinstance(self.collection, orm.Group): - nodes: list[str] = list(self.collection.nodes) + nodes = [n.uuid for n in list(self.collection.nodes)] + elif isinstance(self.collection, list) and len(self.collection) > 0: + if all(isinstance(n, str) for n in self.collection): + nodes = self.collection + else: + msg = 'A collection of nodes must be passed via their UUIDs.' 
+ raise TypeError(msg) + else: + nodes = [] - return filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) + filtered_nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) + return filtered_nodes - def _should_dump_processes(self, nodes: list[orm.Node] | None = None) -> bool: + def _should_dump_processes(self, nodes: list[str] | None = None) -> bool: test_nodes = nodes or self.nodes - return len([node for node in test_nodes if isinstance(node, orm.ProcessNode)]) > 0 + return len([node for node in test_nodes if isinstance(orm.load_node(node), orm.ProcessNode)]) > 0 def _get_processes(self): - nodes = self.nodes + nodes = [orm.load_node(n) for n in self.nodes] workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] # Make sure that only top-level workflows are dumped in their own directories when de-duplcation is enabled @@ -99,6 +110,8 @@ def _dump_processes(self): self._dump_workflows() def _dump_calculations(self): + if len(self.calculations) == 0: + return calculations_path = self.output_path / 'calculations' dumped_calculations = {} @@ -106,19 +119,22 @@ def _dump_calculations(self): calculation_dumper = self.process_dumper calculation_dump_path = calculations_path / calculation_dumper._generate_default_dump_path( - process_node=calculation, prefix='' + process_node=calculation, prefix=None ) if calculation.caller is None: - # or (calculation.caller is not None and not self.deduplicate): calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - dumped_calculations[calculation.uuid] = calculation_dump_path + dumped_calculations[calculation.uuid] = DumpLog( + path=calculation_dump_path, + time=datetime.now().astimezone(), + ) self.dump_logger.update_calculations(dumped_calculations) - def _dump_workflows(self): - # workflow_nodes = get_nodes_from_db(aiida_node_type=orm.WorkflowNode, with_group=self.group, flat=True) + def _dump_workflows(self) -> None: + if len(self.workflows) == 0: + return workflow_path = self.output_path / 'workflows' workflow_path.mkdir(exist_ok=True, parents=True) dumped_workflows = {} @@ -130,22 +146,23 @@ def _dump_workflows(self): process_node=workflow, prefix=None ) - logged_workflows = self.dump_logger.get_logs()['workflows'] + logged_workflows = self.dump_logger.get_log()['workflows'] if self.deduplicate and workflow.uuid in logged_workflows.keys(): os.symlink( - src=logged_workflows[workflow.uuid], + src=logged_workflows[workflow.uuid].path, dst=workflow_dump_path, ) else: workflow_dumper._dump_workflow( workflow_node=workflow, output_path=workflow_dump_path, - # link_calculations=not self.deduplicate, - # link_calculations_dir=self.output_path / 'calculations', ) - dumped_workflows[workflow.uuid] = workflow_dump_path + dumped_workflows[workflow.uuid] = DumpLog( + path=workflow_dump_path, + time=datetime.now().astimezone(), + ) self.dump_logger.update_workflows(dumped_workflows) diff --git a/src/aiida/tools/dumping/logger.py b/src/aiida/tools/dumping/logger.py index eecf611911..34fd2170e8 100644 --- a/src/aiida/tools/dumping/logger.py +++ b/src/aiida/tools/dumping/logger.py @@ -1,18 +1,96 @@ +import json +from dataclasses import dataclass +from datetime import datetime from pathlib import Path +from typing import TypeAlias + + +@dataclass +class DumpLog: + """Represents a single dump log entry.""" + + path: Path + time: datetime + + +DumpDict: TypeAlias = dict[str, DumpLog] class DumpLogger: - def __init__(self): - 
self.log_dict: dict[str, dict[str, Path]] = {'calculations': {}, 'workflows': {}} + """Main logger class using dataclasses for better structure.""" - def update_calculations(self, new_calculations: dict[str, Path]): - """Update the log with new calculations.""" - self.log_dict['calculations'].update(new_calculations) + DUMP_FILE: str = '.dump_log.json' - def update_workflows(self, new_workflows: dict[str, Path]): - """Update the log with new workflows.""" - self.log_dict['workflows'].update(new_workflows) + def __init__( + self, + dump_parent_path: Path, + calculations: DumpDict | None = None, + workflows: DumpDict | None = None, + counter: int = 0, + ) -> None: + self.dump_parent_path = dump_parent_path + self.calculations = calculations or {} + self.workflows = workflows or {} + self.counter = 0 - def get_logs(self): + @property + def dump_file(self) -> Path: + """Get the path to the dump file.""" + return self.dump_parent_path / self.DUMP_FILE + + def update_calculations(self, new_calculations: DumpDict) -> None: + """Update the calculations log.""" + self.calculations.update(new_calculations) + self.counter += len(new_calculations) + + def update_workflows(self, new_workflows: DumpDict) -> None: + """Update the workflows log.""" + self.workflows.update(new_workflows) + self.counter += len(new_workflows) + + def get_log(self) -> dict[str, DumpDict]: """Retrieve the current state of the log.""" - return self.log_dict + return {'calculations': self.calculations, 'workflows': self.workflows} + + def save_log(self) -> None: + """Save the log to a JSON file.""" + + def serialize_logs(logs: DumpDict) -> dict: + serialized = {} + for uuid, entry in logs.items(): + serialized[uuid] = {'path': str(entry.path), 'time': entry.time.isoformat()} + return serialized + + log_dict = { + 'calculations': serialize_logs(self.calculations), + 'workflows': serialize_logs(self.workflows), + } + + with self.dump_file.open('w', encoding='utf-8') as f: + json.dump(log_dict, f, indent=4) + + @classmethod + def from_file(cls, dump_parent_path: Path) -> 'DumpLogger': + """Alternative constructor to load from an existing JSON file.""" + instance = cls(dump_parent_path=dump_parent_path) + + if not instance.dump_file.exists(): + return instance + + try: + with instance.dump_file.open('r', encoding='utf-8') as f: + data = json.load(f) + + def deserialize_logs(category_data: dict) -> DumpDict: + deserialized = {} + for uuid, entry in category_data.items(): + deserialized[uuid] = DumpLog(path=Path(entry['path']), time=datetime.fromisoformat(entry['time'])) + return deserialized + + instance.calculations = deserialize_logs(data['calculations']) + instance.workflows = deserialize_logs(data['workflows']) + + except (json.JSONDecodeError, OSError): + raise + + return instance diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 17b88ecefc..a591d50269 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -45,7 +45,7 @@ def __init__( self.base_dumper = base_dumper or BaseDumper() self.process_dumper = process_dumper or ProcessDumper() - self.dump_logger = dump_logger or DumpLogger() + self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) # Load the profile if isinstance(profile, str): @@ -145,3 +145,13 @@ def _get_no_group_nodes(self) -> list[str]: nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) return nodes + + @staticmethod + def 
_get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]: + result = {} + for node_type in (orm.CalculationNode, orm.WorkflowNode): + qb = orm.QueryBuilder().append(node_type, project=['uuid']) + nodes = cast(list[str], qb.all(flat=True)) + nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=last_dump_time) + result[node_type.class_node_type.split('.')[-2] + 's'] = len(nodes) + return result diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index 0758e81770..fc6813e676 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -54,10 +54,7 @@ def prepare_dump_path( # Case 1: Non-empty directory and overwrite is False if not is_empty and not overwrite: - if incremental: - msg = f'Incremental dumping selected. Will update directory `{path_to_validate}` with new data.' - logger.report(msg) - else: + if not incremental: msg = f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' raise FileExistsError(msg) From b2aba2f0d554c8547f0bda50cf53e98b60db79e7 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 6 Feb 2025 10:20:12 +0100 Subject: [PATCH 18/27] BaseDumper dataclass. get_processes return dict. Extend tests. --- src/aiida/cmdline/commands/cmd_profile.py | 13 +-- src/aiida/tools/dumping/base.py | 21 ++-- src/aiida/tools/dumping/collection.py | 67 ++++++------ src/aiida/tools/dumping/logger.py | 4 +- src/aiida/tools/dumping/utils.py | 2 +- tests/tools/dumping/test_collection.py | 120 ++++++++++++++++++---- tests/tools/dumping/test_process.py | 63 ------------ tests/tools/dumping/test_utils.py | 78 ++++++++++++++ 8 files changed, 233 insertions(+), 135 deletions(-) create mode 100644 tests/tools/dumping/test_utils.py diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index a69acbba3c..78c7e62686 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -19,8 +19,6 @@ from aiida.cmdline.utils import defaults, echo from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config -from aiida.tools.dumping import ProcessDumper, ProfileDumper -from aiida.tools.dumping.logger import DumpLogger @verdi.group('profile') @@ -311,7 +309,9 @@ def profile_mirror( from datetime import datetime from pathlib import Path + from aiida.tools.dumping import ProcessDumper, ProfileDumper from aiida.tools.dumping.base import BaseDumper + from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.utils import prepare_dump_path profile = ctx.obj['profile'] @@ -321,6 +321,8 @@ def profile_mirror( if path is None: path = Path.cwd() / f'{profile.name}-mirror' + echo.echo_report(f'Mirroring data of profile `{profile.name}` at path: `{path}`.') + SAFEGUARD_FILE: str = '.verdi_profile_mirror' # noqa: N806 safeguard_file_path: Path = path / SAFEGUARD_FILE @@ -347,10 +349,8 @@ echo.echo_report(dry_run_message) return - echo.echo_report(f"Dumping of profile `{profile.name}`'s data at path: `{path}`.") - if incremental: - msg = 'Incremental dumping selected. Will update directory.' + msg = 'Incremental mirroring selected. Will update directory.'
echo.echo_report(msg) try: @@ -392,6 +392,7 @@ def profile_mirror( with safeguard_file_path.open('a') as fhandle: fhandle.write(f'Last profile mirror time: {last_dump_time.isoformat()}\n') + # Write the logging json file to disk dump_logger.save_log() - echo.echo_report(f'Dumped {dump_logger.counter} new nodes.') + echo.echo_success(f'Dumped {dump_logger.counter} new nodes.') diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py index a2e2c379e8..bbe63c9301 100644 --- a/src/aiida/tools/dumping/base.py +++ b/src/aiida/tools/dumping/base.py @@ -7,19 +7,18 @@ # For further information please visit http://www.aiida.net # ########################################################################### +from dataclasses import dataclass from datetime import datetime from pathlib import Path +@dataclass class BaseDumper: - def __init__( - self, - dump_parent_path: Path | None = None, - overwrite: bool = False, - incremental: bool = True, - last_dump_time: datetime | None = None, - ): - self.dump_parent_path = dump_parent_path or Path.cwd() - self.overwrite = overwrite - self.incremental = incremental - self.last_dump_time = last_dump_time + dump_parent_path: Path | None = None + overwrite: bool = False + incremental: bool = True + last_dump_time: datetime | None = None + + def __post_init__(self): + if self.dump_parent_path is None: + self.dump_parent_path = Path.cwd() diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 1852bc2245..8ea026382c 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -14,6 +14,7 @@ from datetime import datetime from functools import cached_property from pathlib import Path +from typing import TYPE_CHECKING, TypeVar, cast from aiida import orm from aiida.common.log import AIIDA_LOGGER @@ -22,6 +23,12 @@ from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.utils import filter_by_last_dump_time +if TYPE_CHECKING: + from collections.abc import Sequence + +T = TypeVar('T', bound='orm.ProcessNode') + + logger = AIIDA_LOGGER.getChild('tools.dumping') @@ -33,11 +40,11 @@ def __init__( dump_logger: DumpLogger | None = None, collection: orm.Group | str | list[str] | None = None, deduplicate: bool = True, - output_path: Path | str | None = None, + output_path: Path | None = None, ): self.deduplicate = deduplicate - # Pass the collection type. 
Could be Group or just list of nodes + # Collection could be a Group or a list of nodes if isinstance(collection, str): try: collection = orm.load_group(collection) @@ -51,8 +58,10 @@ def __init__( self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) # Properly set the `output_path` attribute - - self.output_path = Path(output_path or self.base_dumper.dump_parent_path) + if output_path is not None: + self.output_path = output_path + else: + self.output_path = Path.cwd() @cached_property def nodes(self) -> list[str]: @@ -61,7 +70,7 @@ def nodes(self) -> list[str]: def _get_nodes(self) -> list[str]: nodes: list[str] | None = None if isinstance(self.collection, orm.Group): - nodes = [n.uuid for n in list(self.collection.nodes)] + nodes = [n.uuid for n in self.collection.nodes] elif isinstance(self.collection, list) and len(self.collection) > 0: if all(isinstance(n, str) for n in self.collection): nodes = self.collection @@ -78,7 +87,7 @@ def _should_dump_processes(self, nodes: list[str] | None = None) -> bool: test_nodes = nodes or self.nodes return len([node for node in test_nodes if isinstance(orm.load_node(node), orm.ProcessNode)]) > 0 - def _get_processes(self): + def _get_processes(self) -> dict[str, Sequence[orm.ProcessNode]]: nodes = [orm.load_node(n) for n in self.nodes] workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] @@ -94,28 +103,20 @@ def _get_processes(self): node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) ] - calculations = set([node for node in nodes if isinstance(node, orm.CalculationNode)] + called_calculations) - - self.calculations = calculations - self.workflows = workflows + calculations = [node for node in nodes if isinstance(node, orm.CalculationNode)] + called_calculations + return { + 'calculations': cast(Sequence[orm.ProcessNode], calculations), + 'workflows': cast(Sequence[orm.ProcessNode], workflows), + } + # return {'calculations': calculations, 'workflows': workflows} - def _dump_processes(self): - self._get_processes() - - if len(self.workflows) + len(self.calculations) == 0: - logger.report('No workflows or calculations to dump in group.') - return - - self._dump_calculations() - self._dump_workflows() - - def _dump_calculations(self): - if len(self.calculations) == 0: + def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> None: + if len(calculations) == 0: return calculations_path = self.output_path / 'calculations' dumped_calculations = {} - for calculation in self.calculations: + for calculation in calculations: calculation_dumper = self.process_dumper calculation_dump_path = calculations_path / calculation_dumper._generate_default_dump_path( @@ -123,7 +124,9 @@ def _dump_calculations(self): ) if calculation.caller is None: - calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) + calculation_dumper._dump_calculation( + calculation_node=cast(orm.CalculationNode, calculation), output_path=calculation_dump_path + ) dumped_calculations[calculation.uuid] = DumpLog( path=calculation_dump_path, @@ -132,14 +135,14 @@ def _dump_calculations(self): self.dump_logger.update_calculations(dumped_calculations) - def _dump_workflows(self) -> None: - if len(self.workflows) == 0: + def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: + if len(workflows) == 0: return workflow_path = self.output_path / 'workflows' workflow_path.mkdir(exist_ok=True, parents=True) dumped_workflows 
= {} - for workflow in self.workflows: + for workflow in workflows: workflow_dumper = self.process_dumper workflow_dump_path = workflow_path / workflow_dumper._generate_default_dump_path( @@ -148,14 +151,14 @@ def _dump_workflows(self) -> None: logged_workflows = self.dump_logger.get_log()['workflows'] - if self.deduplicate and workflow.uuid in logged_workflows.keys(): + if self.deduplicate and workflow in logged_workflows.keys(): os.symlink( src=logged_workflows[workflow.uuid].path, dst=workflow_dump_path, ) else: workflow_dumper._dump_workflow( - workflow_node=workflow, + workflow_node=cast(orm.WorkflowNode, workflow), output_path=workflow_dump_path, ) @@ -166,6 +169,8 @@ def _dump_workflows(self) -> None: self.dump_logger.update_workflows(dumped_workflows) - def dump(self): + def dump(self) -> None: self.output_path.mkdir(exist_ok=True, parents=True) - self._dump_processes() + collection_processes = self._get_processes() + self._dump_calculations(calculations=collection_processes['calculations']) + self._dump_workflows(workflows=collection_processes['workflows']) diff --git a/src/aiida/tools/dumping/logger.py b/src/aiida/tools/dumping/logger.py index 34fd2170e8..7489df0bbd 100644 --- a/src/aiida/tools/dumping/logger.py +++ b/src/aiida/tools/dumping/logger.py @@ -23,12 +23,12 @@ class DumpLogger: def __init__( self, - dump_parent_path: Path, + dump_parent_path: Path | None = None, calculations: DumpDict | None = None, workflows: DumpDict | None = None, counter: int = 0, ) -> None: - self.dump_parent_path = dump_parent_path + self.dump_parent_path = dump_parent_path or Path.cwd() self.calculations = calculations or {} self.workflows = workflows or {} self.counter = 0 diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index fc6813e676..0573fede09 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -63,7 +63,7 @@ def prepare_dump_path( safeguard_exists = (path_to_validate / safeguard_file).is_file() if safeguard_exists: - msg = f'Overwriting directory `{path_to_validate}`.' + msg = '`--overwrite` option selected. Will recreate directory.' 
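The symlink-based de-duplication above relies on the `DumpLogger` knowing which nodes were already written to disk. A usage sketch based on the definitions in this patch series, assuming `DumpLog` carries the `path` and `time` fields used in `collection.py` (the UUID and paths below are placeholders):

    from datetime import datetime
    from pathlib import Path

    from aiida.tools.dumping.logger import DumpLog, DumpLogger

    dump_parent_path = Path('profile-mirror')
    dump_parent_path.mkdir(exist_ok=True)

    dump_logger = DumpLogger(dump_parent_path=dump_parent_path)
    dump_logger.update_calculations(
        {'<calculation-uuid>': DumpLog(path=dump_parent_path / 'calculations' / 'Add-1',
                                       time=datetime.now().astimezone())}
    )
    dump_logger.save_log()  # writes `.dump_log.json` under `dump_parent_path`

    # A later run can restore the log and decide to skip or symlink an already dumped node
    restored = DumpLogger.from_file(dump_parent_path=dump_parent_path)
    assert '<calculation-uuid>' in restored.get_log()['calculations']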
logger.report(msg) shutil.rmtree(path_to_validate) diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index 36a570ee62..0e3f11c130 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -11,10 +11,12 @@ # TODO: Test that de-duplication also works for calculations # TODO: Test incremental dumping +from datetime import datetime + import pytest from aiida import orm -from aiida.tools.dumping import BaseDumper, CollectionDumper, ProcessDumper +from aiida.tools.dumping import CollectionDumper # Fixture that depends on generate_calculation_node_add_class # @pytest.fixture(scope="class") @@ -24,7 +26,7 @@ @pytest.fixture() -def setup_no_process_group(): +def setup_no_process_group() -> orm.Group: no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') if no_process_group.is_empty: int_node = orm.Int(1).store() @@ -33,7 +35,7 @@ def setup_no_process_group(): @pytest.fixture() -def setup_add_group(generate_calculation_node_add): +def setup_add_group(generate_calculation_node_add) -> orm.Group: add_group, _ = orm.Group.collection.get_or_create(label='add') if add_group.is_empty: add_node = generate_calculation_node_add() @@ -42,7 +44,7 @@ def setup_add_group(generate_calculation_node_add): @pytest.fixture() -def setup_multiply_add_group(generate_workchain_multiply_add): +def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') if multiply_add_group.is_empty: multiply_add_node = generate_workchain_multiply_add() @@ -51,7 +53,13 @@ def setup_multiply_add_group(generate_workchain_multiply_add): @pytest.fixture() -def multiply_process_groups(): ... +def duplicate_group(): + def _duplicate_group(source_group: orm.Group, dest_group_label: str): + dupl_group, created = orm.Group.collection.get_or_create(label=dest_group_label) + dupl_group.add_nodes(list(source_group.nodes)) + return dupl_group + + return _duplicate_group @pytest.mark.usefixtures('aiida_profile_clean_class') @@ -61,30 +69,100 @@ def test_should_dump_processes(self, setup_no_process_group, setup_add_group): no_process_group: orm.Group = setup_no_process_group add_group: orm.Group = setup_add_group - base_dumper = BaseDumper() - process_dumper = ProcessDumper() + collection_dumper = CollectionDumper(collection=no_process_group) - group_dumper = CollectionDumper( - base_dumper=base_dumper, process_dumper=process_dumper, collection=no_process_group - ) + assert collection_dumper._should_dump_processes() is False - assert group_dumper._should_dump_processes() is False + collection_dumper = CollectionDumper(collection=add_group) - group_dumper = CollectionDumper(base_dumper=base_dumper, process_dumper=process_dumper, collection=add_group) + assert collection_dumper._should_dump_processes() is True - assert group_dumper._should_dump_processes() is True + @pytest.mark.usefixtures('aiida_profile_clean') + def test_get_nodes( + self, setup_no_process_group, setup_add_group, setup_multiply_add_group, generate_calculation_node_add + ): + add_group: orm.Group = setup_add_group - # def test_get_nodes(self): - # pass + collection_dumper = CollectionDumper(collection=add_group) + nodes = collection_dumper._get_nodes() + group_node = orm.load_node(nodes[0]) + group_node_uuid = nodes[0] - # def test_get_processes(self): - # pass + assert len(nodes) == 1 + assert isinstance(nodes[0], str) + assert isinstance(group_node, orm.CalcJobNode) + assert nodes[0] 
== group_node_uuid - # def test_dump_processes(self): - # pass + # Now, add another CalcJobNode to the profile + # As not part of the group, should not be returned + cj_node1 = generate_calculation_node_add() + nodes = collection_dumper._get_nodes() + assert len(nodes) == 1 - # def test_dump_calculations(self): - # pass + # Now, add the node to the group, should be captured by get_nodes + add_group.add_nodes([cj_node1]) + nodes = collection_dumper._get_nodes() + assert len(nodes) == 2 + + # Filtering by time should work + collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone() + + cj_node2 = generate_calculation_node_add() + add_group.add_nodes([cj_node2]) + + nodes = collection_dumper._get_nodes() + assert len(nodes) == 1 + assert nodes[0] == cj_node2.uuid + + with pytest.raises(TypeError): + collection_dumper = CollectionDumper(collection=[1]) + collection_dumper._get_nodes() + + def test_get_processes(self, setup_add_group, setup_multiply_add_group, duplicate_group): + add_group: orm.Group = setup_add_group + multiply_add_group: orm.Group = setup_multiply_add_group + + add_nodes = list(add_group.nodes) + multiply_add_nodes = list(multiply_add_group.nodes) + + add_dumper = CollectionDumper(collection=add_group) + multiply_add_dumper = CollectionDumper(collection=multiply_add_group) + + add_process_dict = add_dumper._get_processes() + assert len(add_process_dict['calculations']) == 1 + assert add_process_dict['calculations'][0].uuid == add_nodes[0].uuid + assert len(add_process_dict['workflows']) == 0 + + multiply_add_process_dict = multiply_add_dumper._get_processes() + + assert len(multiply_add_process_dict['calculations']) == 2 + assert set(multiply_add_process_dict['calculations']) == set(multiply_add_nodes[0].called_descendants) + assert len(multiply_add_process_dict['workflows']) == 1 + assert multiply_add_process_dict['calculations'][0].uuid == multiply_add_nodes[0].uuid + + # TODO: Test here also de-duplication with a Workflow with a sub-workflow + + def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_path): + add_group: orm.Group = setup_add_group + multiply_add_group: orm.Group = setup_multiply_add_group + + add_group_path = tmp_path / 'add_group' + multiply_add_group_path = tmp_path / 'multiply_add_group' + + add_dumper = CollectionDumper(collection=add_group, output_path=add_group_path) + multiply_add_dumper = CollectionDumper(collection=multiply_add_group, output_path=multiply_add_group_path) + + add_process_dict = add_dumper._get_processes() + + add_dumper._dump_calculations(add_process_dict['calculations']) + + assert (add_group_path / 'calculations' / 'ArithmeticAddCalculation-4' / 'inputs' / 'aiida.in').exists() + + multiply_add_process_dict = multiply_add_dumper._get_processes() + + multiply_add_dumper._dump_calculations(multiply_add_process_dict['calculations']) + + pytest.set_trace() # def test_dump_workflows(self): # pass diff --git a/tests/tools/dumping/test_process.py b/tests/tools/dumping/test_process.py index 683e3c4707..56fb356054 100644 --- a/tests/tools/dumping/test_process.py +++ b/tests/tools/dumping/test_process.py @@ -265,69 +265,6 @@ def test_dump_calculation_add(tmp_path, generate_calculation_node_add): assert all([output_file.is_file() for output_file in output_files]) -# Tests for helper methods -@pytest.mark.usefixtures('chdir_tmp_path') -def test_prepare_dump_path(tmp_path): - from aiida.tools.dumping.utils import prepare_dump_path - - test_dir = tmp_path / Path('test-dir') - test_file = test_dir / 
filename - safeguard_file = node_metadata_file - safeguard_file_path = test_dir / safeguard_file - - # Cannot set both overwrite and incremental to True - with pytest.raises(ValueError): - prepare_dump_path(path_to_validate=test_dir, overwrite=True, incremental=True) - - # Check that fails if file with same name as output dir - test_dir.touch() - with pytest.raises(FileExistsError): - prepare_dump_path(path_to_validate=test_dir) - test_dir.unlink() - - # Check if path created if non-existent - prepare_dump_path(path_to_validate=test_dir) - assert test_dir.exists() - assert safeguard_file_path.is_file() - - # Directory exists, but empty -> is fine - safeguard_file_path.unlink() - prepare_dump_path(path_to_validate=test_dir) - assert test_dir.exists() - assert safeguard_file_path.is_file() - - # Fails if directory not empty, safeguard file existent, and overwrite set to False - test_file.touch() - safeguard_file_path.touch() - with pytest.raises(FileExistsError): - prepare_dump_path(path_to_validate=test_dir, overwrite=False, incremental=False) - - # Fails if directory not empty, overwrite set to True, but safeguard_file not found (for safety reasons) - safeguard_file_path.unlink() - test_file.touch() - with pytest.raises(FileNotFoundError): - prepare_dump_path(path_to_validate=test_dir, overwrite=True, incremental=False) - - # Works if directory not empty, overwrite set to True and safeguard_file contained - # -> After function call, test_file is deleted, and safeguard_file again created - safeguard_file_path.touch() - prepare_dump_path( - path_to_validate=test_dir, - safeguard_file=safeguard_file, - overwrite=True, - incremental=False, - ) - assert not test_file.is_file() - assert safeguard_file_path.is_file() - - # Works if directory not empty, but incremental=True and safeguard_file (e.g. `.aiida_node_metadata.yaml`) contained - # -> After function call, test file and safeguard_file still there - test_file.touch() - prepare_dump_path(path_to_validate=test_dir, safeguard_file=safeguard_file, incremental=True) - assert safeguard_file_path.is_file() - assert test_file.is_file() - - @pytest.mark.usefixtures('aiida_profile_clean') def test_generate_default_dump_path( generate_calculation_node_add, diff --git a/tests/tools/dumping/test_utils.py b/tests/tools/dumping/test_utils.py new file mode 100644 index 0000000000..108a8c612a --- /dev/null +++ b/tests/tools/dumping/test_utils.py @@ -0,0 +1,78 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for utility functions for the dumping data to disk.""" + +from pathlib import Path + +import pytest + +filename = 'file.txt' +node_metadata_file = '.aiida_node_metadata.yaml' + + +@pytest.mark.usefixtures('chdir_tmp_path') +def test_prepare_dump_path(tmp_path): + from aiida.tools.dumping.utils import prepare_dump_path + + test_dir = tmp_path / Path('test-dir') + test_file = test_dir / filename + safeguard_file = node_metadata_file + safeguard_file_path = test_dir / safeguard_file + + # Cannot set both overwrite and incremental to True + with pytest.raises(ValueError): + prepare_dump_path(path_to_validate=test_dir, overwrite=True, incremental=True) + + # Check that fails if file with same name as output dir + test_dir.touch() + with pytest.raises(FileExistsError): + prepare_dump_path(path_to_validate=test_dir) + test_dir.unlink() + + # Check if path created if non-existent + prepare_dump_path(path_to_validate=test_dir) + assert test_dir.exists() + assert safeguard_file_path.is_file() + + # Directory exists, but empty -> is fine + safeguard_file_path.unlink() + prepare_dump_path(path_to_validate=test_dir) + assert test_dir.exists() + assert safeguard_file_path.is_file() + + # Fails if directory not empty, safeguard file existent, and overwrite set to False + test_file.touch() + safeguard_file_path.touch() + with pytest.raises(FileExistsError): + prepare_dump_path(path_to_validate=test_dir, overwrite=False, incremental=False) + + # Fails if directory not empty, overwrite set to True, but safeguard_file not found (for safety reasons) + safeguard_file_path.unlink() + test_file.touch() + with pytest.raises(FileNotFoundError): + prepare_dump_path(path_to_validate=test_dir, overwrite=True, incremental=False) + + # Works if directory not empty, overwrite set to True and safeguard_file contained + # -> After function call, test_file is deleted, and safeguard_file again created + safeguard_file_path.touch() + prepare_dump_path( + path_to_validate=test_dir, + safeguard_file=safeguard_file, + overwrite=True, + incremental=False, + ) + assert not test_file.is_file() + assert safeguard_file_path.is_file() + + # Works if directory not empty, but incremental=True and safeguard_file (e.g. 
`.aiida_node_metadata.yaml`) contained + # -> After function call, test file and safeguard_file still there + test_file.touch() + prepare_dump_path(path_to_validate=test_dir, safeguard_file=safeguard_file, incremental=True) + assert safeguard_file_path.is_file() + assert test_file.is_file() From 7c905e6c1dba2ea9d87fb657472171ffea6230e0 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 6 Feb 2025 12:04:12 +0100 Subject: [PATCH 19/27] Add `ProcessesToDump` NamedTuple --- src/aiida/tools/dumping/collection.py | 98 +++++++++++++++----------- src/aiida/tools/dumping/profile.py | 7 +- tests/tools/dumping/test_collection.py | 32 ++++----- 3 files changed, 77 insertions(+), 60 deletions(-) diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 8ea026382c..c1de2dc1b5 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -14,7 +14,7 @@ from datetime import datetime from functools import cached_property from pathlib import Path -from typing import TYPE_CHECKING, TypeVar, cast +from typing import TYPE_CHECKING, NamedTuple, TypeVar from aiida import orm from aiida.common.log import AIIDA_LOGGER @@ -26,12 +26,19 @@ if TYPE_CHECKING: from collections.abc import Sequence + from aiida.tools.dumping.logger import DumpDict + T = TypeVar('T', bound='orm.ProcessNode') logger = AIIDA_LOGGER.getChild('tools.dumping') +class ProcessesToDump(NamedTuple): + calculations: Sequence[orm.CalculationNode] + workflows: Sequence[orm.WorkflowNode] + + class CollectionDumper: def __init__( self, @@ -41,6 +48,7 @@ def __init__( collection: orm.Group | str | list[str] | None = None, deduplicate: bool = True, output_path: Path | None = None, + processes_to_dump: ProcessesToDump | None = None, ): self.deduplicate = deduplicate @@ -83,36 +91,41 @@ def _get_nodes(self) -> list[str]: filtered_nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) return filtered_nodes - def _should_dump_processes(self, nodes: list[str] | None = None) -> bool: - test_nodes = nodes or self.nodes - return len([node for node in test_nodes if isinstance(orm.load_node(node), orm.ProcessNode)]) > 0 + @cached_property + def processes_to_dump(self) -> ProcessesToDump: + return self._get_processes_to_dump() - def _get_processes(self) -> dict[str, Sequence[orm.ProcessNode]]: + def _get_processes_to_dump(self) -> ProcessesToDump: nodes = [orm.load_node(n) for n in self.nodes] workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)] + calculations = [node for node in nodes if isinstance(node, orm.CalculationNode)] # Make sure that only top-level workflows are dumped in their own directories when de-duplcation is enabled if self.deduplicate: workflows = [workflow for workflow in workflows if workflow.caller is None] - # Also need to obtain sub-calculations that were called by workflows of the group - # These are not contained in the group.nodes directly - called_calculations = [] - for workflow in workflows: - called_calculations += [ - node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) - ] - - calculations = [node for node in nodes if isinstance(node, orm.CalculationNode)] + called_calculations - return { - 'calculations': cast(Sequence[orm.ProcessNode], calculations), - 'workflows': cast(Sequence[orm.ProcessNode], workflows), - } - # return {'calculations': calculations, 'workflows': workflows} + else: + # If no deduplication, also sub-calculations that were called by workflows 
of the group, and which are not + # contained in the group.nodes directly are being dumped explicitly + called_calculations = [] + for workflow in workflows: + called_calculations += [ + node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) + ] + + calculations += called_calculations + + return ProcessesToDump( + calculations=calculations, + workflows=workflows, + ) + + def should_dump_processes(self) -> bool: + # if self.processes_to_dump is None: + # self._get_processes_to_dump() + return (len(self.processes_to_dump.calculations) + len(self.processes_to_dump.workflows)) > 0 def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> None: - if len(calculations) == 0: - return calculations_path = self.output_path / 'calculations' dumped_calculations = {} @@ -123,34 +136,32 @@ def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> Non process_node=calculation, prefix=None ) - if calculation.caller is None: - calculation_dumper._dump_calculation( - calculation_node=cast(orm.CalculationNode, calculation), output_path=calculation_dump_path - ) + # This is handled in the get_processes method: `if calculation.caller is None:` + calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - dumped_calculations[calculation.uuid] = DumpLog( - path=calculation_dump_path, - time=datetime.now().astimezone(), - ) + dumped_calculations[calculation.uuid] = DumpLog( + path=calculation_dump_path, + time=datetime.now().astimezone(), + ) - self.dump_logger.update_calculations(dumped_calculations) + self.dump_logger.update_calculations(new_calculations=dumped_calculations) def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: - if len(workflows) == 0: - return - workflow_path = self.output_path / 'workflows' + workflow_path: Path = self.output_path / 'workflows' + dumped_workflows: dict[str, DumpLog] = {} + workflow_path.mkdir(exist_ok=True, parents=True) - dumped_workflows = {} for workflow in workflows: - workflow_dumper = self.process_dumper + workflow_dumper: ProcessDumper = self.process_dumper - workflow_dump_path = workflow_path / workflow_dumper._generate_default_dump_path( + workflow_dump_path: Path = workflow_path / workflow_dumper._generate_default_dump_path( process_node=workflow, prefix=None ) - logged_workflows = self.dump_logger.get_log()['workflows'] + logged_workflows: DumpDict = self.dump_logger.get_log()['workflows'] + # Symlink here, if deduplication enabled and workflow was already dumped if self.deduplicate and workflow in logged_workflows.keys(): os.symlink( src=logged_workflows[workflow.uuid].path, @@ -158,7 +169,7 @@ def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: ) else: workflow_dumper._dump_workflow( - workflow_node=cast(orm.WorkflowNode, workflow), + workflow_node=workflow, output_path=workflow_dump_path, ) @@ -167,10 +178,13 @@ def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: time=datetime.now().astimezone(), ) - self.dump_logger.update_workflows(dumped_workflows) + self.dump_logger.update_workflows(new_workflows=dumped_workflows) def dump(self) -> None: self.output_path.mkdir(exist_ok=True, parents=True) - collection_processes = self._get_processes() - self._dump_calculations(calculations=collection_processes['calculations']) - self._dump_workflows(workflows=collection_processes['workflows']) + collection_processes: ProcessesToDump = self._get_processes_to_dump() + + if 
len(collection_processes.calculations) > 1: + self._dump_calculations(calculations=collection_processes.calculations) + if len(collection_processes.workflows) > 1: + self._dump_workflows(workflows=collection_processes.workflows) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index a591d50269..03f3643b18 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -74,6 +74,9 @@ def dump(self): def _dump_processes_not_in_any_group(self): # === Dump the data that is not associated with any group === + + # `dump_parent_path` is set in the `post_init` method of the `BaseDumper` dataclass + assert self.base_dumper.dump_parent_path is not None if self.organize_by_groups: output_path = self.base_dumper.dump_parent_path / 'no-group' else: @@ -90,7 +93,7 @@ def _dump_processes_not_in_any_group(self): output_path=output_path, ) - if self.dump_processes and no_group_dumper._should_dump_processes(): + if self.dump_processes and no_group_dumper.should_dump_processes(): logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') no_group_dumper.dump() @@ -113,7 +116,7 @@ def _dump_processes_per_group(self, groups): output_path=output_path, ) - if self.dump_processes and group_dumper._should_dump_processes(): + if self.dump_processes and group_dumper.should_dump_processes(): logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') group_dumper.dump() diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index 0e3f11c130..d6c7ed067e 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -71,11 +71,11 @@ def test_should_dump_processes(self, setup_no_process_group, setup_add_group): collection_dumper = CollectionDumper(collection=no_process_group) - assert collection_dumper._should_dump_processes() is False + assert collection_dumper.should_dump_processes() is False collection_dumper = CollectionDumper(collection=add_group) - assert collection_dumper._should_dump_processes() is True + assert collection_dumper.should_dump_processes() is True @pytest.mark.usefixtures('aiida_profile_clean') def test_get_nodes( @@ -118,7 +118,7 @@ def test_get_nodes( collection_dumper = CollectionDumper(collection=[1]) collection_dumper._get_nodes() - def test_get_processes(self, setup_add_group, setup_multiply_add_group, duplicate_group): + def test_get_processes_to_dump(self, setup_add_group, setup_multiply_add_group, duplicate_group): add_group: orm.Group = setup_add_group multiply_add_group: orm.Group = setup_multiply_add_group @@ -128,17 +128,17 @@ def test_get_processes(self, setup_add_group, setup_multiply_add_group, duplicat add_dumper = CollectionDumper(collection=add_group) multiply_add_dumper = CollectionDumper(collection=multiply_add_group) - add_process_dict = add_dumper._get_processes() - assert len(add_process_dict['calculations']) == 1 - assert add_process_dict['calculations'][0].uuid == add_nodes[0].uuid - assert len(add_process_dict['workflows']) == 0 + add_process_to_dump = add_dumper._get_processes_to_dump() + assert len(add_process_to_dump.calculations) == 1 + assert add_process_to_dump.calculations[0].uuid == add_nodes[0].uuid + assert len(add_process_to_dump.workflows) == 0 - multiply_add_process_dict = multiply_add_dumper._get_processes() + multiply_add_processes_to_dump = multiply_add_dumper._get_processes_to_dump() - assert len(multiply_add_process_dict['calculations']) == 2 - assert 
set(multiply_add_process_dict['calculations']) == set(multiply_add_nodes[0].called_descendants) - assert len(multiply_add_process_dict['workflows']) == 1 - assert multiply_add_process_dict['calculations'][0].uuid == multiply_add_nodes[0].uuid + assert len(multiply_add_processes_to_dump.calculations) == 2 + assert set(multiply_add_processes_to_dump.calculations) == set(multiply_add_nodes[0].called_descendants) + assert len(multiply_add_processes_to_dump.workflows) == 1 + assert multiply_add_processes_to_dump.calculations[0].uuid == multiply_add_nodes[0].uuid # TODO: Test here also de-duplication with a Workflow with a sub-workflow @@ -152,15 +152,15 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ add_dumper = CollectionDumper(collection=add_group, output_path=add_group_path) multiply_add_dumper = CollectionDumper(collection=multiply_add_group, output_path=multiply_add_group_path) - add_process_dict = add_dumper._get_processes() + add_processes_to_dump = add_dumper._get_processes_to_dump() - add_dumper._dump_calculations(add_process_dict['calculations']) + add_dumper._dump_calculations(add_processes_to_dump.calculations) assert (add_group_path / 'calculations' / 'ArithmeticAddCalculation-4' / 'inputs' / 'aiida.in').exists() - multiply_add_process_dict = multiply_add_dumper._get_processes() + multiply_add_processes_to_dump = multiply_add_dumper._get_processes_to_dump() - multiply_add_dumper._dump_calculations(multiply_add_process_dict['calculations']) + multiply_add_dumper._dump_calculations(multiply_add_processes_to_dump.calculations) pytest.set_trace() From 7dba485fb8bc2012de2e6aa1dc6ef7a9c03e8a50 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 6 Feb 2025 13:57:05 +0100 Subject: [PATCH 20/27] Use `compare_tree` utility function for dumping tests --- src/aiida/cmdline/commands/cmd_profile.py | 2 + src/aiida/tools/dumping/collection.py | 4 +- src/aiida/tools/dumping/profile.py | 4 +- tests/tools/dumping/test_collection.py | 100 +++++++++++----------- tests/tools/dumping/test_utils.py | 29 +++++++ 5 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 78c7e62686..215af6c2b6 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -336,6 +336,8 @@ def profile_mirror( except FileExistsError as exc: echo.echo_critical(str(exc)) + breakpoint() + try: with safeguard_file_path.open('r') as fhandle: last_dump_time = datetime.fromisoformat(fhandle.readlines()[-1].strip().split()[-1]).astimezone() diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index c1de2dc1b5..c1c1674442 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -184,7 +184,7 @@ def dump(self) -> None: self.output_path.mkdir(exist_ok=True, parents=True) collection_processes: ProcessesToDump = self._get_processes_to_dump() - if len(collection_processes.calculations) > 1: + if len(collection_processes.calculations) > 0: self._dump_calculations(calculations=collection_processes.calculations) - if len(collection_processes.workflows) > 1: + if len(collection_processes.workflows) > 0: self._dump_workflows(workflows=collection_processes.workflows) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 03f3643b18..bc9f45fa80 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -101,9 +101,11 @@ 
def _dump_processes_not_in_any_group(self): def _dump_processes_per_group(self, groups): # === Dump data per-group if Groups exist in profile or are selected === + assert self.base_dumper.dump_parent_path is not None + for group in groups: if self.organize_by_groups: - output_path = self.base_dumper.dump_parent_path / group.label + output_path = self.base_dumper.dump_parent_path / f"group-{group.label}" else: output_path = self.base_dumper.dump_parent_path diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index d6c7ed067e..82491a7ff4 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -12,12 +12,15 @@ # TODO: Test incremental dumping from datetime import datetime +from pathlib import Path import pytest from aiida import orm from aiida.tools.dumping import CollectionDumper +from .test_utils import compare_tree + # Fixture that depends on generate_calculation_node_add_class # @pytest.fixture(scope="class") # def setup_calculation_node_add_class(generate_calculation_node_add_class): @@ -146,21 +149,61 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ add_group: orm.Group = setup_add_group multiply_add_group: orm.Group = setup_multiply_add_group - add_group_path = tmp_path / 'add_group' - multiply_add_group_path = tmp_path / 'multiply_add_group' + add_group_path = Path('add_group') + multiply_add_group_path = Path('multiply_add_group') - add_dumper = CollectionDumper(collection=add_group, output_path=add_group_path) - multiply_add_dumper = CollectionDumper(collection=multiply_add_group, output_path=multiply_add_group_path) + add_dumper = CollectionDumper(collection=add_group, output_path=tmp_path / add_group_path) + multiply_add_dumper = CollectionDumper( + collection=multiply_add_group, output_path=tmp_path / multiply_add_group_path + ) add_processes_to_dump = add_dumper._get_processes_to_dump() add_dumper._dump_calculations(add_processes_to_dump.calculations) - assert (add_group_path / 'calculations' / 'ArithmeticAddCalculation-4' / 'inputs' / 'aiida.in').exists() + expected_tree = { + 'calculations': { + 'ArithmeticAddCalculation-4': { + 'inputs': ['_aiidasubmit.sh', 'aiida.in'], + 'node_inputs': [], + 'outputs': ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'], + } + } + } + + compare_tree(expected=expected_tree, base_path=tmp_path, relative_path=add_group_path) multiply_add_processes_to_dump = multiply_add_dumper._get_processes_to_dump() + # No calculations to dump when deduplication is enabled multiply_add_dumper._dump_calculations(multiply_add_processes_to_dump.calculations) + multiply_add_test_path: Path = multiply_add_group_path / 'calculations' + + assert not multiply_add_test_path.exists() + + multiply_add_dumper_no_dedup = CollectionDumper( + collection=multiply_add_group, output_path=multiply_add_group_path, deduplicate=False + ) + multiply_add_processes_to_dump = multiply_add_dumper_no_dedup._get_processes_to_dump() + + # calculations to dump when deduplication is enabled + multiply_add_dumper_no_dedup._dump_calculations(multiply_add_processes_to_dump.calculations) + + expected_tree_no_dedup = { + 'calculations': { + 'ArithmeticAddCalculation-15': { + 'inputs': ['_aiidasubmit.sh', 'aiida.in'], + 'node_inputs': [], + 'outputs': ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'], + }, + 'multiply-13': { + 'inputs': ['source_file'], + 'node_inputs': [], + }, + } + } + + compare_tree(expected=expected_tree_no_dedup, 
base_path=tmp_path, relative_path=multiply_add_group_path) pytest.set_trace() @@ -169,50 +212,3 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ # def test_dump(self): # pass - - -#######3 - -# def test_setup_profile( -# self, -# generate_calculation_node_add, -# generate_workchain_multiply_add, -# generate_calculation_node_io, -# generate_workchain_node_io, -# ): -# # TODO: This is a hack... and not actually a real test -# # TODO: I'm using the `aiida_profile_clean_class` fiture to make sure I have a clean profile for this class -# # TODO: However, this method is not an actual test, but sets up the profile data how I want it for testing -# # TODO: Ideally, I'd create a class-scoped fixture that does the setup -# # TODO: Or define a `setup_class` method -# # TODO: However, as most of AiiDA's fixtures are function-scoped, I didn't manage to get any of these approaches -# # TODO: To work, due to pytest's ScopeMismatch exceptions - -# # Create nodes for profile storage -# ## Not in any group -# int_node = orm.Int(1).store() -# _ = generate_calculation_node_add() -# _ = generate_workchain_multiply_add() -# ## For putting into groups -# add_node = generate_calculation_node_add() -# multiply_add_node = generate_workchain_multiply_add() - -# # Create the various groups -# add_group, _ = orm.Group.collection.get_or_create(label='add') -# multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') -# cj_dupl_group, _ = orm.Group.collection.get_or_create(label='cj-dupl') -# wc_dupl_group, _ = orm.Group.collection.get_or_create(label='wc-dupl') -# no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') - -# # Populate groups -# add_group.add_nodes([add_node]) -# multiply_add_group.add_nodes([multiply_add_node]) -# cj_dupl_group.add_nodes([add_node]) -# wc_dupl_group.add_nodes([multiply_add_node]) -# no_process_group.add_nodes([int_node]) - -# self.add_group = add_group -# self.multiply_add_group = multiply_add_group -# self.cj_dupl_group = cj_dupl_group -# self.wc_dupl_group = wc_dupl_group -# self.no_process_group = no_process_group diff --git a/tests/tools/dumping/test_utils.py b/tests/tools/dumping/test_utils.py index 108a8c612a..b30e49c146 100644 --- a/tests/tools/dumping/test_utils.py +++ b/tests/tools/dumping/test_utils.py @@ -76,3 +76,32 @@ def test_prepare_dump_path(tmp_path): prepare_dump_path(path_to_validate=test_dir, safeguard_file=safeguard_file, incremental=True) assert safeguard_file_path.is_file() assert test_file.is_file() + + +def compare_tree(expected: dict, base_path: Path, relative_path: Path = Path()): + """Recursively compares an expected directory structure with an actual path. + + Args: + expected (dict): The expected directory structure. + base_path (Path): The root directory where the actual structure is located. + relative_path (Path): The relative path inside the base directory (used internally for recursion). 
+ """ + actual_path = base_path / relative_path + + assert actual_path.exists(), f'Path does not exist: {actual_path}' + assert actual_path.is_dir(), f'Path is not a directory: {actual_path}' + + for name, content in expected.items(): + item_path = actual_path / name + assert item_path.exists(), f'Missing: {item_path}' + + if isinstance(content, list): # It's a directory with files (list of filenames) + assert item_path.is_dir(), f'Expected a directory: {item_path}' + # Check that all files exist inside the directory + for filename in content: + file_path = item_path / filename + assert file_path.exists(), f'Missing file: {file_path}' + assert file_path.is_file(), f'Expected a file: {file_path}' + elif isinstance(content, dict): # It's a subdirectory + assert item_path.is_dir(), f'Expected a directory: {item_path}' + compare_tree(content, base_path, relative_path / name) From 8ddaa159c576882111ecf910584482deff8e70c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:57:29 +0000 Subject: [PATCH 21/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/tools/dumping/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index bc9f45fa80..6b9f33a58e 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -105,7 +105,7 @@ def _dump_processes_per_group(self, groups): for group in groups: if self.organize_by_groups: - output_path = self.base_dumper.dump_parent_path / f"group-{group.label}" + output_path = self.base_dumper.dump_parent_path / f'group-{group.label}' else: output_path = self.base_dumper.dump_parent_path From 42e76ceb8bbd9c6bd861243216478f3fb26c09d1 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 6 Feb 2025 15:55:18 +0100 Subject: [PATCH 22/27] Start making test methods smaller --- src/aiida/tools/dumping/collection.py | 6 +- tests/tools/dumping/test_collection.py | 110 ++++++++++++++++++------- 2 files changed, 84 insertions(+), 32 deletions(-) diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index c1c1674442..edc8219b98 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -88,8 +88,10 @@ def _get_nodes(self) -> list[str]: else: nodes = [] - filtered_nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) - return filtered_nodes + # TODO: Possibly have `last_dump_time` as attribute of CollectionDumper instead + # nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.last_dump_time) + nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) + return nodes @cached_property def processes_to_dump(self) -> ProcessesToDump: diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index 82491a7ff4..5ad3ddd01b 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -17,7 +17,7 @@ import pytest from aiida import orm -from aiida.tools.dumping import CollectionDumper +from aiida.tools.dumping import CollectionDumper, collection from .test_utils import compare_tree @@ -28,6 +28,7 @@ # generate_calculation_node_add_class() # You can also do any additional setup here +@pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_no_process_group() -> 
orm.Group: no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') @@ -37,6 +38,7 @@ def setup_no_process_group() -> orm.Group: return no_process_group +@pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_add_group(generate_calculation_node_add) -> orm.Group: add_group, _ = orm.Group.collection.get_or_create(label='add') @@ -46,6 +48,7 @@ def setup_add_group(generate_calculation_node_add) -> orm.Group: return add_group +@pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') @@ -55,6 +58,7 @@ def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: return multiply_add_group +@pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def duplicate_group(): def _duplicate_group(source_group: orm.Group, dest_group_label: str): @@ -80,46 +84,51 @@ def test_should_dump_processes(self, setup_no_process_group, setup_add_group): assert collection_dumper.should_dump_processes() is True - @pytest.mark.usefixtures('aiida_profile_clean') - def test_get_nodes( - self, setup_no_process_group, setup_add_group, setup_multiply_add_group, generate_calculation_node_add - ): + + def test_get_nodes_add_group(self, setup_add_group): + add_group: orm.Group = setup_add_group collection_dumper = CollectionDumper(collection=add_group) - nodes = collection_dumper._get_nodes() - group_node = orm.load_node(nodes[0]) - group_node_uuid = nodes[0] - assert len(nodes) == 1 - assert isinstance(nodes[0], str) - assert isinstance(group_node, orm.CalcJobNode) - assert nodes[0] == group_node_uuid - - # Now, add another CalcJobNode to the profile - # As not part of the group, should not be returned - cj_node1 = generate_calculation_node_add() nodes = collection_dumper._get_nodes() assert len(nodes) == 1 + # add_group: orm.Group = setup_add_group - # Now, add the node to the group, should be captured by get_nodes - add_group.add_nodes([cj_node1]) - nodes = collection_dumper._get_nodes() - assert len(nodes) == 2 + # collection_dumper = CollectionDumper(collection=add_group) + # nodes = collection_dumper._get_nodes() + # group_node = orm.load_node(nodes[0]) + # group_node_uuid = nodes[0] - # Filtering by time should work - collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone() + # assert len(nodes) == 1 + # assert isinstance(nodes[0], str) + # assert isinstance(group_node, orm.CalcJobNode) + # assert nodes[0] == group_node_uuid - cj_node2 = generate_calculation_node_add() - add_group.add_nodes([cj_node2]) + # # Now, add another CalcJobNode to the profile + # # As not part of the group, should not be returned + # cj_node1 = generate_calculation_node_add() + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 1 - nodes = collection_dumper._get_nodes() - assert len(nodes) == 1 - assert nodes[0] == cj_node2.uuid + # # Now, add the node to the group, should be captured by get_nodes + # add_group.add_nodes([cj_node1]) + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 2 + + # # Filtering by time should work + # collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone() - with pytest.raises(TypeError): - collection_dumper = CollectionDumper(collection=[1]) - collection_dumper._get_nodes() + # cj_node2 = generate_calculation_node_add() + # add_group.add_nodes([cj_node2]) + + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 
1 + # assert nodes[0] == cj_node2.uuid + + # with pytest.raises(TypeError): + # collection_dumper = CollectionDumper(collection=[1]) + # collection_dumper._get_nodes() def test_get_processes_to_dump(self, setup_add_group, setup_multiply_add_group, duplicate_group): add_group: orm.Group = setup_add_group @@ -212,3 +221,44 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ # def test_dump(self): # pass + + # @pytest.mark.usefixtures('aiida_profile_clean') + # def test_get_nodes( + # self, setup_no_process_group, setup_add_group, setup_multiply_add_group, generate_calculation_node_add + # ): + # add_group: orm.Group = setup_add_group + + # collection_dumper = CollectionDumper(collection=add_group) + # nodes = collection_dumper._get_nodes() + # group_node = orm.load_node(nodes[0]) + # group_node_uuid = nodes[0] + + # assert len(nodes) == 1 + # assert isinstance(nodes[0], str) + # assert isinstance(group_node, orm.CalcJobNode) + # assert nodes[0] == group_node_uuid + + # # Now, add another CalcJobNode to the profile + # # As not part of the group, should not be returned + # cj_node1 = generate_calculation_node_add() + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 1 + + # # Now, add the node to the group, should be captured by get_nodes + # add_group.add_nodes([cj_node1]) + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 2 + + # # Filtering by time should work + # collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone() + + # cj_node2 = generate_calculation_node_add() + # add_group.add_nodes([cj_node2]) + + # nodes = collection_dumper._get_nodes() + # assert len(nodes) == 1 + # assert nodes[0] == cj_node2.uuid + + # with pytest.raises(TypeError): + # collection_dumper = CollectionDumper(collection=[1]) + # collection_dumper._get_nodes() \ No newline at end of file From de8f92f9318f5f3241161f3131c4ddbe6c14fec2 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 10 Feb 2025 18:56:41 +0100 Subject: [PATCH 23/27] Commit changes to continue. --- src/aiida/cmdline/commands/cmd_profile.py | 71 +++-- src/aiida/cmdline/params/options/main.py | 23 +- src/aiida/common/utils.py | 2 + src/aiida/repository/repository.py | 1 + src/aiida/tools/dumping/base.py | 4 + src/aiida/tools/dumping/collection.py | 318 +++++++++++++++++----- src/aiida/tools/dumping/config.py | 11 + src/aiida/tools/dumping/process.py | 9 +- src/aiida/tools/dumping/profile.py | 131 +++++---- src/aiida/tools/dumping/utils.py | 88 ++++-- tests/tools/dumping/test_collection.py | 151 +++++----- 11 files changed, 553 insertions(+), 256 deletions(-) create mode 100644 src/aiida/tools/dumping/config.py diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 215af6c2b6..e616debd06 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -273,37 +273,57 @@ def profile_delete(force, delete_data, profiles): @verdi_profile.command('mirror') @options.PATH() +@options.DRY_RUN() @options.OVERWRITE() -# @options.INCREMENTAL() @options.DUMP_PROCESSES() -@options.DEDUPLICATE() +@options.GROUPS() +@options.ORGANIZE_BY_GROUPS() +# @options.DEDUPLICATE() +# @click.option( +# '--check-dirs/--no-check-dirs', +# default=False, +# show_default=True, +# help='Check for existence of dump directories. 
Otherwise, incremental mirroring is only evaluated from the log.') +@click.option( + '--symlink-duplicates/--no-symlink-duplicates', + default=True, + show_default=True, + help='Symlink data if the same node is contained in multiple groups.') +@click.option( + '--delete-missing/--no-delete-missing', + default=False, + show_default=True, + help="If a previously dumped node is deleted from AiiDA's DB, also delete the corresponding dump directory.") +@click.option( + '--extra-calc-dirs/--no-extra-calc-dirs', + default=False, + show_default=True, + help='If a top-level process calls sub-processes, create a designated directory only for the top-level process.') @options.INCLUDE_INPUTS() @options.INCLUDE_OUTPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() @options.FLAT() -@options.DUMP_CONFIG_FILE() -@options.GROUPS() -@options.ORGANIZE_BY_GROUPS() -@options.DRY_RUN() @click.pass_context def profile_mirror( ctx, path, - overwrite, - organize_by_groups, dry_run, + overwrite, dump_processes, - deduplicate, + groups, + organize_by_groups, + symlink_duplicates, + delete_missing, + extra_calc_dirs, + # check_dirs, include_inputs, include_outputs, include_attributes, include_extras, flat, - dump_config_file, - groups, ): - """Dump all data in an AiiDA profile's storage to disk.""" + """Dump all data in an AiiDA profile's storage to disk in a human-readable directory tree.""" import json from datetime import datetime @@ -313,6 +333,7 @@ def profile_mirror( from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.utils import prepare_dump_path + from aiida.tools.dumping.config import ProfileDumpConfig profile = ctx.obj['profile'] @@ -321,7 +342,7 @@ def profile_mirror( if path is None: path = Path.cwd() / f'{profile.name}-mirror' - echo.echo_report(f'Mirroring data of profile `{profile.name}`at path: `{path}`.') + echo.echo_report(f'Mirroring data of profile `{profile.name}` at path: `{path}`.') SAFEGUARD_FILE: str = '.verdi_profile_mirror' # noqa: N806 safeguard_file_path: Path = path / SAFEGUARD_FILE @@ -336,8 +357,6 @@ def profile_mirror( except FileExistsError as exc: echo.echo_critical(str(exc)) - breakpoint() - try: with safeguard_file_path.open('r') as fhandle: last_dump_time = datetime.fromisoformat(fhandle.readlines()[-1].strip().split()[-1]).astimezone() @@ -346,9 +365,10 @@ def profile_mirror( if dry_run: node_counts = ProfileDumper._get_number_of_nodes_to_dump(last_dump_time) - node_counts_str = ' & '.join(f'{count} {node_type}' for node_type, count in node_counts.items()) - dry_run_message = f'Dry run for mirroring of profile `{profile.name}`: {node_counts_str} to dump.\n' + dry_run_message = f'Dry run for mirroring of profile `{profile.name}`. 
Would dump:' echo.echo_report(dry_run_message) + for count, node_type in node_counts.items(): + echo.echo_report(f'{count}: {node_type}') return if incremental: @@ -376,18 +396,25 @@ def profile_mirror( flat=flat, ) + # breakpoint() + profile_dump_config = ProfileDumpConfig( + dump_processes=dump_processes, + symlink_duplicates=symlink_duplicates, + delete_missing=delete_missing, + extra_calc_dirs=extra_calc_dirs, + organize_by_groups=organize_by_groups, + ) + profile_dumper = ProfileDumper( + profile=profile, + profile_dump_config=profile_dump_config, base_dumper=base_dumper, process_dumper=process_dumper, dump_logger=dump_logger, groups=groups, - organize_by_groups=organize_by_groups, - deduplicate=deduplicate, - profile=profile, - dump_processes=dump_processes, ) - profile_dumper.dump() + profile_dumper.dump_processes() # Append the current time to the file last_dump_time = datetime.now().astimezone() diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 82d4fda8d8..a806d1b1a2 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -53,12 +53,10 @@ 'DB_PORT', 'DB_USERNAME', 'DEBUG', - 'DEDUPLICATE', 'DESCRIPTION', 'DICT_FORMAT', 'DICT_KEYS', 'DRY_RUN', - 'DUMP_CONFIG_FILE', 'DUMP_PROCESSES', 'EXIT_STATUS', 'EXPORT_FORMAT', @@ -792,13 +790,13 @@ def set_log_level(ctx, _param, value): show_default=True, ) -DEDUPLICATE = OverridableOption( - '--deduplicate/--no-deduplicate', - is_flag=True, - default=True, - show_default=True, - help='', -) +# DEDUPLICATE = OverridableOption( +# '--deduplicate/--no-deduplicate', +# is_flag=True, +# default=True, +# show_default=True, +# help='', +# ) DUMP_PROCESSES = OverridableOption( '--dump-processes/--no-dump-processes', @@ -808,13 +806,6 @@ def set_log_level(ctx, _param, value): help='Dump process data.', ) -DUMP_CONFIG_FILE = OverridableOption( - '--dump-config-file', - default=None, - type=types.FileOrUrl(), - help='Provide dumping options via a config file in YAML format.', -) - ORGANIZE_BY_GROUPS = OverridableOption( '--organize-by-groups/--no-organize-by-groups', default=True, diff --git a/src/aiida/common/utils.py b/src/aiida/common/utils.py index 1b2f2b14ce..8cd1046dfb 100644 --- a/src/aiida/common/utils.py +++ b/src/aiida/common/utils.py @@ -17,6 +17,8 @@ from datetime import datetime from typing import Any, Dict from uuid import UUID +from aiida.manage import get_manager, load_profile +from aiida.manage.configuration.profile import Profile from .lang import classproperty diff --git a/src/aiida/repository/repository.py b/src/aiida/repository/repository.py index 992a96447d..32351ddeef 100644 --- a/src/aiida/repository/repository.py +++ b/src/aiida/repository/repository.py @@ -519,6 +519,7 @@ def copy_tree(self, target: Union[str, pathlib.Path], path: Optional[FilePath] = dirpath.mkdir(parents=True, exist_ok=True) with self.open(root / filename) as handle: + # TODO: Possibly skip filepath.write_bytes(handle.read()) # these methods are not actually used in aiida-core, but are here for completeness diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py index bbe63c9301..6bbd5b505e 100644 --- a/src/aiida/tools/dumping/base.py +++ b/src/aiida/tools/dumping/base.py @@ -14,9 +14,13 @@ @dataclass class BaseDumper: + """Container for shared arguments of all Dumper classes.""" + dump_parent_path: Path | None = None overwrite: bool = False incremental: bool = True + check_dirs: bool = False + # TODO: Make this a per-class attribute? 
last_dump_time: datetime | None = None

     def __post_init__(self):
diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py
index edc8219b98..56352b4574 100644
--- a/src/aiida/tools/dumping/collection.py
+++ b/src/aiida/tools/dumping/collection.py
@@ -10,26 +10,27 @@
 from __future__ import annotations

+from dataclasses import dataclass
 import os
 from datetime import datetime
-from functools import cached_property
 from pathlib import Path
-from typing import TYPE_CHECKING, NamedTuple, TypeVar
+from typing import TYPE_CHECKING, NamedTuple

 from aiida import orm
+from aiida.common.exceptions import NotExistent
 from aiida.common.log import AIIDA_LOGGER
 from aiida.tools.dumping.base import BaseDumper
 from aiida.tools.dumping.logger import DumpLog, DumpLogger
 from aiida.tools.dumping.process import ProcessDumper
 from aiida.tools.dumping.utils import filter_by_last_dump_time
+from aiida.tools.dumping.config import ProfileDumpConfig
+from typing import Literal

 if TYPE_CHECKING:
     from collections.abc import Sequence

     from aiida.tools.dumping.logger import DumpDict

-T = TypeVar('T', bound='orm.ProcessNode')
-
 logger = AIIDA_LOGGER.getChild('tools.dumping')


@@ -38,98 +39,175 @@ class ProcessesToDump(NamedTuple):
     calculations: Sequence[orm.CalculationNode]
     workflows: Sequence[orm.WorkflowNode]

+    @property
+    def is_empty(self) -> bool:
+        """Check if there are any processes to dump."""
+        return len(self.calculations) == 0 and len(self.workflows) == 0
+
+
+# @dataclass
+# class CollectionDumpConfig:
+#     dump_processes: bool = True
+#     symlink_duplicates: bool = True
+#     delete_missing: bool = False
+#     extra_calc_dirs: bool = False
+#     organize_by_groups: bool = True

 class CollectionDumper:
+    """Class to handle dumping of a collection of AiiDA ORM entities."""
+
     def __init__(
         self,
+        collection: orm.Group | str | Sequence[str] | Sequence[int],
+        profile_dump_config: ProfileDumpConfig | None = None,
         base_dumper: BaseDumper | None = None,
         process_dumper: ProcessDumper | None = None,
         dump_logger: DumpLogger | None = None,
-        collection: orm.Group | str | list[str] | None = None,
-        deduplicate: bool = True,
         output_path: Path | None = None,
-        processes_to_dump: ProcessesToDump | None = None,
     ):
-        self.deduplicate = deduplicate
+        """Initialize the CollectionDumper.

-        # Collection could be a Group or a list of nodes
-        if isinstance(collection, str):
-            try:
-                collection = orm.load_group(collection)
-            except:
-                raise
+        :param collection: The collection of AiiDA ORM entities to be dumped, either a group, a group label, or a
+            list of node UUIDs or PKs.
+        :param base_dumper: Base dumper instance or None (gets instantiated).
+        :param process_dumper: Process dumper instance or None (gets instantiated).
+        :param dump_logger: Logger for the dumping (gets instantiated).
+        :param output_path: The parent output path for dumping the collection nodes.
+ """ - self.collection = collection + self.collection = self._validate_collection(collection) self.base_dumper = base_dumper or BaseDumper() self.process_dumper = process_dumper or ProcessDumper() self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) - # Properly set the `output_path` attribute - if output_path is not None: - self.output_path = output_path - else: - self.output_path = Path.cwd() - - @cached_property - def nodes(self) -> list[str]: - return self._get_nodes() - - def _get_nodes(self) -> list[str]: - nodes: list[str] | None = None - if isinstance(self.collection, orm.Group): - nodes = [n.uuid for n in self.collection.nodes] - elif isinstance(self.collection, list) and len(self.collection) > 0: - if all(isinstance(n, str) for n in self.collection): - nodes = self.collection - else: - msg = 'A collection of nodes must be passed via their UUIDs.' - raise TypeError(msg) + self.output_path = output_path or Path.cwd() + + self.profile_dump_config = profile_dump_config or ProfileDumpConfig() + + self._collection_nodes: Sequence[str] | Sequence[int] | None = None + self._processes_to_dump: ProcessesToDump | None = None + + def _validate_collection( + self, collection: orm.Group | str | Sequence[str] | Sequence[int] + ) -> orm.Group | Sequence[str] | Sequence[int]: + """Validate the given collection identifier. + + :param collection: The input collection to validate. + :return: The validated collection. + :raises NotExistent: If no ``orm.Group`` can be loaded for a given label. + :raises ValueError: If no list of integers or strings to identify nodes is passed. + """ + + if isinstance(collection, str): + try: + return orm.load_group(collection) + except Exception as exc: + msg = f'Could not load group: {collection}.' + raise NotExistent(msg) from exc + if (isinstance(collection, list) and all(isinstance(n, (str, int)) for n in collection)) or isinstance( + collection, orm.Group + ): + return collection + else: - nodes = [] + msg = f'{collection} is an invalid collection.' + raise ValueError(msg) + + @property + def collection_nodes(self) -> Sequence[str] | Sequence[int]: + """Return collection nodes. + + :return: List of collection node identifiers. + """ + if not self._collection_nodes: + self._collection_nodes = self._get_collection_nodes() + return self._collection_nodes + + def _get_collection_nodes(self) -> Sequence[str] | Sequence[int]: + """Retrieve the node ``PK``s/``UUID``s from the collection, filtered by the last dump time, if incremental + dumping is selected. + + :return: List of node identifiers. + """ + if not self.collection: + return [] + + nodes = [n.uuid for n in self.collection.nodes] if isinstance(self.collection, orm.Group) else self.collection + + if self.base_dumper.incremental and self.base_dumper.last_dump_time: + nodes = filter_by_last_dump_time(nodes, last_dump_time=self.base_dumper.last_dump_time) - # TODO: Possibly have `last_dump_time` as attribute of CollectionDumper instead - # nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.last_dump_time) - nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) return nodes - @cached_property + @property def processes_to_dump(self) -> ProcessesToDump: - return self._get_processes_to_dump() + """Get the processes to dump from the collection of nodes. + + :return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows. 
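        A minimal illustrative use (``dumper`` stands for any ``CollectionDumper`` instance; the name is a
        placeholder)::

            to_dump = dumper.processes_to_dump
            if not to_dump.is_empty:
                print(len(to_dump.calculations), len(to_dump.workflows))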
+        """
+        if not self._processes_to_dump:
+            self._processes_to_dump = self._get_processes_to_dump()
+        return self._processes_to_dump

     def _get_processes_to_dump(self) -> ProcessesToDump:
-        nodes = [orm.load_node(n) for n in self.nodes]
-        workflows = [node for node in nodes if isinstance(node, orm.WorkflowNode)]
-        calculations = [node for node in nodes if isinstance(node, orm.CalculationNode)]
+        """Retrieve the processes from the collection nodes.

-        # Make sure that only top-level workflows are dumped in their own directories when de-duplcation is enabled
-        if self.deduplicate:
-            workflows = [workflow for workflow in workflows if workflow.caller is None]
+        If deduplication is selected, this method takes care of only dumping top-level workflows, and of only dumping
+        calculations in their own designated directories if they are not part of a workflow.

-        else:
-            # If no deduplication, also sub-calculations that were called by workflows of the group, and which are not
-            # contained in the group.nodes directly are being dumped explicitly
+        :return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows.
+        """
+
+        if not self.collection_nodes:
+            return ProcessesToDump(calculations=[], workflows=[])
+
+        # Better than: `nodes = [orm.load_node(n) for n in self.collection_nodes]`
+        # As the list comprehension fetches each node from the DB individually
+        nodes_orm = orm.QueryBuilder().append(orm.Node, filters={'uuid': {'in': self.collection_nodes}}).all(flat=True)
+
+        workflows = [node for node in nodes_orm if isinstance(node, orm.WorkflowNode)]
+        calculations = [node for node in nodes_orm if isinstance(node, orm.CalculationNode)]
+
+        # Make sure that only top-level workflows and calculations are dumped
+        workflows = [workflow for workflow in workflows if workflow.caller is None]
+
+        # If `extra_calc_dirs` is enabled, also dump sub-calculations that were called by workflows of the group, but
+        # which are not contained in the group.nodes directly
+        # breakpoint()
+        if self.profile_dump_config.extra_calc_dirs:
             called_calculations = []
             for workflow in workflows:
                 called_calculations += [
                     node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode)
                 ]
-            calculations += called_calculations
+            # Convert to set to avoid duplicates
+            calculations = list(set(calculations + called_calculations))
+        else:
+            calculations = [calculation for calculation in calculations if calculation.caller is None]

         return ProcessesToDump(
             calculations=calculations,
             workflows=workflows,
         )

-    def should_dump_processes(self) -> bool:
-        # if self.processes_to_dump is None:
-        #     self._get_processes_to_dump()
-        return (len(self.processes_to_dump.calculations) + len(self.processes_to_dump.workflows)) > 0
-
     def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> None:
+
+        """Dump a collection of calculations.
+
+        Deduplication is already handled in the ``_get_processes_to_dump`` method, where PKs/UUIDs are used, rather
+        than AiiDA ORM entities as here. Specifically, a calculation that is part of a workflow is not dumped again in
+        its own dedicated directory.
+ + :param calculations: Sequence of ``orm.CalculationNode``s + :return: None + """ + calculations_path = self.output_path / 'calculations' - dumped_calculations = {} + dumped_calculations: dict[str, DumpLog] = {} + + logged_calculations: DumpDict = self.dump_logger.get_log()['calculations'] for calculation in calculations: calculation_dumper = self.process_dumper @@ -138,6 +216,13 @@ def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> Non process_node=calculation, prefix=None ) + if self.profile_dump_config.symlink_duplicates and calculation.uuid in logged_calculations.keys(): + calculation_dump_path.parent.mkdir(exist_ok=True, parents=True) + os.symlink( + src=logged_calculations[calculation.uuid].path, + dst=calculation_dump_path, + ) + # This is handled in the get_processes method: `if calculation.caller is None:` calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) @@ -149,11 +234,16 @@ def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> Non self.dump_logger.update_calculations(new_calculations=dumped_calculations) def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: + """Dump a collection of workflows. + + """ workflow_path: Path = self.output_path / 'workflows' dumped_workflows: dict[str, DumpLog] = {} workflow_path.mkdir(exist_ok=True, parents=True) + logged_workflows: DumpDict = self.dump_logger.get_log()['workflows'] + for workflow in workflows: workflow_dumper: ProcessDumper = self.process_dumper @@ -161,10 +251,10 @@ def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: process_node=workflow, prefix=None ) - logged_workflows: DumpDict = self.dump_logger.get_log()['workflows'] - # Symlink here, if deduplication enabled and workflow was already dumped - if self.deduplicate and workflow in logged_workflows.keys(): + if self.profile_dump_config.symlink_duplicates and workflow.uuid in logged_workflows.keys(): + workflow_dump_path.parent.mkdir(exist_ok=True, parents=True) + os.symlink( src=logged_workflows[workflow.uuid].path, dst=workflow_dump_path, @@ -175,18 +265,110 @@ def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: output_path=workflow_dump_path, ) - dumped_workflows[workflow.uuid] = DumpLog( - path=workflow_dump_path, - time=datetime.now().astimezone(), - ) + dumped_workflows[workflow.uuid] = DumpLog( + path=workflow_dump_path, + time=datetime.now().astimezone(), + ) self.dump_logger.update_workflows(new_workflows=dumped_workflows) def dump(self) -> None: + """Top-level method that actually performs the dumping of the AiiDA data for the collection. 
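        Illustrative call sequence, assuming a stored group labelled ``my-group`` and a writable
        ``dump-target`` directory (both placeholder names)::

            dumper = CollectionDumper(collection='my-group', output_path=Path('dump-target'))
            dumper.dump()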
+ + :return: None + """ + self.output_path.mkdir(exist_ok=True, parents=True) collection_processes: ProcessesToDump = self._get_processes_to_dump() + # breakpoint() + + if not self.processes_to_dump.is_empty: + # self._dump_processes(processes=collection_processes) + + # First, dump workflows, then calculations + if len(collection_processes.workflows) > 0: + # breakpoint() + self._dump_workflows(workflows=collection_processes.workflows) + if len(collection_processes.calculations) > 0: + # breakpoint() + self._dump_calculations(calculations=collection_processes.calculations) + +# TODO: See, if I can generalize the dump sub-methods + # def _dump_processes( + # self, + # # processes: Sequence[orm.CalculationNode | orm.WorkflowNode], + # processes: Sequence[orm.CalculationNode] | Sequence[orm.WorkflowNode], + # ) -> None: + # """Dump a collection of calculations or workflows. + + # :param processes: Sequence of ``orm.CalculationNode``s or ``orm.WorkflowNode``s + # :param process_type: Type of processes, either 'calculations' or 'workflows' + # :return: None + # """ + + # # From, e.g., 'aiida.workflows:core.arithmetic.multiply_add' to 'workflows + # if isinstance(processes[0], orm.CalculationNode): + # process_type_str = 'calculations' + # elif isinstance(processes[0], orm.WorkflowNode): + # process_type_str = 'workflows' + # # else: + # # breakpoint() + # # process_type_str = processes[0].process_type.split(':')[0].split('.')[1] + # process_type_path = self.output_path / process_type_str + # process_type_path.mkdir(exist_ok=True, parents=True) + + # dumped_processes: dict[str, DumpLog] = {} + # logged_processes: DumpDict = self.dump_logger.get_log()[process_type_str] + + # # breakpoint() + + # for process in processes: + # process_dumper = self.process_dumper + + # process_dump_path = process_type_path / process_dumper._generate_default_dump_path( + # process_node=process, prefix=None + # ) + + # # Target directory already exists, skip this process + # if process_dump_path.exists(): + # continue + + # else: + # # Symlink here, if deduplication enabled and process was already dumped + # # TODO: Possibly check dirs here + # # TODO: Alternatively have method/endpoint to delete one calculation from the dumping + # # TODO: Which would also update the log. + # # Otherwise, one might delete a calculation, maybe because it was wrong, and then it won't be dumped + # # anymore ever. 
+ # if self.deduplicate and process.uuid in logged_processes.keys(): + # try: + # os.symlink( + # src=logged_processes[process.uuid].path, + # dst=process_dump_path, + # ) + # except: + # # raise + # pass + # # breakpoint() + # else: + # if process_type_str == 'calculations': + # process_dumper._dump_calculation(calculation_node=process, output_path=process_dump_path) + # elif process_type_str == 'workflows': + # process_dumper._dump_workflow( + # workflow_node=process, + # output_path=process_dump_path, + # ) + + + # dumped_processes[process.uuid] = DumpLog( + # path=process_dump_path, + # time=datetime.now().astimezone(), + # ) + + # # breakpoint() + + # if process_type_str == 'calculations': + # self.dump_logger.update_calculations(new_calculations=dumped_processes) + # elif process_type_str == 'workflows': + # self.dump_logger.update_workflows(new_workflows=dumped_processes) - if len(collection_processes.calculations) > 0: - self._dump_calculations(calculations=collection_processes.calculations) - if len(collection_processes.workflows) > 0: - self._dump_workflows(workflows=collection_processes.workflows) diff --git a/src/aiida/tools/dumping/config.py b/src/aiida/tools/dumping/config.py new file mode 100644 index 0000000000..cd8537ce3b --- /dev/null +++ b/src/aiida/tools/dumping/config.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass + + +@dataclass +class ProfileDumpConfig: + dump_processes: bool = True + symlink_duplicates: bool = True # + delete_missing: bool = False # profile + extra_calc_dirs: bool = False # collection + organize_by_groups: bool = True # profile + diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index f65da5a15e..8a4f962bf2 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -42,6 +42,8 @@ class ProcessDumper: + """Class to handle dumping of an AiiDA process.""" + def __init__( self, base_dumper: BaseDumper | None = None, @@ -52,6 +54,10 @@ def __init__( flat: bool = False, dump_unsealed: bool = False, ) -> None: + """Initialize the CollectionDumper. 
+ + + """ self.include_inputs = include_inputs self.include_outputs = include_outputs self.include_attributes = include_attributes @@ -218,8 +224,7 @@ def dump( # for key, value in kwargs.items(): # setattr(self, key, value) - if output_path is None: - output_path = self._generate_default_dump_path(process_node=process_node) + output_path = output_path or self._generate_default_dump_path(process_node=process_node) prepare_dump_path( path_to_validate=output_path, overwrite=self.base_dumper.overwrite, incremental=self.base_dumper.incremental diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 6b9f33a58e..04374ebe16 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -7,11 +7,12 @@ # For further information please visit http://www.aiida.net # ########################################################################### -# TODO: Use `batch_iter` from aiida.tools.archive.common +# TODO: Possibly use `batch_iter` from aiida.tools.archive.common from __future__ import annotations -from typing import cast +from dataclasses import dataclass +from typing import Sequence, cast from aiida import orm from aiida.common.log import AIIDA_LOGGER @@ -19,6 +20,7 @@ from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.collection import CollectionDumper +from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.utils import filter_by_last_dump_time @@ -27,73 +29,67 @@ class ProfileDumper: + """Class to handle dumping of the data of an AiiDA profile.""" + def __init__( self, profile: str | Profile | None = None, + profile_dump_config: ProfileDumpConfig | None = None, base_dumper: BaseDumper | None = None, process_dumper: ProcessDumper | None = None, dump_logger: DumpLogger | None = None, - organize_by_groups: bool = True, - deduplicate: bool = True, - groups: list[str | orm.Group] | None = None, - dump_processes: bool = True, + # deduplicate: bool = True, + groups: Sequence[str | orm.Group] | None = None, ): - self.organize_by_groups = organize_by_groups - self.deduplicate = deduplicate - self.dump_processes = dump_processes + """Initialize the ProfileDumper. + + :param profile: The selected profile to dump. + :param base_dumper: Base dumper instance or None (gets instantiated). + :param process_dumper: Process dumper instance or None (gets instantiated). + :param dump_logger: Logger for the dumping (gets instantiated). + :param organize_by_groups: Organize dumped data by groups. + :param groups: Dump data only for selected groups. + :param dump_processes: Should dump process data? 
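        Illustrative use, assuming an existing profile named ``my-profile`` (placeholder name)::

            config = ProfileDumpConfig(dump_processes=True, organize_by_groups=True)
            profile_dumper = ProfileDumper(profile='my-profile', profile_dump_config=config)
            profile_dumper.dump_processes()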
+ """ + self.groups = groups self.base_dumper = base_dumper or BaseDumper() self.process_dumper = process_dumper or ProcessDumper() self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) - # Load the profile - if isinstance(profile, str): - profile = load_profile(profile) + self.profile_dump_config = profile_dump_config or ProfileDumpConfig() - if profile is None: - manager = get_manager() - profile = manager.get_profile() - - assert profile is not None + if not isinstance(profile, Profile): + profile = load_profile(profile=profile, allow_switch=True) self.profile = profile - def dump(self): - # No groups selected, dump data which is not part of any group - # If groups selected, however, this data should not also be dumped automatically - if not self.groups: - self._dump_processes_not_in_any_group() - - # Still, even without selecting groups, by default, all profile data should be dumped - # Thus, we obtain all groups in the profile here - profile_groups = orm.QueryBuilder().append(orm.Group).all(flat=True) - self._dump_processes_per_group(groups=profile_groups) - - else: - self._dump_processes_per_group(groups=self.groups) - def _dump_processes_not_in_any_group(self): - # === Dump the data that is not associated with any group === + """Dump the profile's process data not contained in any group.""" - # `dump_parent_path` is set in the `post_init` method of the `BaseDumper` dataclass + # `dump_parent_path` set to CWD in the `post_init` method of the `BaseDumper` dataclass if not given assert self.base_dumper.dump_parent_path is not None - if self.organize_by_groups: + if self.profile_dump_config.organize_by_groups: output_path = self.base_dumper.dump_parent_path / 'no-group' else: output_path = self.base_dumper.dump_parent_path - no_group_nodes = self._get_no_group_nodes() + no_group_nodes = self._get_no_group_processes() no_group_dumper = CollectionDumper( + collection=no_group_nodes, + profile_dump_config=self.profile_dump_config, base_dumper=self.base_dumper, process_dumper=self.process_dumper, - collection=no_group_nodes, - deduplicate=self.deduplicate, + # deduplicate=self.deduplicate, dump_logger=self.dump_logger, output_path=output_path, ) - if self.dump_processes and no_group_dumper.should_dump_processes(): + # Add additional check here to only issue the message when there are actual processes to dump for a group + # This might not be the case for, e.g., pseudopotential groups, or if there are no new processes since the + # last dumping + if self.dump_processes and not no_group_dumper.processes_to_dump.is_empty: logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') no_group_dumper.dump() @@ -104,59 +100,84 @@ def _dump_processes_per_group(self, groups): assert self.base_dumper.dump_parent_path is not None for group in groups: - if self.organize_by_groups: + if self.profile_dump_config.organize_by_groups: output_path = self.base_dumper.dump_parent_path / f'group-{group.label}' else: output_path = self.base_dumper.dump_parent_path group_dumper = CollectionDumper( base_dumper=self.base_dumper, + profile_dump_config=self.profile_dump_config, process_dumper=self.process_dumper, dump_logger=self.dump_logger, collection=group, - deduplicate=self.deduplicate, + # deduplicate=self.deduplicate, output_path=output_path, ) - if self.dump_processes and group_dumper.should_dump_processes(): + # Add additional check here to only issue the message when there are actual processes to dump for a group + # This might not be 
the case for, e.g., pseudopotential groups, or if there are no new processes since the + # last dumping + # breakpoint() + if self.dump_processes and not group_dumper.processes_to_dump.is_empty: + # breakpoint() logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') group_dumper.dump() - def _get_no_group_nodes(self) -> list[str]: - # Get all nodes that are _not_ in any group + def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]: + """Obtain nodes in the profile that are not part of any group. + + :return: List of UUIDs of selected nodes. + """ + group_qb = orm.QueryBuilder().append(orm.Group) - profile_groups = cast(list[orm.Group], group_qb.all(flat=True)) - node_qb = orm.QueryBuilder().append(orm.Node, project=['uuid']) - profile_nodes = cast(list[str], node_qb.all(flat=True)) + profile_groups = cast(Sequence[orm.Group], group_qb.all(flat=True)) + process_qb = orm.QueryBuilder().append(orm.ProcessNode, project=['uuid']) + profile_nodes = cast(Sequence[str], process_qb.all(flat=True)) - nodes_in_groups: list[str] = [node.uuid for group in profile_groups for node in group.nodes] + nodes_in_groups: Sequence[str] = [node.uuid for group in profile_groups for node in group.nodes] # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice # Get the called descendants of WorkflowNodes within the nodes_in_groups list - - sub_nodes_in_groups: list[str] = [ + sub_nodes_in_groups: Sequence[str] = [ node.uuid for n in nodes_in_groups - if isinstance((workflow_node := orm.load_node(n)), orm.WorkflowNode) + # if isinstance((workflow_node := orm.load_node(n)), orm.WorkflowNode) + if isinstance((workflow_node := orm.load_node(n)), orm.ProcessNode) for node in workflow_node.called_descendants ] - # sub_nodes_in_groups: list[str] = [node.uuid for node in sub_nodes_in_groups] nodes_in_groups += sub_nodes_in_groups - nodes: list[str] = [profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups] - nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=self.base_dumper.last_dump_time) + process_nodes: Sequence[str | int] = [ + profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups + ] + process_nodes = filter_by_last_dump_time(nodes=process_nodes, last_dump_time=self.base_dumper.last_dump_time) + + return process_nodes + + def dump_processes(self): + # No groups selected, dump data which is not part of any group + # If groups selected, however, this data should not also be dumped automatically + if not self.groups: + self._dump_processes_not_in_any_group() + + # Still, even without selecting groups, by default, all profile data should be dumped + # Thus, we obtain all groups in the profile here + profile_groups = orm.QueryBuilder().append(orm.Group).all(flat=True) + self._dump_processes_per_group(groups=profile_groups) - return nodes + else: + self._dump_processes_per_group(groups=self.groups) @staticmethod def _get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]: result = {} for node_type in (orm.CalculationNode, orm.WorkflowNode): qb = orm.QueryBuilder().append(node_type, project=['uuid']) - nodes = cast(list[str], qb.all(flat=True)) + nodes = cast(Sequence[str], qb.all(flat=True)) nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=last_dump_time) result[node_type.class_node_type.split('.')[-2] + 's'] = len(nodes) return result diff --git 
a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index 0573fede09..d2f216c539 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -13,6 +13,7 @@ import shutil from datetime import datetime from pathlib import Path +from typing import cast from aiida import orm from aiida.common.log import AIIDA_LOGGER @@ -40,6 +41,8 @@ def prepare_dump_path( `incremental` are enabled. :raises FileNotFoundError: If no `safeguard_file` is found.""" + # TODO: Handle symlinks + if overwrite and incremental: msg = 'Both overwrite and incremental set to True. Only specify one.' raise ValueError(msg) @@ -63,9 +66,16 @@ def prepare_dump_path( safeguard_exists = (path_to_validate / safeguard_file).is_file() if safeguard_exists: + logger.report(path_to_validate) + # breakpoint() msg = '`--overwrite` option selected. Will recreate directory.' logger.report(msg) - shutil.rmtree(path_to_validate) + try: + shutil.rmtree(path_to_validate) + except OSError: + # `shutil.rmtree` fails for symbolic links with + # OSError: Cannot call rmtree on a symbolic link + _delete_dir_recursively(path_to_validate) else: msg = ( @@ -79,20 +89,64 @@ def prepare_dump_path( (path_to_validate / safeguard_file).touch() -def sanitize_file_extension(filename: str | Path): - if isinstance(filename, Path): - filename = str(filename) - if filename.endswith('.mpl_pdf'): - filename = filename.replace('.mpl_pdf', '.pdf') - if filename.endswith('.mpl_png'): - filename = filename.replace('.mpl_png', '.png') - - return Path(filename) - - -def filter_by_last_dump_time(nodes: list[str], last_dump_time: datetime | None = None) -> list[str]: - if last_dump_time is not None: - orm_nodes = [orm.load_node(node) for node in nodes] - return [node.uuid for node in orm_nodes if node.mtime > last_dump_time] - else: +def _delete_dir_recursively(path): + """ + Delete folder, sub-folders and files. + Implementation taken from: https://stackoverflow.com/a/70285390/9431838 + """ + for f in path.glob('**/*'): + if f.is_symlink(): + f.unlink(missing_ok=True) # missing_ok is added in python 3.8 + elif f.is_file(): + f.unlink() + elif f.is_dir(): + try: + f.rmdir() # delete empty sub-folder + except OSError: # sub-folder is not empty + _delete_dir_recursively(f) # recurse the current sub-folder + except Exception as exception: # capture other exception + print(f'exception name: {exception.__class__.__name__}') + print(f'exception msg: {exception}') + + try: + path.rmdir() # time to delete an empty folder + except NotADirectoryError: + path.unlink() # delete folder even if it is a symlink, linux + except Exception as exception: + print(f'exception name: {exception.__class__.__name__}') + print(f'exception msg: {exception}') + + +def _get_filtered_nodes(nodes: list[str | int], last_dump_time: datetime, key: str = 'uuid') -> list[str | int]: + """Helper function to get ``orm.Node``s from the DB based on ``id``/``uuid`` and filter by ``mtime``. + + :param nodes: Collection of node PKs or UUIDs + :param last_dump_time: Last time nodes were dumped to disk. + :param key: Identifier to obtain nodes with, either ``id`` or ``uuid``. + :return: List of nodes filtered by ``last_dump_time``. 
+ """ + + qb = orm.QueryBuilder().append(orm.Node, filters={key: {'in': nodes}}) + nodes_orm: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True)) + return [getattr(node, key) for node in nodes_orm if node.mtime > last_dump_time] + + +def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) -> list[str | int]: + """Filter a list of nodes by the last dump time of the corresponding dumper. + + :param nodes: A list of node identifiers, which can be either UUIDs (str) or IDs (int). + :param last_dump_time: Only include nodes dumped after this timestamp. + :return: A list of node identifiers that have a dump time after the specified last_dump_time. + """ + + # TODO: Possibly directly use QueryBuilder filter. Though, `nodes` directly accessible from orm.Group.nodes + + if not nodes or last_dump_time is None: return nodes + + key = 'uuid' if isinstance(nodes[0], str) else 'id' + return _get_filtered_nodes( + nodes=nodes, + last_dump_time=last_dump_time, + key=key, + ) diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py index 5ad3ddd01b..6b79dd1195 100644 --- a/tests/tools/dumping/test_collection.py +++ b/tests/tools/dumping/test_collection.py @@ -1,4 +1,3 @@ -########################################################################### # Copyright (c), The AiiDA team. All rights reserved. # # This file is part of the AiiDA code. # # # @@ -17,7 +16,7 @@ import pytest from aiida import orm -from aiida.tools.dumping import CollectionDumper, collection +from aiida.tools.dumping import CollectionDumper from .test_utils import compare_tree @@ -28,7 +27,7 @@ # generate_calculation_node_add_class() # You can also do any additional setup here -@pytest.mark.usefixtures('aiida_profile_clean') +# @pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_no_process_group() -> orm.Group: no_process_group, _ = orm.Group.collection.get_or_create(label='no-process') @@ -38,7 +37,7 @@ def setup_no_process_group() -> orm.Group: return no_process_group -@pytest.mark.usefixtures('aiida_profile_clean') +# @pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_add_group(generate_calculation_node_add) -> orm.Group: add_group, _ = orm.Group.collection.get_or_create(label='add') @@ -48,7 +47,7 @@ def setup_add_group(generate_calculation_node_add) -> orm.Group: return add_group -@pytest.mark.usefixtures('aiida_profile_clean') +# @pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add') @@ -58,7 +57,7 @@ def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: return multiply_add_group -@pytest.mark.usefixtures('aiida_profile_clean') +# @pytest.mark.usefixtures('aiida_profile_clean') @pytest.fixture() def duplicate_group(): def _duplicate_group(source_group: orm.Group, dest_group_label: str): @@ -69,67 +68,68 @@ def _duplicate_group(source_group: orm.Group, dest_group_label: str): return _duplicate_group -@pytest.mark.usefixtures('aiida_profile_clean_class') +# @pytest.mark.usefixtures('aiida_profile_clean_class') class TestCollectionDumper: - def test_should_dump_processes(self, setup_no_process_group, setup_add_group): - """""" - no_process_group: orm.Group = setup_no_process_group - add_group: orm.Group = setup_add_group - - collection_dumper = CollectionDumper(collection=no_process_group) - - assert 
collection_dumper.should_dump_processes() is False + # @pytest.mark.usefixtures('aiida_profile_clean') + # def test_should_dump_processes(self, setup_no_process_group, setup_add_group): + # """""" + # no_process_group: orm.Group = setup_no_process_group + # add_group: orm.Group = setup_add_group - collection_dumper = CollectionDumper(collection=add_group) + # collection_dumper = CollectionDumper(collection=no_process_group) - assert collection_dumper.should_dump_processes() is True + # assert collection_dumper._should_dump_processes() is False + # collection_dumper = CollectionDumper(collection=add_group) - def test_get_nodes_add_group(self, setup_add_group): + # assert collection_dumper._should_dump_processes() is True + @pytest.mark.usefixtures('aiida_profile_clean') + def test_resolve_collection_nodes(self, setup_add_group, generate_calculation_node_add): add_group: orm.Group = setup_add_group + add_nodes = add_group.nodes - collection_dumper = CollectionDumper(collection=add_group) + add_dumper = CollectionDumper(collection=add_group) - nodes = collection_dumper._get_nodes() + nodes = add_dumper._get_collection_nodes() assert len(nodes) == 1 - # add_group: orm.Group = setup_add_group - - # collection_dumper = CollectionDumper(collection=add_group) - # nodes = collection_dumper._get_nodes() - # group_node = orm.load_node(nodes[0]) - # group_node_uuid = nodes[0] - - # assert len(nodes) == 1 - # assert isinstance(nodes[0], str) - # assert isinstance(group_node, orm.CalcJobNode) - # assert nodes[0] == group_node_uuid - - # # Now, add another CalcJobNode to the profile - # # As not part of the group, should not be returned - # cj_node1 = generate_calculation_node_add() - # nodes = collection_dumper._get_nodes() - # assert len(nodes) == 1 + assert isinstance(nodes[0], str) + assert nodes[0] == add_nodes[0].uuid + assert isinstance(orm.load_node(nodes[0]), orm.CalcJobNode) + + # Now, add another CalcJobNode to the profile + # As not part of the group, should not be returned + # Also, last_dump_time is None here by default, so no filtering applied + # Still contains the previous node in the returned collection + cj_node1 = generate_calculation_node_add() + nodes = add_dumper._get_collection_nodes() + assert len(nodes) == 1 + assert isinstance(nodes[0], str) + assert nodes[0] == add_nodes[0].uuid + assert isinstance(orm.load_node(nodes[0]), orm.CalcJobNode) - # # Now, add the node to the group, should be captured by get_nodes - # add_group.add_nodes([cj_node1]) - # nodes = collection_dumper._get_nodes() - # assert len(nodes) == 2 + # Now, add the node to the group, should be captured by get_nodes + add_group.add_nodes([cj_node1]) + nodes = add_dumper._get_collection_nodes() + assert len(nodes) == 2 + assert set(nodes) == set([add_nodes[0].uuid, cj_node1.uuid]) - # # Filtering by time should work - # collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone() + # Filtering by time should work -> Now, only cj_node2 gets returned + add_dumper.base_dumper.last_dump_time = datetime.now().astimezone() - # cj_node2 = generate_calculation_node_add() - # add_group.add_nodes([cj_node2]) + cj_node2 = generate_calculation_node_add() + add_group.add_nodes([cj_node2]) - # nodes = collection_dumper._get_nodes() - # assert len(nodes) == 1 - # assert nodes[0] == cj_node2.uuid + nodes = add_dumper._get_collection_nodes() + assert len(nodes) == 1 + assert nodes[0] == cj_node2.uuid - # with pytest.raises(TypeError): - # collection_dumper = CollectionDumper(collection=[1]) - # 
collection_dumper._get_nodes() + for invalid_collection in [{'foo': 'bar'}, [1.0, 1.1]]: + collection_dumper = CollectionDumper(collection=invalid_collection) + with pytest.raises(ValueError): + collection_dumper._get_collection_nodes() + @pytest.mark.usefixtures('aiida_profile_clean') def test_get_processes_to_dump(self, setup_add_group, setup_multiply_add_group, duplicate_group): add_group: orm.Group = setup_add_group multiply_add_group: orm.Group = setup_multiply_add_group @@ -154,21 +154,15 @@ def test_get_processes_to_dump(self, setup_add_group, setup_multiply_add_group, # TODO: Test here also de-duplication with a Workflow with a sub-workflow - def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_path): + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_calculations_add(self, setup_add_group, tmp_path): add_group: orm.Group = setup_add_group - multiply_add_group: orm.Group = setup_multiply_add_group - - add_group_path = Path('add_group') - multiply_add_group_path = Path('multiply_add_group') - - add_dumper = CollectionDumper(collection=add_group, output_path=tmp_path / add_group_path) - multiply_add_dumper = CollectionDumper( - collection=multiply_add_group, output_path=tmp_path / multiply_add_group_path - ) + add_group_label = add_group.label + add_group_path = tmp_path / add_group_label - add_processes_to_dump = add_dumper._get_processes_to_dump() + add_dumper = CollectionDumper(collection=add_group, output_path=add_group_path) - add_dumper._dump_calculations(add_processes_to_dump.calculations) + add_dumper._dump_calculations(add_dumper._get_processes_to_dump().calculations) expected_tree = { 'calculations': { @@ -182,39 +176,44 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ compare_tree(expected=expected_tree, base_path=tmp_path, relative_path=add_group_path) - multiply_add_processes_to_dump = multiply_add_dumper._get_processes_to_dump() + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_calculations_multiply_add(self, setup_multiply_add_group, tmp_path): + multiply_add_group: orm.Group = setup_multiply_add_group + multiply_add_group_label = multiply_add_group.label + multiply_add_group_path = tmp_path / multiply_add_group_label - # No calculations to dump when deduplication is enabled - multiply_add_dumper._dump_calculations(multiply_add_processes_to_dump.calculations) - multiply_add_test_path: Path = multiply_add_group_path / 'calculations' + multiply_add_dumper = CollectionDumper(collection=multiply_add_group, output_path=multiply_add_group_path) - assert not multiply_add_test_path.exists() + # No calculations to dump when deduplication is enabled + multiply_add_dumper._dump_calculations(multiply_add_dumper._get_processes_to_dump().calculations) + assert not (multiply_add_group_path / 'calculations').exists() + # Now, disable de-duplication -> Should dump calculations multiply_add_dumper_no_dedup = CollectionDumper( collection=multiply_add_group, output_path=multiply_add_group_path, deduplicate=False ) - multiply_add_processes_to_dump = multiply_add_dumper_no_dedup._get_processes_to_dump() - # calculations to dump when deduplication is enabled - multiply_add_dumper_no_dedup._dump_calculations(multiply_add_processes_to_dump.calculations) + multiply_add_dumper_no_dedup._dump_calculations( + multiply_add_dumper_no_dedup._get_processes_to_dump().calculations + ) expected_tree_no_dedup = { 'calculations': { - 'ArithmeticAddCalculation-15': { + 'ArithmeticAddCalculation-8': { 'inputs': 
['_aiidasubmit.sh', 'aiida.in'], 'node_inputs': [], 'outputs': ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'], }, - 'multiply-13': { + 'multiply-6': { 'inputs': ['source_file'], 'node_inputs': [], }, } } - compare_tree(expected=expected_tree_no_dedup, base_path=tmp_path, relative_path=multiply_add_group_path) + compare_tree(expected=expected_tree_no_dedup, base_path=tmp_path, relative_path=Path(multiply_add_group_label)) - pytest.set_trace() + # pytest.set_trace() # def test_dump_workflows(self): # pass @@ -261,4 +260,4 @@ def test_dump_calculations(self, setup_add_group, setup_multiply_add_group, tmp_ # with pytest.raises(TypeError): # collection_dumper = CollectionDumper(collection=[1]) - # collection_dumper._get_nodes() \ No newline at end of file + # collection_dumper._get_nodes() From c83cc063ab037dc7c58f10620d01eac71439db64 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:57:08 +0000 Subject: [PATCH 24/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/cmdline/commands/cmd_profile.py | 11 +- src/aiida/common/utils.py | 2 - src/aiida/repository/repository.py | 2 +- src/aiida/tools/dumping/collection.py | 166 +++++++++++----------- src/aiida/tools/dumping/config.py | 3 +- src/aiida/tools/dumping/process.py | 5 +- src/aiida/tools/dumping/profile.py | 3 +- 7 files changed, 92 insertions(+), 100 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index e616debd06..ada34c7657 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -288,17 +288,20 @@ def profile_delete(force, delete_data, profiles): '--symlink-duplicates/--no-symlink-duplicates', default=True, show_default=True, - help='Symlink data if the same node is contained in multiple groups.') + help='Symlink data if the same node is contained in multiple groups.', +) @click.option( '--delete-missing/--no-delete-missing', default=False, show_default=True, - help="If a previously dumped node is deleted from AiiDA's DB, also delete the corresponding dump directory.") + help="If a previously dumped node is deleted from AiiDA's DB, also delete the corresponding dump directory.", +) @click.option( '--extra-calc-dirs/--no-extra-calc-dirs', default=False, show_default=True, - help='If a top-level process calls sub-processes, create a designated directory only for the top-level process.') + help='If a top-level process calls sub-processes, create a designated directory only for the top-level process.', +) @options.INCLUDE_INPUTS() @options.INCLUDE_OUTPUTS() @options.INCLUDE_ATTRIBUTES() @@ -331,9 +334,9 @@ def profile_mirror( from aiida.tools.dumping import ProcessDumper, ProfileDumper from aiida.tools.dumping.base import BaseDumper + from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.utils import prepare_dump_path - from aiida.tools.dumping.config import ProfileDumpConfig profile = ctx.obj['profile'] diff --git a/src/aiida/common/utils.py b/src/aiida/common/utils.py index 8cd1046dfb..1b2f2b14ce 100644 --- a/src/aiida/common/utils.py +++ b/src/aiida/common/utils.py @@ -17,8 +17,6 @@ from datetime import datetime from typing import Any, Dict from uuid import UUID -from aiida.manage import get_manager, load_profile -from aiida.manage.configuration.profile import Profile from .lang import 
classproperty diff --git a/src/aiida/repository/repository.py b/src/aiida/repository/repository.py index 32351ddeef..a332d4ded3 100644 --- a/src/aiida/repository/repository.py +++ b/src/aiida/repository/repository.py @@ -519,7 +519,7 @@ def copy_tree(self, target: Union[str, pathlib.Path], path: Optional[FilePath] = dirpath.mkdir(parents=True, exist_ok=True) with self.open(root / filename) as handle: - # TODO: Possibly skip + # TODO: Possibly skip filepath.write_bytes(handle.read()) # these methods are not actually used in aiida-core, but are here for completeness diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 56352b4574..dec7f532d9 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -10,7 +10,6 @@ from __future__ import annotations -from dataclasses import dataclass import os from datetime import datetime from pathlib import Path @@ -20,11 +19,10 @@ from aiida.common.exceptions import NotExistent from aiida.common.log import AIIDA_LOGGER from aiida.tools.dumping.base import BaseDumper +from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLog, DumpLogger from aiida.tools.dumping.process import ProcessDumper from aiida.tools.dumping.utils import filter_by_last_dump_time -from aiida.tools.dumping.config import ProfileDumpConfig -from typing import Literal if TYPE_CHECKING: from collections.abc import Sequence @@ -53,6 +51,7 @@ def is_empty(self) -> bool: # extra_calc_dirs: bool = False # organize_by_groups: bool = True + class CollectionDumper: """Class to handle dumping of a collection of AiiDA ORM entities.""" @@ -193,7 +192,6 @@ def _get_processes_to_dump(self) -> ProcessesToDump: ) def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> None: - """Dump a collection of calculations. Deduplication is already handled in the ``get_processes`` method, where PKs/UUIDs are used, rather than AiiDA @@ -234,9 +232,7 @@ def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> Non self.dump_logger.update_calculations(new_calculations=dumped_calculations) def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: - """Dump a collection of workflows. - - """ + """Dump a collection of workflows.""" workflow_path: Path = self.output_path / 'workflows' dumped_workflows: dict[str, DumpLog] = {} @@ -293,82 +289,82 @@ def dump(self) -> None: # breakpoint() self._dump_calculations(calculations=collection_processes.calculations) -# TODO: See, if I can generalize the dump sub-methods - # def _dump_processes( - # self, - # # processes: Sequence[orm.CalculationNode | orm.WorkflowNode], - # processes: Sequence[orm.CalculationNode] | Sequence[orm.WorkflowNode], - # ) -> None: - # """Dump a collection of calculations or workflows. 
- - # :param processes: Sequence of ``orm.CalculationNode``s or ``orm.WorkflowNode``s - # :param process_type: Type of processes, either 'calculations' or 'workflows' - # :return: None - # """ - - # # From, e.g., 'aiida.workflows:core.arithmetic.multiply_add' to 'workflows - # if isinstance(processes[0], orm.CalculationNode): - # process_type_str = 'calculations' - # elif isinstance(processes[0], orm.WorkflowNode): - # process_type_str = 'workflows' - # # else: - # # breakpoint() - # # process_type_str = processes[0].process_type.split(':')[0].split('.')[1] - # process_type_path = self.output_path / process_type_str - # process_type_path.mkdir(exist_ok=True, parents=True) - - # dumped_processes: dict[str, DumpLog] = {} - # logged_processes: DumpDict = self.dump_logger.get_log()[process_type_str] - - # # breakpoint() - - # for process in processes: - # process_dumper = self.process_dumper - - # process_dump_path = process_type_path / process_dumper._generate_default_dump_path( - # process_node=process, prefix=None - # ) - - # # Target directory already exists, skip this process - # if process_dump_path.exists(): - # continue - - # else: - # # Symlink here, if deduplication enabled and process was already dumped - # # TODO: Possibly check dirs here - # # TODO: Alternatively have method/endpoint to delete one calculation from the dumping - # # TODO: Which would also update the log. - # # Otherwise, one might delete a calculation, maybe because it was wrong, and then it won't be dumped - # # anymore ever. - # if self.deduplicate and process.uuid in logged_processes.keys(): - # try: - # os.symlink( - # src=logged_processes[process.uuid].path, - # dst=process_dump_path, - # ) - # except: - # # raise - # pass - # # breakpoint() - # else: - # if process_type_str == 'calculations': - # process_dumper._dump_calculation(calculation_node=process, output_path=process_dump_path) - # elif process_type_str == 'workflows': - # process_dumper._dump_workflow( - # workflow_node=process, - # output_path=process_dump_path, - # ) - - - # dumped_processes[process.uuid] = DumpLog( - # path=process_dump_path, - # time=datetime.now().astimezone(), - # ) - - # # breakpoint() - - # if process_type_str == 'calculations': - # self.dump_logger.update_calculations(new_calculations=dumped_processes) - # elif process_type_str == 'workflows': - # self.dump_logger.update_workflows(new_workflows=dumped_processes) +# TODO: See, if I can generalize the dump sub-methods +# def _dump_processes( +# self, +# # processes: Sequence[orm.CalculationNode | orm.WorkflowNode], +# processes: Sequence[orm.CalculationNode] | Sequence[orm.WorkflowNode], +# ) -> None: +# """Dump a collection of calculations or workflows. 
+ +# :param processes: Sequence of ``orm.CalculationNode``s or ``orm.WorkflowNode``s +# :param process_type: Type of processes, either 'calculations' or 'workflows' +# :return: None +# """ + +# # From, e.g., 'aiida.workflows:core.arithmetic.multiply_add' to 'workflows +# if isinstance(processes[0], orm.CalculationNode): +# process_type_str = 'calculations' +# elif isinstance(processes[0], orm.WorkflowNode): +# process_type_str = 'workflows' +# # else: +# # breakpoint() +# # process_type_str = processes[0].process_type.split(':')[0].split('.')[1] +# process_type_path = self.output_path / process_type_str +# process_type_path.mkdir(exist_ok=True, parents=True) + +# dumped_processes: dict[str, DumpLog] = {} +# logged_processes: DumpDict = self.dump_logger.get_log()[process_type_str] + +# # breakpoint() + +# for process in processes: +# process_dumper = self.process_dumper + +# process_dump_path = process_type_path / process_dumper._generate_default_dump_path( +# process_node=process, prefix=None +# ) + +# # Target directory already exists, skip this process +# if process_dump_path.exists(): +# continue + +# else: +# # Symlink here, if deduplication enabled and process was already dumped +# # TODO: Possibly check dirs here +# # TODO: Alternatively have method/endpoint to delete one calculation from the dumping +# # TODO: Which would also update the log. +# # Otherwise, one might delete a calculation, maybe because it was wrong, and then it won't be dumped +# # anymore ever. +# if self.deduplicate and process.uuid in logged_processes.keys(): +# try: +# os.symlink( +# src=logged_processes[process.uuid].path, +# dst=process_dump_path, +# ) +# except: +# # raise +# pass +# # breakpoint() +# else: +# if process_type_str == 'calculations': +# process_dumper._dump_calculation(calculation_node=process, output_path=process_dump_path) +# elif process_type_str == 'workflows': +# process_dumper._dump_workflow( +# workflow_node=process, +# output_path=process_dump_path, +# ) + + +# dumped_processes[process.uuid] = DumpLog( +# path=process_dump_path, +# time=datetime.now().astimezone(), +# ) + +# # breakpoint() + +# if process_type_str == 'calculations': +# self.dump_logger.update_calculations(new_calculations=dumped_processes) +# elif process_type_str == 'workflows': +# self.dump_logger.update_workflows(new_workflows=dumped_processes) diff --git a/src/aiida/tools/dumping/config.py b/src/aiida/tools/dumping/config.py index cd8537ce3b..09da896ed5 100644 --- a/src/aiida/tools/dumping/config.py +++ b/src/aiida/tools/dumping/config.py @@ -4,8 +4,7 @@ @dataclass class ProfileDumpConfig: dump_processes: bool = True - symlink_duplicates: bool = True # + symlink_duplicates: bool = True delete_missing: bool = False # profile extra_calc_dirs: bool = False # collection organize_by_groups: bool = True # profile - diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py index 8a4f962bf2..617c475bf6 100644 --- a/src/aiida/tools/dumping/process.py +++ b/src/aiida/tools/dumping/process.py @@ -54,10 +54,7 @@ def __init__( flat: bool = False, dump_unsealed: bool = False, ) -> None: - """Initialize the CollectionDumper. 
- - - """ + """Initialize the CollectionDumper.""" self.include_inputs = include_inputs self.include_outputs = include_outputs self.include_attributes = include_attributes diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 04374ebe16..bb76d3df6c 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -11,12 +11,11 @@ from __future__ import annotations -from dataclasses import dataclass from typing import Sequence, cast from aiida import orm from aiida.common.log import AIIDA_LOGGER -from aiida.manage import get_manager, load_profile +from aiida.manage import load_profile from aiida.manage.configuration.profile import Profile from aiida.tools.dumping.base import BaseDumper from aiida.tools.dumping.collection import CollectionDumper From ea28a519003fa1a57434867a0b2edcb9c3202582 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 11 Feb 2025 10:12:33 +0100 Subject: [PATCH 25/27] WIP: Dir deletion on node deletion for mirror. --- src/aiida/cmdline/commands/cmd_profile.py | 4 + src/aiida/tools/dumping/collection.py | 38 +++---- src/aiida/tools/dumping/profile.py | 101 ++++++++++++++++-- src/aiida/tools/dumping/utils.py | 122 +++++++++++++++------- 4 files changed, 197 insertions(+), 68 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index ada34c7657..7ac3872e51 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -418,6 +418,10 @@ def profile_mirror( ) profile_dumper.dump_processes() + profile_dumper.delete_processes() + + if delete_missing: + profile_dumper._get_processes_to_delete() # Append the current time to the file last_dump_time = datetime.now().astimezone() diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index dec7f532d9..468b638fc8 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -22,7 +22,7 @@ from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLog, DumpLogger from aiida.tools.dumping.process import ProcessDumper -from aiida.tools.dumping.utils import filter_by_last_dump_time +from aiida.tools.dumping.utils import filter_by_last_dump_time, extend_calculations if TYPE_CHECKING: from collections.abc import Sequence @@ -119,7 +119,7 @@ def collection_nodes(self) -> Sequence[str] | Sequence[int]: :return: List of collection node identifiers. 
""" - if not self._collection_nodes: + if self._collection_nodes is None: self._collection_nodes = self._get_collection_nodes() return self._collection_nodes @@ -165,26 +165,11 @@ def _get_processes_to_dump(self) -> ProcessesToDump: # As the list comprehension fetches each node from the DB individually nodes_orm = orm.QueryBuilder().append(orm.Node, filters={'uuid': {'in': self.collection_nodes}}).all(flat=True) - workflows = [node for node in nodes_orm if isinstance(node, orm.WorkflowNode)] - calculations = [node for node in nodes_orm if isinstance(node, orm.CalculationNode)] + workflows = [node for node in nodes_orm if isinstance(node, orm.WorkflowNode) and node.caller is None] + calculations = [node for node in nodes_orm if isinstance(node, orm.CalculationNode) and node.caller is None] - # Make sure that only top-level workflows and calculations are dumped - workflows = [workflow for workflow in workflows if workflow.caller is None] - - # If sub-calculations that were called by workflows of the group, and which are not - # contained in the group.nodes directly are being dumped explicitly - # breakpoint() if self.profile_dump_config.extra_calc_dirs: - called_calculations = [] - for workflow in workflows: - called_calculations += [ - node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) - ] - - # Convert to set to avoid duplicates - calculations = list(set(calculations + called_calculations)) - else: - calculations = [calculation for calculation in calculations if calculation.caller is None] + calculations = extend_calculations(profile_dump_config=self.profile_dump_config, calculations=calculations, workflows=workflows) return ProcessesToDump( calculations=calculations, @@ -222,12 +207,15 @@ def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> Non ) # This is handled in the get_processes method: `if calculation.caller is None:` - calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) + else: + # TODO: Don't update the logger with the UUID of a symlinked calculation as keys must be unique + # TODO: Possibly add another `symlink` attribute to `DumpLog` which can hold a list of symlinks + calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - dumped_calculations[calculation.uuid] = DumpLog( - path=calculation_dump_path, - time=datetime.now().astimezone(), - ) + dumped_calculations[calculation.uuid] = DumpLog( + path=calculation_dump_path, + time=datetime.now().astimezone(), + ) self.dump_logger.update_calculations(new_calculations=dumped_calculations) diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index bb76d3df6c..9e9fc060ba 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -22,7 +22,7 @@ from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper -from aiida.tools.dumping.utils import filter_by_last_dump_time +from aiida.tools.dumping.utils import filter_by_last_dump_time, _safe_delete logger = AIIDA_LOGGER.getChild('tools.dumping') @@ -48,7 +48,6 @@ def __init__( :param dump_logger: Logger for the dumping (gets instantiated). :param organize_by_groups: Organize dumped data by groups. :param groups: Dump data only for selected groups. - :param dump_processes: Should dump process data? 
""" self.groups = groups @@ -63,6 +62,9 @@ def __init__( profile = load_profile(profile=profile, allow_switch=True) self.profile = profile + self._processes_to_dump: Sequence[str] | None = None + self._processes_to_delete: Sequence[str] | None = None + def _dump_processes_not_in_any_group(self): """Dump the profile's process data not contained in any group.""" @@ -88,7 +90,7 @@ def _dump_processes_not_in_any_group(self): # Add additional check here to only issue the message when there are actual processes to dump for a group # This might not be the case for, e.g., pseudopotential groups, or if there are no new processes since the # last dumping - if self.dump_processes and not no_group_dumper.processes_to_dump.is_empty: + if self.profile_dump_config.dump_processes and not no_group_dumper.processes_to_dump.is_empty: logger.report(f'Dumping processes not in any group for profile `{self.profile.name}`...') no_group_dumper.dump() @@ -118,7 +120,7 @@ def _dump_processes_per_group(self, groups): # This might not be the case for, e.g., pseudopotential groups, or if there are no new processes since the # last dumping # breakpoint() - if self.dump_processes and not group_dumper.processes_to_dump.is_empty: + if self.profile_dump_config.dump_processes and not group_dumper.processes_to_dump.is_empty: # breakpoint() logger.report(f'Dumping processes in group {group.label} for profile `{self.profile.name}`...') @@ -133,7 +135,7 @@ def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]: group_qb = orm.QueryBuilder().append(orm.Group) profile_groups = cast(Sequence[orm.Group], group_qb.all(flat=True)) process_qb = orm.QueryBuilder().append(orm.ProcessNode, project=['uuid']) - profile_nodes = cast(Sequence[str], process_qb.all(flat=True)) + profile_processes = cast(Sequence[str], process_qb.all(flat=True)) nodes_in_groups: Sequence[str] = [node.uuid for group in profile_groups for node in group.nodes] @@ -151,7 +153,7 @@ def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]: nodes_in_groups += sub_nodes_in_groups process_nodes: Sequence[str | int] = [ - profile_node for profile_node in profile_nodes if profile_node not in nodes_in_groups + profile_node for profile_node in profile_processes if profile_node not in nodes_in_groups ] process_nodes = filter_by_last_dump_time(nodes=process_nodes, last_dump_time=self.base_dumper.last_dump_time) @@ -160,6 +162,8 @@ def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]: def dump_processes(self): # No groups selected, dump data which is not part of any group # If groups selected, however, this data should not also be dumped automatically + # TODO: Maybe populate the `processes_to_dump` property here, even though I don't really need it, as I get the + # TODO: nodes from the specified collection if not self.groups: self._dump_processes_not_in_any_group() @@ -173,6 +177,7 @@ def dump_processes(self): @staticmethod def _get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]: + # TODO: Change this method... 
result = {} for node_type in (orm.CalculationNode, orm.WorkflowNode): qb = orm.QueryBuilder().append(node_type, project=['uuid']) @@ -180,3 +185,87 @@ def _get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]: nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=last_dump_time) result[node_type.class_node_type.split('.')[-2] + 's'] = len(nodes) return result + + @property + def processes_to_dump(self) -> Sequence[str]: + if self._processes_to_dump is None: + self._processes_to_dump = self._get_processes_to_dump() + return self._processes_to_dump + + def _get_processes_to_dump(self) -> Sequence[str]: + + process_qb = ( + orm.QueryBuilder() + .append( + orm.ProcessNode, + project=['uuid'], + filters={'ctime': {'>': self.base_dumper.last_dump_time}} + ) + ) + + profile_processes = cast(Sequence[str], process_qb.all(flat=True)) + + return profile_processes + + @property + def processes_to_delete(self) -> Sequence[str]: + if self._processes_to_delete is None: + self._processes_to_delete = self._get_processes_to_delete() + return self._processes_to_delete + + def _get_processes_to_delete(self) -> Sequence[str]: + + dump_logger = self.dump_logger + log = dump_logger.get_log() + dumped_uuids = set(list(log['calculations'].keys()) + list(log['workflows'].keys())) + # Cannot use QB here because, when deleted, not in the DB anymore + # dumped_qb = orm.QueryBuilder().append(orm.ProcessNode, filters={'uuid': {'in': dumped_uuids}}, project=['uuid']) + # dumped_processes: set[str] = set(cast(list[str], dumped_qb.all(flat=True))) + + # TODO: Possibly filter here since last dump time + # TODO: But it is highly likely that the last dump command with deletion was run a while ago + # TODO: So I cannot filter by last dump time, but should probably take the whole set + profile_qb = orm.QueryBuilder().append(orm.ProcessNode) + profile_processes = set(cast(Sequence[orm.ProcessNode], profile_qb.all(flat=True))) + profile_uuids = set([process.uuid for process in profile_processes if process.caller is None]) + + to_delete_uuids = list(dumped_uuids - profile_uuids) + + return to_delete_uuids + + def _delete_missing_process_paths(self, to_delete_uuids): + + log = self.dump_logger.get_log() + paths_to_delete = [] + + for to_delete_uuid in to_delete_uuids: + try: + paths_to_delete.append(log['workflows'][to_delete_uuid].path) + except KeyError: + paths_to_delete.append(log['calculations'][to_delete_uuid].path) + except: + raise + + for path_to_delete in paths_to_delete: + _safe_delete( + path_to_validate=path_to_delete, + safeguard_file='.aiida_node_metadata.yaml', + verbose=False + ) + + # breakpoint() + + def delete_processes(self): + + to_dump_processes = self.processes_to_dump + to_delete_processes = self.processes_to_delete + + print(f'TO_DUMP_PROCESSES: {to_dump_processes}') + print(f'TO_DELETE_PROCESSES: {to_delete_processes}') + + breakpoint() + + self._delete_missing_process_paths(to_delete_uuids=to_delete_processes) + + # TODO: Need to also delete entry from the log when I delete the dir + # TODO: Add also logging for node/path deletion diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index d2f216c539..9f7505b1bd 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -18,16 +18,17 @@ from aiida import orm from aiida.common.log import AIIDA_LOGGER -__all__ = ['prepare_dump_path'] +__all__ = ["prepare_dump_path"] -logger = AIIDA_LOGGER.getChild('tools.dumping') +logger = AIIDA_LOGGER.getChild("tools.dumping") def prepare_dump_path( 
path_to_validate: Path, overwrite: bool = False, incremental: bool = True, - safeguard_file: str = '.aiida_node_metadata.yaml', + safeguard_file: str = ".aiida_node_metadata.yaml", + verbose: bool = False, ) -> None: """Create default dumping directory for a given process node and return it as absolute path. @@ -44,11 +45,11 @@ def prepare_dump_path( # TODO: Handle symlinks if overwrite and incremental: - msg = 'Both overwrite and incremental set to True. Only specify one.' + msg = "Both overwrite and incremental set to True. Only specify one." raise ValueError(msg) if path_to_validate.is_file(): - msg = f'A file at the given path `{path_to_validate}` already exists.' + msg = f"A file at the given path `{path_to_validate}` already exists." raise FileExistsError(msg) # Handle existing directory @@ -58,43 +59,67 @@ def prepare_dump_path( # Case 1: Non-empty directory and overwrite is False if not is_empty and not overwrite: if not incremental: - msg = f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' + msg = f"Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled." raise FileExistsError(msg) # Case 2: Non-empty directory, overwrite is True if not is_empty and overwrite: - safeguard_exists = (path_to_validate / safeguard_file).is_file() - - if safeguard_exists: - logger.report(path_to_validate) - # breakpoint() - msg = '`--overwrite` option selected. Will recreate directory.' - logger.report(msg) - try: - shutil.rmtree(path_to_validate) - except OSError: - # `shutil.rmtree` fails for symbolic links with - # OSError: Cannot call rmtree on a symbolic link - _delete_dir_recursively(path_to_validate) - - else: - msg = ( - f'Path `{path_to_validate}` exists without safeguard file `{safeguard_file}`. ' - f'Not removing because path might be a directory not created by AiiDA.' - ) - raise FileNotFoundError(msg) - - # Create directory if it doesn't exist or was removed + _safe_delete( + path_to_validate=path_to_validate, + safeguard_file=safeguard_file, + verbose=verbose, + ) + + # Re-create directory, as both shutil.rmtree and `_delete_dir_recursively` delete the original dir path_to_validate.mkdir(exist_ok=True, parents=True) (path_to_validate / safeguard_file).touch() +def _safe_delete( + path_to_validate: Path, + safeguard_file: str = ".aiida_node_metadata.yaml", + verbose: bool = False, +) -> None: + """Also deletes the top-level directory itself. + """ + + if not path_to_validate.exists(): + return + + is_empty = any(path_to_validate.iterdir()) + if is_empty: + path_to_validate.rmdir() + return + + safeguard_exists = (path_to_validate / safeguard_file).is_file() + + if safeguard_exists: + if verbose: + logger.report(str(path_to_validate)) + msg = "`--overwrite` option selected. Will recreate directory." + logger.report(msg) + try: + _delete_dir_recursively(path_to_validate) + # shutil.rmtree(path_to_validate) + except OSError: + # `shutil.rmtree` fails for symbolic links with + # OSError: Cannot call rmtree on a symbolic link + _delete_dir_recursively(path_to_validate) + + else: + msg = ( + f"Path `{path_to_validate}` exists without safeguard file `{safeguard_file}`. " + f"Not removing because path might be a directory not created by AiiDA." + ) + raise FileNotFoundError(msg) + + def _delete_dir_recursively(path): """ Delete folder, sub-folders and files. 
Implementation taken from: https://stackoverflow.com/a/70285390/9431838 """ - for f in path.glob('**/*'): + for f in path.glob("**/*"): if f.is_symlink(): f.unlink(missing_ok=True) # missing_ok is added in python 3.8 elif f.is_file(): @@ -105,19 +130,21 @@ def _delete_dir_recursively(path): except OSError: # sub-folder is not empty _delete_dir_recursively(f) # recurse the current sub-folder except Exception as exception: # capture other exception - print(f'exception name: {exception.__class__.__name__}') - print(f'exception msg: {exception}') + print(f"exception name: {exception.__class__.__name__}") + print(f"exception msg: {exception}") try: path.rmdir() # time to delete an empty folder except NotADirectoryError: path.unlink() # delete folder even if it is a symlink, linux except Exception as exception: - print(f'exception name: {exception.__class__.__name__}') - print(f'exception msg: {exception}') + print(f"exception name: {exception.__class__.__name__}") + print(f"exception msg: {exception}") -def _get_filtered_nodes(nodes: list[str | int], last_dump_time: datetime, key: str = 'uuid') -> list[str | int]: +def _get_filtered_nodes( + nodes: list[str | int], last_dump_time: datetime, key: str = "uuid" +) -> list[str | int]: """Helper function to get ``orm.Node``s from the DB based on ``id``/``uuid`` and filter by ``mtime``. :param nodes: Collection of node PKs or UUIDs @@ -126,12 +153,14 @@ def _get_filtered_nodes(nodes: list[str | int], last_dump_time: datetime, key: s :return: List of nodes filtered by ``last_dump_time``. """ - qb = orm.QueryBuilder().append(orm.Node, filters={key: {'in': nodes}}) + qb = orm.QueryBuilder().append(orm.Node, filters={key: {"in": nodes}}) nodes_orm: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True)) return [getattr(node, key) for node in nodes_orm if node.mtime > last_dump_time] -def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) -> list[str | int]: +def filter_by_last_dump_time( + nodes: list[str | int], last_dump_time: datetime +) -> list[str | int]: """Filter a list of nodes by the last dump time of the corresponding dumper. :param nodes: A list of node identifiers, which can be either UUIDs (str) or IDs (int). 
@@ -144,9 +173,28 @@ def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) - if not nodes or last_dump_time is None: return nodes - key = 'uuid' if isinstance(nodes[0], str) else 'id' + key = "uuid" if isinstance(nodes[0], str) else "id" return _get_filtered_nodes( nodes=nodes, last_dump_time=last_dump_time, key=key, ) + + +def extend_calculations(profile_dump_config, calculations, workflows): + + # If sub-calculations that were called by workflows of the group, and which are not + # contained in the group.nodes directly are being dumped explicitly + # breakpoint() + called_calculations = [] + for workflow in workflows: + called_calculations += [ + node + for node in workflow.called_descendants + if isinstance(node, orm.CalculationNode) + ] + + # Convert to set to avoid duplicates + calculations = list(set(calculations + called_calculations)) + + return calculations From 862674a3a09c371e93819e0de960208c7702d144 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Feb 2025 09:12:58 +0000 Subject: [PATCH 26/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/tools/dumping/collection.py | 6 ++- src/aiida/tools/dumping/profile.py | 21 ++--------- src/aiida/tools/dumping/utils.py | 53 +++++++++++---------------- 3 files changed, 29 insertions(+), 51 deletions(-) diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index 468b638fc8..cd2ae90186 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -22,7 +22,7 @@ from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLog, DumpLogger from aiida.tools.dumping.process import ProcessDumper -from aiida.tools.dumping.utils import filter_by_last_dump_time, extend_calculations +from aiida.tools.dumping.utils import extend_calculations, filter_by_last_dump_time if TYPE_CHECKING: from collections.abc import Sequence @@ -169,7 +169,9 @@ def _get_processes_to_dump(self) -> ProcessesToDump: calculations = [node for node in nodes_orm if isinstance(node, orm.CalculationNode) and node.caller is None] if self.profile_dump_config.extra_calc_dirs: - calculations = extend_calculations(profile_dump_config=self.profile_dump_config, calculations=calculations, workflows=workflows) + calculations = extend_calculations( + profile_dump_config=self.profile_dump_config, calculations=calculations, workflows=workflows + ) return ProcessesToDump( calculations=calculations, diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py index 9e9fc060ba..db03e2b5cf 100644 --- a/src/aiida/tools/dumping/profile.py +++ b/src/aiida/tools/dumping/profile.py @@ -22,7 +22,7 @@ from aiida.tools.dumping.config import ProfileDumpConfig from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.process import ProcessDumper -from aiida.tools.dumping.utils import filter_by_last_dump_time, _safe_delete +from aiida.tools.dumping.utils import _safe_delete, filter_by_last_dump_time logger = AIIDA_LOGGER.getChild('tools.dumping') @@ -193,14 +193,8 @@ def processes_to_dump(self) -> Sequence[str]: return self._processes_to_dump def _get_processes_to_dump(self) -> Sequence[str]: - - process_qb = ( - orm.QueryBuilder() - .append( - orm.ProcessNode, - project=['uuid'], - filters={'ctime': {'>': self.base_dumper.last_dump_time}} - ) + process_qb = orm.QueryBuilder().append( + 
orm.ProcessNode, project=['uuid'], filters={'ctime': {'>': self.base_dumper.last_dump_time}} ) profile_processes = cast(Sequence[str], process_qb.all(flat=True)) @@ -214,7 +208,6 @@ def processes_to_delete(self) -> Sequence[str]: return self._processes_to_delete def _get_processes_to_delete(self) -> Sequence[str]: - dump_logger = self.dump_logger log = dump_logger.get_log() dumped_uuids = set(list(log['calculations'].keys()) + list(log['workflows'].keys())) @@ -234,7 +227,6 @@ def _get_processes_to_delete(self) -> Sequence[str]: return to_delete_uuids def _delete_missing_process_paths(self, to_delete_uuids): - log = self.dump_logger.get_log() paths_to_delete = [] @@ -247,16 +239,11 @@ def _delete_missing_process_paths(self, to_delete_uuids): raise for path_to_delete in paths_to_delete: - _safe_delete( - path_to_validate=path_to_delete, - safeguard_file='.aiida_node_metadata.yaml', - verbose=False - ) + _safe_delete(path_to_validate=path_to_delete, safeguard_file='.aiida_node_metadata.yaml', verbose=False) # breakpoint() def delete_processes(self): - to_dump_processes = self.processes_to_dump to_delete_processes = self.processes_to_delete diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py index 9f7505b1bd..17a075c59f 100644 --- a/src/aiida/tools/dumping/utils.py +++ b/src/aiida/tools/dumping/utils.py @@ -10,7 +10,6 @@ from __future__ import annotations -import shutil from datetime import datetime from pathlib import Path from typing import cast @@ -18,16 +17,16 @@ from aiida import orm from aiida.common.log import AIIDA_LOGGER -__all__ = ["prepare_dump_path"] +__all__ = ['prepare_dump_path'] -logger = AIIDA_LOGGER.getChild("tools.dumping") +logger = AIIDA_LOGGER.getChild('tools.dumping') def prepare_dump_path( path_to_validate: Path, overwrite: bool = False, incremental: bool = True, - safeguard_file: str = ".aiida_node_metadata.yaml", + safeguard_file: str = '.aiida_node_metadata.yaml', verbose: bool = False, ) -> None: """Create default dumping directory for a given process node and return it as absolute path. @@ -45,11 +44,11 @@ def prepare_dump_path( # TODO: Handle symlinks if overwrite and incremental: - msg = "Both overwrite and incremental set to True. Only specify one." + msg = 'Both overwrite and incremental set to True. Only specify one.' raise ValueError(msg) if path_to_validate.is_file(): - msg = f"A file at the given path `{path_to_validate}` already exists." + msg = f'A file at the given path `{path_to_validate}` already exists.' raise FileExistsError(msg) # Handle existing directory @@ -59,7 +58,7 @@ def prepare_dump_path( # Case 1: Non-empty directory and overwrite is False if not is_empty and not overwrite: if not incremental: - msg = f"Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled." + msg = f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' raise FileExistsError(msg) # Case 2: Non-empty directory, overwrite is True @@ -77,11 +76,10 @@ def prepare_dump_path( def _safe_delete( path_to_validate: Path, - safeguard_file: str = ".aiida_node_metadata.yaml", + safeguard_file: str = '.aiida_node_metadata.yaml', verbose: bool = False, ) -> None: - """Also deletes the top-level directory itself. - """ + """Also deletes the top-level directory itself.""" if not path_to_validate.exists(): return @@ -96,7 +94,7 @@ def _safe_delete( if safeguard_exists: if verbose: logger.report(str(path_to_validate)) - msg = "`--overwrite` option selected. Will recreate directory." 
+ msg = '`--overwrite` option selected. Will recreate directory.' logger.report(msg) try: _delete_dir_recursively(path_to_validate) @@ -108,8 +106,8 @@ def _safe_delete( else: msg = ( - f"Path `{path_to_validate}` exists without safeguard file `{safeguard_file}`. " - f"Not removing because path might be a directory not created by AiiDA." + f'Path `{path_to_validate}` exists without safeguard file `{safeguard_file}`. ' + f'Not removing because path might be a directory not created by AiiDA.' ) raise FileNotFoundError(msg) @@ -119,7 +117,7 @@ def _delete_dir_recursively(path): Delete folder, sub-folders and files. Implementation taken from: https://stackoverflow.com/a/70285390/9431838 """ - for f in path.glob("**/*"): + for f in path.glob('**/*'): if f.is_symlink(): f.unlink(missing_ok=True) # missing_ok is added in python 3.8 elif f.is_file(): @@ -130,21 +128,19 @@ def _delete_dir_recursively(path): except OSError: # sub-folder is not empty _delete_dir_recursively(f) # recurse the current sub-folder except Exception as exception: # capture other exception - print(f"exception name: {exception.__class__.__name__}") - print(f"exception msg: {exception}") + print(f'exception name: {exception.__class__.__name__}') + print(f'exception msg: {exception}') try: path.rmdir() # time to delete an empty folder except NotADirectoryError: path.unlink() # delete folder even if it is a symlink, linux except Exception as exception: - print(f"exception name: {exception.__class__.__name__}") - print(f"exception msg: {exception}") + print(f'exception name: {exception.__class__.__name__}') + print(f'exception msg: {exception}') -def _get_filtered_nodes( - nodes: list[str | int], last_dump_time: datetime, key: str = "uuid" -) -> list[str | int]: +def _get_filtered_nodes(nodes: list[str | int], last_dump_time: datetime, key: str = 'uuid') -> list[str | int]: """Helper function to get ``orm.Node``s from the DB based on ``id``/``uuid`` and filter by ``mtime``. :param nodes: Collection of node PKs or UUIDs @@ -153,14 +149,12 @@ def _get_filtered_nodes( :return: List of nodes filtered by ``last_dump_time``. """ - qb = orm.QueryBuilder().append(orm.Node, filters={key: {"in": nodes}}) + qb = orm.QueryBuilder().append(orm.Node, filters={key: {'in': nodes}}) nodes_orm: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True)) return [getattr(node, key) for node in nodes_orm if node.mtime > last_dump_time] -def filter_by_last_dump_time( - nodes: list[str | int], last_dump_time: datetime -) -> list[str | int]: +def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) -> list[str | int]: """Filter a list of nodes by the last dump time of the corresponding dumper. :param nodes: A list of node identifiers, which can be either UUIDs (str) or IDs (int). 
@@ -173,7 +167,7 @@ def filter_by_last_dump_time( if not nodes or last_dump_time is None: return nodes - key = "uuid" if isinstance(nodes[0], str) else "id" + key = 'uuid' if isinstance(nodes[0], str) else 'id' return _get_filtered_nodes( nodes=nodes, last_dump_time=last_dump_time, @@ -182,17 +176,12 @@ def filter_by_last_dump_time( def extend_calculations(profile_dump_config, calculations, workflows): - # If sub-calculations that were called by workflows of the group, and which are not # contained in the group.nodes directly are being dumped explicitly # breakpoint() called_calculations = [] for workflow in workflows: - called_calculations += [ - node - for node in workflow.called_descendants - if isinstance(node, orm.CalculationNode) - ] + called_calculations += [node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode)] # Convert to set to avoid duplicates calculations = list(set(calculations + called_calculations)) From ac0bf513fa563975e1ced44976b7f1ff0b65b914 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 11 Feb 2025 16:26:16 +0100 Subject: [PATCH 27/27] Various improvements - First version with symlinks and delete-missing works - Turn BaseDumper into BaseDumpConfig dataclass - Merge _dump_calculations and _dump_workflows to _dump_processes - Improve Logger - Typing --- docs/source/reference/command_line.rst | 2 +- src/aiida/cmdline/commands/cmd_process.py | 13 +- src/aiida/cmdline/commands/cmd_profile.py | 39 +-- src/aiida/tools/dumping/__init__.py | 6 +- src/aiida/tools/dumping/base.py | 28 --- src/aiida/tools/dumping/collection.py | 283 +++++++--------------- src/aiida/tools/dumping/config.py | 47 +++- src/aiida/tools/dumping/logger.py | 158 +++++++++--- src/aiida/tools/dumping/process.py | 41 ++-- src/aiida/tools/dumping/profile.py | 144 +++++------ src/aiida/tools/dumping/utils.py | 66 +++-- tests/tools/dumping/test_collection.py | 12 +- tests/tools/dumping/test_process.py | 14 +- 13 files changed, 444 insertions(+), 409 deletions(-) delete mode 100644 src/aiida/tools/dumping/base.py diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst index 283993fac9..ca81f2e421 100644 --- a/docs/source/reference/command_line.rst +++ b/docs/source/reference/command_line.rst @@ -398,7 +398,7 @@ Below is a list with all available subcommands. configure-rabbitmq Configure RabbitMQ for a profile. delete Delete one or more profiles. list Display a list of all available profiles. - mirror Dump all data in an AiiDA profile's storage to disk. + mirror Dump all data in an AiiDA profile's storage to disk in a... set-default Set a profile as the default profile. setdefault (Deprecated) Set a profile as the default profile. setup Set up a new profile. 
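For orientation, a minimal usage sketch of the refactored dumping API, assuming the `BaseDumpConfig`/`ProcessDumpConfig` dataclasses and the updated `ProcessDumper` constructor introduced by this patch series; this is not part of the patch itself, and the profile, node PK, and dump path below are placeholder values:

    from pathlib import Path

    from aiida import load_profile, orm
    from aiida.tools.dumping.config import BaseDumpConfig, ProcessDumpConfig
    from aiida.tools.dumping.process import ProcessDumper

    load_profile()  # load the default AiiDA profile

    # Shared dump settings: parent path and overwrite/incremental behaviour
    base_dump_config = BaseDumpConfig(
        dump_parent_path=Path('dump-example'),
        overwrite=False,
        incremental=True,
    )

    # Process-specific settings: which node contents to include and directory layout
    process_dump_config = ProcessDumpConfig(
        include_inputs=True,
        include_outputs=False,
        flat=False,
    )

    # The dumper is now constructed from the two config dataclasses
    process_dumper = ProcessDumper(
        base_dump_config=base_dump_config,
        process_dump_config=process_dump_config,
    )

    # Placeholder PK: load any sealed process node from the profile and dump it
    process_node = orm.load_node(1234)
    process_dumper.dump(process_node=process_node, output_path=Path('dump-example') / 'my-process')

Grouping the options into dataclasses keeps the `ProcessDumper`, `CollectionDumper`, and `ProfileDumper` constructors consistent, which is what the subsequent diffs to `cmd_process.py` and `cmd_profile.py` rely on.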
diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 395c74de5e..e7054b8649 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -606,23 +606,26 @@ def process_dump( """ from aiida.tools.archive.exceptions import ExportValidationError - from aiida.tools.dumping.base import BaseDumper + from aiida.tools.dumping.config import BaseDumpConfig, ProcessDumpConfig from aiida.tools.dumping.process import ProcessDumper - base_dumper = BaseDumper( + base_dump_config = BaseDumpConfig( dump_parent_path=path, overwrite=overwrite, incremental=incremental, ) - process_dumper = ProcessDumper( - base_dumper=base_dumper, + process_dump_config = ProcessDumpConfig( include_inputs=include_inputs, include_outputs=include_outputs, include_attributes=include_attributes, include_extras=include_extras, flat=flat, - dump_unsealed=dump_unsealed, + ) + + process_dumper = ProcessDumper( + base_dump_config=base_dump_config, + process_dump_config=process_dump_config, ) try: diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index 7ac3872e51..503684d50f 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -19,6 +19,7 @@ from aiida.cmdline.utils import defaults, echo from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config +from aiida.tools.dumping.config import ProcessDumpConfig @verdi.group('profile') @@ -333,8 +334,7 @@ def profile_mirror( from pathlib import Path from aiida.tools.dumping import ProcessDumper, ProfileDumper - from aiida.tools.dumping.base import BaseDumper - from aiida.tools.dumping.config import ProfileDumpConfig + from aiida.tools.dumping.config import BaseDumpConfig, ProfileDumpConfig from aiida.tools.dumping.logger import DumpLogger from aiida.tools.dumping.utils import prepare_dump_path @@ -367,12 +367,12 @@ def profile_mirror( last_dump_time = None if dry_run: - node_counts = ProfileDumper._get_number_of_nodes_to_dump(last_dump_time) + # node_counts = ProfileDumper._get_number_of_nodes_to_dump(last_dump_time) dry_run_message = f'Dry run for mirroring of profile `{profile.name}`. Would dump:' echo.echo_report(dry_run_message) - for count, node_type in node_counts.items(): - echo.echo_report(f'{count}: {node_type}') - return + # for count, node_type in node_counts.items(): + # echo.echo_report(f'{count}: {node_type}') + # return if incremental: msg = 'Incremental mirroring selected. Will update directory.' 
@@ -383,15 +383,14 @@ def profile_mirror( except (json.JSONDecodeError, OSError): dump_logger = DumpLogger(dump_parent_path=path) - base_dumper = BaseDumper( + base_dump_config = BaseDumpConfig( dump_parent_path=path, overwrite=overwrite, incremental=incremental, last_dump_time=last_dump_time, ) - process_dumper = ProcessDumper( - base_dumper=base_dumper, + process_dump_config = ProcessDumpConfig( include_inputs=include_inputs, include_outputs=include_outputs, include_attributes=include_attributes, @@ -399,7 +398,11 @@ def profile_mirror( flat=flat, ) - # breakpoint() + process_dumper = ProcessDumper( + base_dump_config=base_dump_config, + process_dump_config=process_dump_config, + ) + profile_dump_config = ProfileDumpConfig( dump_processes=dump_processes, symlink_duplicates=symlink_duplicates, @@ -411,17 +414,23 @@ def profile_mirror( profile_dumper = ProfileDumper( profile=profile, profile_dump_config=profile_dump_config, - base_dumper=base_dumper, + base_dump_config=base_dump_config, process_dumper=process_dumper, dump_logger=dump_logger, groups=groups, ) - profile_dumper.dump_processes() - profile_dumper.delete_processes() + if len(profile_dumper.processes_to_dump) == 0: + echo.echo_success('No processes to dump.') + else: + profile_dumper.dump_processes() + echo.echo_success('Dumped XXX new nodes.') if delete_missing: - profile_dumper._get_processes_to_delete() + if len(profile_dumper.processes_to_delete) == 0: + echo.echo_success('No processes to delete.') + else: + profile_dumper.delete_processes() # Append the current time to the file last_dump_time = datetime.now().astimezone() @@ -431,4 +440,4 @@ def profile_mirror( # Write the logging json file to disk dump_logger.save_log() - echo.echo_success(f'Dumped {dump_logger.counter} new nodes.') + # echo.echo_success(f'Dumped {dump_logger.counter} new nodes.') diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index 6bc7b9c2c0..8f8c86dcdb 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -8,11 +8,9 @@ ########################################################################### """Modules related to the dumping of AiiDA data.""" -from .base import BaseDumper from .collection import CollectionDumper +from .logger import DumpLogger from .process import ProcessDumper from .profile import ProfileDumper -# from .collection import CollectionDumper - -__all__ = ('BaseDumper', 'CollectionDumper', 'ProcessDumper', 'ProfileDumper') # , 'CollectionDumper') +__all__ = ('CollectionDumper', 'DumpLogger', 'ProcessDumper', 'ProfileDumper') diff --git a/src/aiida/tools/dumping/base.py b/src/aiida/tools/dumping/base.py deleted file mode 100644 index 6bbd5b505e..0000000000 --- a/src/aiida/tools/dumping/base.py +++ /dev/null @@ -1,28 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. 
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### - -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path - - -@dataclass -class BaseDumper: - """Container for shared arguments of all Dumper classes.""" - - dump_parent_path: Path | None = None - overwrite: bool = False - incremental: bool = True - check_dirs: bool = False - # TODO: Make this a per-class attribute? - last_dump_time: datetime | None = None - - def __post_init__(self): - if self.dump_parent_path is None: - self.dump_parent_path = Path.cwd() diff --git a/src/aiida/tools/dumping/collection.py b/src/aiida/tools/dumping/collection.py index cd2ae90186..641c406e9e 100644 --- a/src/aiida/tools/dumping/collection.py +++ b/src/aiida/tools/dumping/collection.py @@ -18,22 +18,19 @@ from aiida import orm from aiida.common.exceptions import NotExistent from aiida.common.log import AIIDA_LOGGER -from aiida.tools.dumping.base import BaseDumper -from aiida.tools.dumping.config import ProfileDumpConfig +from aiida.tools.dumping.config import BaseDumpConfig, ProfileDumpConfig from aiida.tools.dumping.logger import DumpLog, DumpLogger from aiida.tools.dumping.process import ProcessDumper -from aiida.tools.dumping.utils import extend_calculations, filter_by_last_dump_time +from aiida.tools.dumping.utils import NodeDumpMapper, extend_calculations, filter_by_last_dump_time if TYPE_CHECKING: - from collections.abc import Sequence - - from aiida.tools.dumping.logger import DumpDict + from collections.abc import Collection, Sequence logger = AIIDA_LOGGER.getChild('tools.dumping') -class ProcessesToDump(NamedTuple): +class ProcessesDumpContainer(NamedTuple): calculations: Sequence[orm.CalculationNode] workflows: Sequence[orm.WorkflowNode] @@ -43,23 +40,15 @@ def is_empty(self) -> bool: return len(self.calculations) == 0 and len(self.workflows) == 0 -# @dataclass -# class CollectionDumpConfig: -# dump_processes: bool = True -# symlink_duplicates: bool = True -# delete_missing: bool = False -# extra_calc_dirs: bool = False -# organize_by_groups: bool = True - - class CollectionDumper: """Class to handle dumping of a collection of AiiDA ORM entities.""" def __init__( self, - collection: orm.Group | str | Sequence[str] | Sequence[int], + # TODO: Refactor here to different arguments: Group, and collection_nodes + collection: orm.Group | str | Collection[str], profile_dump_config: ProfileDumpConfig | None = None, - base_dumper: BaseDumper | None = None, + base_dump_config: BaseDumpConfig | None = None, process_dumper: ProcessDumper | None = None, dump_logger: DumpLogger | None = None, output_path: Path | None = None, @@ -67,7 +56,7 @@ def __init__( """Initialize the CollectionDumper. :param collection: The collection of AiiDA ORM entities to be dumped, either a group, group label, or list of - :param base_dumper: Base dumper instance or None (gets instantiated). + :param base_dump_config: Base dumper instance or None (gets instantiated). :param process_dumper: Process dumper instance or None (gets instantiated). :param dump_logger: Logger for the dumping (gets instantiated). :param output_path: The parent output path for dumping the collection nodes. 
@@ -76,20 +65,20 @@ def __init__( self.collection = self._validate_collection(collection) - self.base_dumper = base_dumper or BaseDumper() + self.base_dump_config = base_dump_config or BaseDumpConfig() self.process_dumper = process_dumper or ProcessDumper() - self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path) + self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dump_config.dump_parent_path) self.output_path = output_path or Path.cwd() self.profile_dump_config = profile_dump_config or ProfileDumpConfig() - self._collection_nodes: Sequence[str] | Sequence[int] | None = None - self._processes_to_dump: ProcessesToDump | None = None + self._collection_nodes: Collection[str] | None = None + self._processes_to_dump: ProcessesDumpContainer | None = None def _validate_collection( - self, collection: orm.Group | str | Sequence[str] | Sequence[int] - ) -> orm.Group | Sequence[str] | Sequence[int]: + self, collection: orm.Group | str | Collection[str] | Collection[int] + ) -> orm.Group | Collection[str]: """Validate the given collection identifier. :param collection: The input collection to validate. @@ -104,9 +93,23 @@ def _validate_collection( except Exception as exc: msg = f'Could not load group: {collection}.' raise NotExistent(msg) from exc - if (isinstance(collection, list) and all(isinstance(n, (str, int)) for n in collection)) or isinstance( - collection, orm.Group - ): + + elif isinstance(collection, orm.Group): + return collection + + elif isinstance(collection, list): + if all(isinstance(n, str) for n in collection): + return collection + + elif all(isinstance(n, int) for n in collection): + msg = 'Passing node collections via their PK not yet supported.' + raise ValueError(msg) + + else: + msg = 'Mixing identifiers or passing other types not supported' + raise ValueError(msg) + + elif isinstance(collection, list) and all(isinstance(n, int) for n in collection): return collection else: @@ -114,7 +117,7 @@ def _validate_collection( raise ValueError(msg) @property - def collection_nodes(self) -> Sequence[str] | Sequence[int]: + def collection_nodes(self) -> Collection[str]: """Return collection nodes. :return: List of collection node identifiers. @@ -123,24 +126,26 @@ def collection_nodes(self) -> Sequence[str] | Sequence[int]: self._collection_nodes = self._get_collection_nodes() return self._collection_nodes - def _get_collection_nodes(self) -> Sequence[str] | Sequence[int]: - """Retrieve the node ``PK``s/``UUID``s from the collection, filtered by the last dump time, if incremental - dumping is selected. + def _get_collection_nodes(self) -> Collection[str]: + """Retrieve the node UUIDs from the collection, filtered by the last dump time, if for incremental dumping. - :return: List of node identifiers. + :return: List of node UUIDs. 
""" if not self.collection: return [] - nodes = [n.uuid for n in self.collection.nodes] if isinstance(self.collection, orm.Group) else self.collection + if isinstance(self.collection, orm.Group): + nodes: Collection[str] = [n.uuid for n in self.collection.nodes] + else: + nodes = self.collection - if self.base_dumper.incremental and self.base_dumper.last_dump_time: - nodes = filter_by_last_dump_time(nodes, last_dump_time=self.base_dumper.last_dump_time) + if self.base_dump_config.incremental and self.base_dump_config.last_dump_time: + nodes = filter_by_last_dump_time(nodes, last_dump_time=self.base_dump_config.last_dump_time) return nodes @property - def processes_to_dump(self) -> ProcessesToDump: + def processes_to_dump(self) -> ProcessesDumpContainer: """Get the processes to dump from the collection of nodes. :return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows. @@ -149,7 +154,7 @@ def processes_to_dump(self) -> ProcessesToDump: self._processes_to_dump = self._get_processes_to_dump() return self._processes_to_dump - def _get_processes_to_dump(self) -> ProcessesToDump: + def _get_processes_to_dump(self) -> ProcessesDumpContainer: """Retrieve the processeses from the collection nodes. If deduplication is selected, this method takes care of only dumping top-level workflows and only dump @@ -158,8 +163,12 @@ def _get_processes_to_dump(self) -> ProcessesToDump: :return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows. """ + # Deduplication is already handled in the ``get_processes`` method, where PKs/UUIDs are used, rather than AiiDA + # ORM entities as here. Specifically, calculations that are part of a workflow are not dumpid in their own, + # dedicated directory if they are part of a workflow. + if not self.collection_nodes: - return ProcessesToDump(calculations=[], workflows=[]) + return ProcessesDumpContainer(calculations=[], workflows=[]) # Better than: `nodes = [orm.load_node(n) for n in self.collection_nodes]` # As the list comprehension fetches each node from the DB individually @@ -173,91 +182,63 @@ def _get_processes_to_dump(self) -> ProcessesToDump: profile_dump_config=self.profile_dump_config, calculations=calculations, workflows=workflows ) - return ProcessesToDump( + return ProcessesDumpContainer( calculations=calculations, workflows=workflows, ) - def _dump_calculations(self, calculations: Sequence[orm.CalculationNode]) -> None: - """Dump a collection of calculations. - - Deduplication is already handled in the ``get_processes`` method, where PKs/UUIDs are used, rather than AiiDA - ORM entities as here. Specifically, calculations that are part of a workflow are not dumpid in their own, - dedicated directory if they are part of a workflow. - - :param calculations: Sequence of ``orm.CalculationNode``s - :return: None - """ + def _dump_processes(self, processes: Sequence[orm.CalculationNode] | Sequence[orm.WorkflowNode]) -> None: + """Dump a collection of processes.""" - calculations_path = self.output_path / 'calculations' - dumped_calculations: dict[str, DumpLog] = {} + if len(processes) == 0: + return - logged_calculations: DumpDict = self.dump_logger.get_log()['calculations'] + # TODO: Only allow for "pure" sequences of Calculation- or WorkflowNodes, or also mixed? 
+ # TODO: If the latter possibly also have directory creation in the loop + sub_path = self.output_path / NodeDumpMapper.get_directory(node=processes[0]) + sub_path.mkdir(exist_ok=True, parents=True) - for calculation in calculations: - calculation_dumper = self.process_dumper + logger_attr = NodeDumpMapper.get_logger_attr(node=processes[0]) + # ! `getattr` gives a reference to the object, thus I can update the store directly + current_store = getattr(self.dump_logger.log, logger_attr) - calculation_dump_path = calculations_path / calculation_dumper._generate_default_dump_path( - process_node=calculation, prefix=None - ) + # breakpoint() - if self.profile_dump_config.symlink_duplicates and calculation.uuid in logged_calculations.keys(): - calculation_dump_path.parent.mkdir(exist_ok=True, parents=True) - os.symlink( - src=logged_calculations[calculation.uuid].path, - dst=calculation_dump_path, - ) + for process in processes: + process_dumper = self.process_dumper + + process_dump_path = sub_path / process_dumper._generate_default_dump_path(process_node=process, prefix=None) + + if self.profile_dump_config.symlink_duplicates and process.uuid in current_store.entries.keys(): + if process_dump_path.exists(): + continue + else: + process_dump_path.parent.mkdir(exist_ok=True, parents=True) + # breakpoint() + try: + os.symlink( + src=current_store.entries[process.uuid].path, + dst=process_dump_path, + ) + # TODO: If this works here, call `add_link` to the DumpLog to extend an existing DumpLog + except FileExistsError: + pass - # This is handled in the get_processes method: `if calculation.caller is None:` else: # TODO: Don't update the logger with the UUID of a symlinked calculation as keys must be unique # TODO: Possibly add another `symlink` attribute to `DumpLog` which can hold a list of symlinks - calculation_dumper._dump_calculation(calculation_node=calculation, output_path=calculation_dump_path) - - dumped_calculations[calculation.uuid] = DumpLog( - path=calculation_dump_path, - time=datetime.now().astimezone(), - ) - - self.dump_logger.update_calculations(new_calculations=dumped_calculations) - - def _dump_workflows(self, workflows: Sequence[orm.WorkflowNode]) -> None: - """Dump a collection of workflows.""" - workflow_path: Path = self.output_path / 'workflows' - dumped_workflows: dict[str, DumpLog] = {} + # TODO: Ignore for now, as I would need to retrieve the list of links, append to it, and assign again - workflow_path.mkdir(exist_ok=True, parents=True) + # process_dumper._dump_calculation(calculation_node=process, output_path=process_dump_path) + # ! TODO: Add DumpLogger here, such that sub-calculations of workflows are also registered in the + # ! 
dumping, otherwise they end up duplicated, as the registration is done here in the for loop + process_dumper.dump(process_node=process, output_path=process_dump_path) - logged_workflows: DumpDict = self.dump_logger.get_log()['workflows'] - - for workflow in workflows: - workflow_dumper: ProcessDumper = self.process_dumper - - workflow_dump_path: Path = workflow_path / workflow_dumper._generate_default_dump_path( - process_node=workflow, prefix=None + current_store.add_entry( + uuid=process.uuid, + entry=DumpLog(path=process_dump_path, time=datetime.now().astimezone()), ) - # Symlink here, if deduplication enabled and workflow was already dumped - if self.profile_dump_config.symlink_duplicates and workflow.uuid in logged_workflows.keys(): - workflow_dump_path.parent.mkdir(exist_ok=True, parents=True) - - os.symlink( - src=logged_workflows[workflow.uuid].path, - dst=workflow_dump_path, - ) - else: - workflow_dumper._dump_workflow( - workflow_node=workflow, - output_path=workflow_dump_path, - ) - - dumped_workflows[workflow.uuid] = DumpLog( - path=workflow_dump_path, - time=datetime.now().astimezone(), - ) - - self.dump_logger.update_workflows(new_workflows=dumped_workflows) - def dump(self) -> None: """Top-level method that actually performs the dumping of the AiiDA data for the collection. @@ -265,7 +246,7 @@ def dump(self) -> None: """ self.output_path.mkdir(exist_ok=True, parents=True) - collection_processes: ProcessesToDump = self._get_processes_to_dump() + collection_processes: ProcessesDumpContainer = self._get_processes_to_dump() # breakpoint() if not self.processes_to_dump.is_empty: @@ -273,88 +254,6 @@ def dump(self) -> None: # First, dump workflows, then calculations if len(collection_processes.workflows) > 0: - # breakpoint() - self._dump_workflows(workflows=collection_processes.workflows) + self._dump_processes(processes=collection_processes.workflows) if len(collection_processes.calculations) > 0: - # breakpoint() - self._dump_calculations(calculations=collection_processes.calculations) - - -# TODO: See, if I can generalize the dump sub-methods -# def _dump_processes( -# self, -# # processes: Sequence[orm.CalculationNode | orm.WorkflowNode], -# processes: Sequence[orm.CalculationNode] | Sequence[orm.WorkflowNode], -# ) -> None: -# """Dump a collection of calculations or workflows. 
- -# :param processes: Sequence of ``orm.CalculationNode``s or ``orm.WorkflowNode``s -# :param process_type: Type of processes, either 'calculations' or 'workflows' -# :return: None -# """ - -# # From, e.g., 'aiida.workflows:core.arithmetic.multiply_add' to 'workflows -# if isinstance(processes[0], orm.CalculationNode): -# process_type_str = 'calculations' -# elif isinstance(processes[0], orm.WorkflowNode): -# process_type_str = 'workflows' -# # else: -# # breakpoint() -# # process_type_str = processes[0].process_type.split(':')[0].split('.')[1] -# process_type_path = self.output_path / process_type_str -# process_type_path.mkdir(exist_ok=True, parents=True) - -# dumped_processes: dict[str, DumpLog] = {} -# logged_processes: DumpDict = self.dump_logger.get_log()[process_type_str] - -# # breakpoint() - -# for process in processes: -# process_dumper = self.process_dumper - -# process_dump_path = process_type_path / process_dumper._generate_default_dump_path( -# process_node=process, prefix=None -# ) - -# # Target directory already exists, skip this process -# if process_dump_path.exists(): -# continue - -# else: -# # Symlink here, if deduplication enabled and process was already dumped -# # TODO: Possibly check dirs here -# # TODO: Alternatively have method/endpoint to delete one calculation from the dumping -# # TODO: Which would also update the log. -# # Otherwise, one might delete a calculation, maybe because it was wrong, and then it won't be dumped -# # anymore ever. -# if self.deduplicate and process.uuid in logged_processes.keys(): -# try: -# os.symlink( -# src=logged_processes[process.uuid].path, -# dst=process_dump_path, -# ) -# except: -# # raise -# pass -# # breakpoint() -# else: -# if process_type_str == 'calculations': -# process_dumper._dump_calculation(calculation_node=process, output_path=process_dump_path) -# elif process_type_str == 'workflows': -# process_dumper._dump_workflow( -# workflow_node=process, -# output_path=process_dump_path, -# ) - - -# dumped_processes[process.uuid] = DumpLog( -# path=process_dump_path, -# time=datetime.now().astimezone(), -# ) - -# # breakpoint() - -# if process_type_str == 'calculations': -# self.dump_logger.update_calculations(new_calculations=dumped_processes) -# elif process_type_str == 'workflows': -# self.dump_logger.update_workflows(new_workflows=dumped_processes) + self._dump_processes(processes=collection_processes.calculations) diff --git a/src/aiida/tools/dumping/config.py b/src/aiida/tools/dumping/config.py index 09da896ed5..dab792e2d8 100644 --- a/src/aiida/tools/dumping/config.py +++ b/src/aiida/tools/dumping/config.py @@ -1,10 +1,51 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + from dataclasses import dataclass +from datetime import datetime +from pathlib import Path + + +@dataclass +class BaseDumpConfig: + """Container for shared arguments of all Dumper classes.""" + + dump_parent_path: Path | None = None + overwrite: bool = False + incremental: bool = True + check_dirs: bool = False + # TODO: Make this a per-class attribute? 
+ last_dump_time: datetime | None = None + + def __post_init__(self): + if self.dump_parent_path is None: + self.dump_parent_path = Path.cwd() + + +@dataclass +class ProcessDumpConfig: + """Arguments for dumping process data.""" + + include_inputs: bool = True + include_outputs: bool = False + include_attributes: bool = True + include_extras: bool = True + flat: bool = False + dump_unsealed: bool = False @dataclass class ProfileDumpConfig: + """Arguments for dumping profile data.""" + dump_processes: bool = True symlink_duplicates: bool = True - delete_missing: bool = False # profile - extra_calc_dirs: bool = False # collection - organize_by_groups: bool = True # profile + delete_missing: bool = False + extra_calc_dirs: bool = False + organize_by_groups: bool = True diff --git a/src/aiida/tools/dumping/logger.py b/src/aiida/tools/dumping/logger.py index 7489df0bbd..18f7541199 100644 --- a/src/aiida/tools/dumping/logger.py +++ b/src/aiida/tools/dumping/logger.py @@ -1,63 +1,140 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + import json -from dataclasses import dataclass +from dataclasses import dataclass, field, fields from datetime import datetime from pathlib import Path -from typing import TypeAlias +from typing import Collection @dataclass class DumpLog: """Represents a single dump log entry.""" + # TODO: Possibly add `node_type` or something similar here + path: Path time: datetime + links: list[Path] = field(default_factory=list) + + def to_dict(self) -> dict: + return {'path': str(self.path), 'time': self.time.isoformat(), 'links': [str(link) for link in self.links]} + + @classmethod + def from_dict(cls, data: dict) -> 'DumpLog': + return cls( + path=Path(data['path']), + time=datetime.fromisoformat(data['time']), + links=[Path(link) for link in data.get('links', [])], + ) + + +@dataclass +class DumpLogStore: + """A store for DumpLog entries, indexed by UUID.""" + + entries: dict[str, DumpLog] = field(default_factory=dict) + + # TODO: If I support keeping track of the symlinks, possibly should implement extending them here + def add_entry(self, uuid: str, entry: DumpLog) -> None: + """Add a single entry to the container.""" + self.entries[uuid] = entry + + def add_entries(self, entries: dict[str, DumpLog]) -> None: + """Add a collection of entries to the container.""" + self.entries.update(entries) + + def del_entry(self, uuid: str) -> bool: + """Remove a single entry by UUID.""" + if uuid in self.entries: + del self.entries[uuid] + return True + return False + + def del_entries(self, uuids: Collection[str]) -> None: + """Remove a collection of entries by UUID.""" + for uuid in uuids: + if uuid in self.entries: + del self.entries[uuid] + def get_entry(self, uuid: str) -> DumpLog | None: + """Retrieve a single entry by UUID.""" + return self.entries.get(uuid) -DumpDict: TypeAlias = dict[str, DumpLog] + def __len__(self) -> int: + """Return the number of entries in the container.""" + return len(self.entries) + + def __iter__(self): + """Iterate over all entries.""" + return iter(self.entries.items()) + + def to_dict(self) -> dict: + return {uuid: entry.to_dict() for 
uuid, entry in self.entries.items()}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'DumpLogStore':
+        store = cls()
+        store.entries = {uuid: DumpLog.from_dict(entry) for uuid, entry in data.items()}
+        return store
+
+
+@dataclass
+class DumpLogStoreCollection:
+    """Represents the entire log, with calculations and workflows (will be extended with Data)."""
+
+    calculations: DumpLogStore
+    workflows: DumpLogStore
 
 
 class DumpLogger:
     """Main logger class using dataclasses for better structure."""
 
-    DUMP_FILE: str = '.dump_log.json'
+    DUMP_LOG_FILE: str = '.dump_log.json'
+
+    # TODO: Possibly add `get_calculations` and `get_workflows` as convenience methods
 
     def __init__(
         self,
         dump_parent_path: Path | None = None,
-        calculations: DumpDict | None = None,
-        workflows: DumpDict | None = None,
-        counter: int = 0,
+        calculations: DumpLogStore | None = None,
+        workflows: DumpLogStore | None = None,
+        # counter: int = 0,
     ) -> None:
         self.dump_parent_path = dump_parent_path or Path.cwd()
-        self.calculations = calculations or {}
-        self.workflows = workflows or {}
-        self.counter = 0
+        self.calculations = calculations or DumpLogStore()
+        self.workflows = workflows or DumpLogStore()
+        # self.counter = counter
 
     @property
-    def dump_file(self) -> Path:
+    def log_file_path(self) -> Path:
         """Get the path to the dump file."""
-        return self.dump_parent_path / self.DUMP_FILE
+        return self.dump_parent_path / self.DUMP_LOG_FILE
 
-    def update_calculations(self, new_calculations: DumpDict) -> None:
-        """Update the calculations log."""
-        self.calculations.update(new_calculations)
-        self.counter += len(new_calculations)
+    def add_entry(self, store: DumpLogStore, uuid: str, entry: DumpLog) -> None:
+        store.add_entry(uuid, entry)
 
-    def update_workflows(self, new_workflows: DumpDict) -> None:
-        """Update the workflows log."""
-        self.workflows.update(new_workflows)
-        self.counter += len(new_workflows)
+    def del_entry(self, store: DumpLogStore, uuid: str) -> bool:
+        return store.del_entry(uuid)
 
-    def get_log(self) -> dict[str, DumpDict]:
-        """Retrieve the current state of the log."""
-        return {'calculations': self.calculations, 'workflows': self.workflows}
+    @property
+    def log(self) -> DumpLogStoreCollection:
+        """Retrieve the current state of the log as a dataclass."""
+        return DumpLogStoreCollection(calculations=self.calculations, workflows=self.workflows)
 
     def save_log(self) -> None:
         """Save the log to a JSON file."""
 
-        def serialize_logs(logs: DumpDict) -> dict:
+        def serialize_logs(container: DumpLogStore) -> dict:
             serialized = {}
-            for uuid, entry in logs.items():
+            for uuid, entry in container.entries.items():
                 serialized[uuid] = {'path': str(entry.path), 'time': entry.time.isoformat()}
             return serialized
 
@@ -66,26 +143,34 @@ def serialize_logs(logs: DumpDict) -> dict:
             'workflows': serialize_logs(self.workflows),
         }
 
-        with self.dump_file.open('w', encoding='utf-8') as f:
+        with self.log_file_path.open('w', encoding='utf-8') as f:
             json.dump(log_dict, f, indent=4)
 
+    def __enter__(self) -> 'DumpLogger':
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.save_log()
+
     @classmethod
     def from_file(cls, dump_parent_path: Path) -> 'DumpLogger':
         """Alternative constructor to load from an existing JSON file."""
         instance = cls(dump_parent_path=dump_parent_path)
 
-        if not instance.dump_file.exists():
+        if not instance.log_file_path.exists():
             return instance
 
         try:
-            with instance.dump_file.open('r', encoding='utf-8') as f:
+            with instance.log_file_path.open('r', encoding='utf-8') as f:
                 data = json.load(f)
 
-            def deserialize_logs(category_data: dict) -> DumpDict:
-                deserialized = {}
+            def deserialize_logs(category_data: dict) -> DumpLogStore:
+                container = DumpLogStore()
                 for uuid, entry in category_data.items():
-                    deserialized[uuid] = DumpLog(path=Path(entry['path']), time=datetime.fromisoformat(entry['time']))
-                return deserialized
+                    container.add_entry(
+                        uuid, DumpLog(path=Path(entry['path']), time=datetime.fromisoformat(entry['time']))
+                    )
+                return container
 
             instance.calculations = deserialize_logs(data['calculations'])
             instance.workflows = deserialize_logs(data['workflows'])
@@ -94,3 +179,12 @@ def deserialize_logs(category_data: dict) -> DumpDict:
             raise
 
         return instance
+
+    def find_store_by_uuid(self, uuid: str) -> DumpLogStore | None:
+        """Find the store that contains the given UUID."""
+        # Iterate over the fields of the DumpLogStoreCollection dataclass for generality
+        for field_ in fields(self.log):
+            store = getattr(self.log, field_.name)
+            if uuid in store.entries:
+                return store
+        return None
diff --git a/src/aiida/tools/dumping/process.py b/src/aiida/tools/dumping/process.py
index 617c475bf6..0e744c4421 100644
--- a/src/aiida/tools/dumping/process.py
+++ b/src/aiida/tools/dumping/process.py
@@ -35,7 +35,7 @@
 from aiida.common.exceptions import NotExistentAttributeError
 from aiida.orm.utils import LinkTriple
 from aiida.tools.archive.exceptions import ExportValidationError
-from aiida.tools.dumping.base import BaseDumper
+from aiida.tools.dumping.config import BaseDumpConfig, ProcessDumpConfig
 from aiida.tools.dumping.utils import prepare_dump_path
 
 logger = logging.getLogger(__name__)
@@ -46,23 +46,20 @@ class ProcessDumper:
     def __init__(
         self,
-        base_dumper: BaseDumper | None = None,
-        include_inputs: bool = True,
-        include_outputs: bool = False,
-        include_attributes: bool = True,
-        include_extras: bool = True,
-        flat: bool = False,
-        dump_unsealed: bool = False,
+        base_dump_config: BaseDumpConfig | None = None,
+        process_dump_config: ProcessDumpConfig | None = None,
     ) -> None:
-        """Initialize the CollectionDumper."""
-        self.include_inputs = include_inputs
-        self.include_outputs = include_outputs
-        self.include_attributes = include_attributes
-        self.include_extras = include_extras
-        self.flat = flat
-        self.dump_unsealed = dump_unsealed
+        """Initialize the ProcessDumper."""
 
-        self.base_dumper = base_dumper or BaseDumper()
+        self.base_dump_config = base_dump_config or BaseDumpConfig()
+        self.process_dump_config = process_dump_config or ProcessDumpConfig()
+
+        self.include_inputs = self.process_dump_config.include_inputs
+        self.include_outputs = self.process_dump_config.include_outputs
+        self.include_attributes = self.process_dump_config.include_attributes
+        self.include_extras = self.process_dump_config.include_extras
+        self.flat = self.process_dump_config.flat
+        self.dump_unsealed = self.process_dump_config.dump_unsealed
 
     @staticmethod
     def _generate_default_dump_path(
@@ -224,7 +221,9 @@ def dump(
         output_path = output_path or self._generate_default_dump_path(process_node=process_node)
 
         prepare_dump_path(
-            path_to_validate=output_path, overwrite=self.base_dumper.overwrite, incremental=self.base_dumper.incremental
+            path_to_validate=output_path,
+            overwrite=self.base_dump_config.overwrite,
+            incremental=self.base_dump_config.incremental,
         )
 
         if isinstance(process_node, orm.CalculationNode):
@@ -263,8 +262,8 @@ def _dump_workflow(
 
         prepare_dump_path(
             path_to_validate=output_path,
-            overwrite=self.base_dumper.overwrite,
-            incremental=self.base_dumper.incremental,
+            overwrite=self.base_dump_config.overwrite,
+            incremental=self.base_dump_config.incremental,
         )
 
         self._dump_node_yaml(process_node=workflow_node, output_path=output_path)
@@ -320,7 +319,9 @@ def _dump_calculation(
         """
         prepare_dump_path(
-            path_to_validate=output_path, overwrite=self.base_dumper.overwrite, incremental=self.base_dumper.incremental
+            path_to_validate=output_path,
+            overwrite=self.base_dump_config.overwrite,
+            incremental=self.base_dump_config.incremental,
         )
 
         self._dump_node_yaml(process_node=calculation_node, output_path=output_path)
diff --git a/src/aiida/tools/dumping/profile.py b/src/aiida/tools/dumping/profile.py
index db03e2b5cf..e3f5d3a8c7 100644
--- a/src/aiida/tools/dumping/profile.py
+++ b/src/aiida/tools/dumping/profile.py
@@ -11,15 +11,15 @@
 from __future__ import annotations
 
-from typing import Sequence, cast
+from collections.abc import Collection
+from typing import cast
 
 from aiida import orm
 from aiida.common.log import AIIDA_LOGGER
 from aiida.manage import load_profile
 from aiida.manage.configuration.profile import Profile
-from aiida.tools.dumping.base import BaseDumper
 from aiida.tools.dumping.collection import CollectionDumper
-from aiida.tools.dumping.config import ProfileDumpConfig
+from aiida.tools.dumping.config import BaseDumpConfig, ProfileDumpConfig
 from aiida.tools.dumping.logger import DumpLogger
 from aiida.tools.dumping.process import ProcessDumper
 from aiida.tools.dumping.utils import _safe_delete, filter_by_last_dump_time
@@ -34,16 +34,15 @@ def __init__(
         self,
         profile: str | Profile | None = None,
         profile_dump_config: ProfileDumpConfig | None = None,
-        base_dumper: BaseDumper | None = None,
+        base_dump_config: BaseDumpConfig | None = None,
         process_dumper: ProcessDumper | None = None,
         dump_logger: DumpLogger | None = None,
-        # deduplicate: bool = True,
-        groups: Sequence[str | orm.Group] | None = None,
+        groups: Collection[str] | Collection[orm.Group] | None = None,
    ):
         """Initialize the ProfileDumper.
 
         :param profile: The selected profile to dump.
-        :param base_dumper: Base dumper instance or None (gets instantiated).
+        :param base_dump_config: Base dump configuration instance or None (gets instantiated).
         :param process_dumper: Process dumper instance or None (gets instantiated).
         :param dump_logger: Logger for the dumping (gets instantiated).
         :param organize_by_groups: Organize dumped data by groups.
@@ -52,9 +51,9 @@ def __init__(
 
         self.groups = groups
 
-        self.base_dumper = base_dumper or BaseDumper()
+        self.base_dump_config = base_dump_config or BaseDumpConfig()
         self.process_dumper = process_dumper or ProcessDumper()
-        self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dumper.dump_parent_path)
+        self.dump_logger = dump_logger or DumpLogger(dump_parent_path=self.base_dump_config.dump_parent_path)
 
         self.profile_dump_config = profile_dump_config or ProfileDumpConfig()
 
@@ -62,27 +61,26 @@ def __init__(
             profile = load_profile(profile=profile, allow_switch=True)
 
         self.profile = profile
 
-        self._processes_to_dump: Sequence[str] | None = None
-        self._processes_to_delete: Sequence[str] | None = None
+        self._processes_to_dump: Collection[str] | None = None
+        self._processes_to_delete: Collection[str] | None = None
 
     def _dump_processes_not_in_any_group(self):
         """Dump the profile's process data not contained in any group."""
         # `dump_parent_path` set to CWD in the `post_init` method of the `BaseDumper` dataclass if not given
-        assert self.base_dumper.dump_parent_path is not None
+        assert self.base_dump_config.dump_parent_path is not None
 
         if self.profile_dump_config.organize_by_groups:
-            output_path = self.base_dumper.dump_parent_path / 'no-group'
+            output_path = self.base_dump_config.dump_parent_path / 'no-group'
         else:
-            output_path = self.base_dumper.dump_parent_path
+            output_path = self.base_dump_config.dump_parent_path
 
         no_group_nodes = self._get_no_group_processes()
 
         no_group_dumper = CollectionDumper(
             collection=no_group_nodes,
             profile_dump_config=self.profile_dump_config,
-            base_dumper=self.base_dumper,
+            base_dump_config=self.base_dump_config,
             process_dumper=self.process_dumper,
-            # deduplicate=self.deduplicate,
             dump_logger=self.dump_logger,
             output_path=output_path,
         )
@@ -98,21 +96,20 @@ def _dump_processes_not_in_any_group(self):
     def _dump_processes_per_group(self, groups):
         # === Dump data per-group if Groups exist in profile or are selected ===
-        assert self.base_dumper.dump_parent_path is not None
+        assert self.base_dump_config.dump_parent_path is not None
 
         for group in groups:
             if self.profile_dump_config.organize_by_groups:
-                output_path = self.base_dumper.dump_parent_path / f'group-{group.label}'
+                output_path = self.base_dump_config.dump_parent_path / f'group-{group.label}'
             else:
-                output_path = self.base_dumper.dump_parent_path
+                output_path = self.base_dump_config.dump_parent_path
 
             group_dumper = CollectionDumper(
-                base_dumper=self.base_dumper,
+                base_dump_config=self.base_dump_config,
                 profile_dump_config=self.profile_dump_config,
                 process_dumper=self.process_dumper,
                 dump_logger=self.dump_logger,
                 collection=group,
-                # deduplicate=self.deduplicate,
                 output_path=output_path,
             )
 
@@ -126,23 +123,23 @@ def _dump_processes_per_group(self, groups):
             group_dumper.dump()
 
-    def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]:
+    def _get_no_group_processes(self) -> Collection[str]:
         """Obtain nodes in the profile that are not part of any group.
 
         :return: List of UUIDs of selected nodes.
""" - group_qb = orm.QueryBuilder().append(orm.Group) - profile_groups = cast(Sequence[orm.Group], group_qb.all(flat=True)) - process_qb = orm.QueryBuilder().append(orm.ProcessNode, project=['uuid']) - profile_processes = cast(Sequence[str], process_qb.all(flat=True)) + profile_groups = cast(Collection[orm.Group], orm.QueryBuilder().append(orm.Group).all(flat=True)) + profile_processes = cast( + Collection[str], orm.QueryBuilder().append(orm.ProcessNode, project=['uuid']).all(flat=True) + ) - nodes_in_groups: Sequence[str] = [node.uuid for group in profile_groups for node in group.nodes] + nodes_in_groups: list[str] = [node.uuid for group in profile_groups for node in group.nodes] # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice # Get the called descendants of WorkflowNodes within the nodes_in_groups list - sub_nodes_in_groups: Sequence[str] = [ + sub_nodes_in_groups: list[str] = [ node.uuid for n in nodes_in_groups # if isinstance((workflow_node := orm.load_node(n)), orm.WorkflowNode) @@ -152,10 +149,12 @@ def _get_no_group_processes(self) -> Sequence[str] | Sequence[int]: nodes_in_groups += sub_nodes_in_groups - process_nodes: Sequence[str | int] = [ + process_nodes: Collection[str] = [ profile_node for profile_node in profile_processes if profile_node not in nodes_in_groups ] - process_nodes = filter_by_last_dump_time(nodes=process_nodes, last_dump_time=self.base_dumper.last_dump_time) + process_nodes = filter_by_last_dump_time( + nodes=process_nodes, last_dump_time=self.base_dump_config.last_dump_time + ) return process_nodes @@ -175,73 +174,67 @@ def dump_processes(self): else: self._dump_processes_per_group(groups=self.groups) - @staticmethod - def _get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]: - # TODO: Change this method... 
-        result = {}
-        for node_type in (orm.CalculationNode, orm.WorkflowNode):
-            qb = orm.QueryBuilder().append(node_type, project=['uuid'])
-            nodes = cast(Sequence[str], qb.all(flat=True))
-            nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=last_dump_time)
-            result[node_type.class_node_type.split('.')[-2] + 's'] = len(nodes)
-        return result
-
     @property
-    def processes_to_dump(self) -> Sequence[str]:
+    def processes_to_dump(self) -> Collection[str]:
         if self._processes_to_dump is None:
             self._processes_to_dump = self._get_processes_to_dump()
         return self._processes_to_dump
 
-    def _get_processes_to_dump(self) -> Sequence[str]:
-        process_qb = orm.QueryBuilder().append(
-            orm.ProcessNode, project=['uuid'], filters={'ctime': {'>': self.base_dumper.last_dump_time}}
-        )
+    def _get_processes_to_dump(self) -> Collection[str]:
+        if self.base_dump_config.last_dump_time is not None:
+            process_qb = orm.QueryBuilder().append(
+                orm.ProcessNode, project=['uuid'], filters={'ctime': {'>': self.base_dump_config.last_dump_time}}
+            )
+        else:
+            process_qb = orm.QueryBuilder().append(orm.ProcessNode, project=['uuid'])
 
-        profile_processes = cast(Sequence[str], process_qb.all(flat=True))
+        profile_processes = cast(Collection[str], process_qb.all(flat=True))
 
         return profile_processes
 
     @property
-    def processes_to_delete(self) -> Sequence[str]:
+    def processes_to_delete(self) -> Collection[str]:
         if self._processes_to_delete is None:
             self._processes_to_delete = self._get_processes_to_delete()
         return self._processes_to_delete
 
-    def _get_processes_to_delete(self) -> Sequence[str]:
+    def _get_processes_to_delete(self) -> Collection[str]:
         dump_logger = self.dump_logger
-        log = dump_logger.get_log()
-        dumped_uuids = set(list(log['calculations'].keys()) + list(log['workflows'].keys()))
-        # Cannot use QB here because, when deleted, not in the DB anymore
-        # dumped_qb = orm.QueryBuilder().append(orm.ProcessNode, filters={'uuid': {'in': dumped_uuids}}, project=['uuid'])
+        log = dump_logger.log
+
+        # breakpoint()
+        dumped_uuids = set(list(log.calculations.entries.keys()) + list(log.workflows.entries.keys()))
+        # Cannot use QB here because, when node deleted, it's not in the DB anymore
         # dumped_processes: set[str] = set(cast(list[str], dumped_qb.all(flat=True)))
 
         # TODO: Possibly filter here since last dump time
         # TODO: But it is highly likely that the last dump command with deletion was run a while ago
         # TODO: So I cannot filter by last dump time, but should probably take the whole set
         profile_qb = orm.QueryBuilder().append(orm.ProcessNode)
-        profile_processes = set(cast(Sequence[orm.ProcessNode], profile_qb.all(flat=True)))
+        profile_processes = set(cast(Collection[orm.ProcessNode], profile_qb.all(flat=True)))
         profile_uuids = set([process.uuid for process in profile_processes if process.caller is None])
 
         to_delete_uuids = list(dumped_uuids - profile_uuids)
 
         return to_delete_uuids
 
-    def _delete_missing_process_paths(self, to_delete_uuids):
-        log = self.dump_logger.get_log()
-        paths_to_delete = []
+    def _delete_missing_node(self, to_delete_uuid) -> None:
+        # TODO: Possibly make a delete method for the path and the log, and then call that in the loop
 
-        for to_delete_uuid in to_delete_uuids:
-            try:
-                paths_to_delete.append(log['workflows'][to_delete_uuid].path)
-            except KeyError:
-                paths_to_delete.append(log['calculations'][to_delete_uuid].path)
-            except:
-                raise
+        dump_logger = self.dump_logger
+        current_store = dump_logger.find_store_by_uuid(uuid=to_delete_uuid)
+        if not current_store:
+            return
 
-        for path_to_delete in paths_to_delete:
-            _safe_delete(path_to_validate=path_to_delete, safeguard_file='.aiida_node_metadata.yaml', verbose=False)
+        # ! Cannot load the node via its UUID here and use the type to get the correct store, as the Node is deleted
+        # ! from the DB. Should find a better solution
 
-        # breakpoint()
+        try:
+            path_to_delete = current_store.entries[to_delete_uuid].path
+            _safe_delete(path_to_validate=path_to_delete, safeguard_file='.aiida_node_metadata.yaml', verbose=False)
+            current_store.del_entry(uuid=to_delete_uuid)
+        except:
+            raise
 
     def delete_processes(self):
         to_dump_processes = self.processes_to_dump
@@ -250,9 +243,20 @@ def delete_processes(self):
         to_delete_processes = self.processes_to_delete
 
         print(f'TO_DUMP_PROCESSES: {to_dump_processes}')
         print(f'TO_DELETE_PROCESSES: {to_delete_processes}')
 
-        breakpoint()
-
-        self._delete_missing_process_paths(to_delete_uuids=to_delete_processes)
+        # breakpoint()
+        for to_delete_uuid in to_delete_processes:
+            self._delete_missing_node(to_delete_uuid=to_delete_uuid)
 
         # TODO: Need to also delete entry from the log when I delete the dir
         # TODO: Add also logging for node/path deletion
+
+    # @staticmethod
+    # def _get_number_of_nodes_to_dump(last_dump_time) -> dict[str, int]:
+    #     # TODO: Change this method...
+    #     result = {}
+    #     for node_type in (orm.CalculationNode, orm.WorkflowNode):
+    #         qb = orm.QueryBuilder().append(node_type, project=['uuid'])
+    #         nodes = cast(Collection[str], qb.all(flat=True))
+    #         nodes = filter_by_last_dump_time(nodes=nodes, last_dump_time=last_dump_time)
+    #         result[node_type.class_node_type.split('.')[-2] + 's'] = len(nodes)
+    #     return result
diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py
index 17a075c59f..6893f36978 100644
--- a/src/aiida/tools/dumping/utils.py
+++ b/src/aiida/tools/dumping/utils.py
@@ -12,16 +12,50 @@
 from datetime import datetime
 from pathlib import Path
-from typing import cast
+from typing import Collection, cast
 
 from aiida import orm
 from aiida.common.log import AIIDA_LOGGER
 
-__all__ = ['prepare_dump_path']
+# TypeAlias not supported in Py 3.9
+# Collection[str] = Collection[str] | Collection[int] | None
+
+__all__ = ('NodeDumpMapper', 'prepare_dump_path')
 
 logger = AIIDA_LOGGER.getChild('tools.dumping')
 
 
+class NodeDumpMapper:
+    calculation_key: str = 'calculations'
+    workflow_key: str = 'workflows'
+
+    @classmethod
+    def get_directory(cls, node: orm.Node) -> Path:
+        # Check node type and map to the corresponding directory
+        if isinstance(node, orm.CalculationNode):
+            # This includes subclasses like orm.CalcFunctionNode and orm.CalcJobNode
+            return Path(cls.calculation_key)
+        elif isinstance(node, orm.WorkflowNode):
+            # This includes subclasses like orm.WorkFunctionNode and orm.WorkChainNode
+            return Path(cls.workflow_key)
+        else:
+            msg = f'Dumping not implemented yet for node type: {type(node)}'
+            raise NotImplementedError(msg)
+
+    @classmethod
+    def get_logger_attr(cls, node: orm.Node) -> str:
+        # Check node type and map to the corresponding directory
+        if isinstance(node, orm.CalculationNode):
+            # This includes subclasses like orm.CalcFunctionNode and orm.CalcJobNode
+            return cls.calculation_key
+        elif isinstance(node, orm.WorkflowNode):
+            # This includes subclasses like orm.WorkFunctionNode and orm.WorkChainNode
+            return cls.workflow_key
+        else:
+            msg = f'Dumping not implemented yet for node type: {type(node)}'
+            raise NotImplementedError(msg)
+
+
 def prepare_dump_path(
     path_to_validate: Path,
     overwrite: bool = False,
@@ -84,7 +118,7 @@ def _safe_delete(
     if not path_to_validate.exists():
         return
 
-    is_empty = any(path_to_validate.iterdir())
+    is_empty = not any(path_to_validate.iterdir())
     if is_empty:
         path_to_validate.rmdir()
         return
@@ -140,21 +174,7 @@ def _delete_dir_recursively(path):
         print(f'exception msg: {exception}')
 
 
-def _get_filtered_nodes(nodes: list[str | int], last_dump_time: datetime, key: str = 'uuid') -> list[str | int]:
-    """Helper function to get ``orm.Node``s from the DB based on ``id``/``uuid`` and filter by ``mtime``.
-
-    :param nodes: Collection of node PKs or UUIDs
-    :param last_dump_time: Last time nodes were dumped to disk.
-    :param key: Identifier to obtain nodes with, either ``id`` or ``uuid``.
-    :return: List of nodes filtered by ``last_dump_time``.
-    """
-
-    qb = orm.QueryBuilder().append(orm.Node, filters={key: {'in': nodes}})
-    nodes_orm: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True))
-    return [getattr(node, key) for node in nodes_orm if node.mtime > last_dump_time]
-
-
-def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) -> list[str | int]:
+def filter_by_last_dump_time(nodes: Collection[str], last_dump_time: datetime | None = None) -> Collection[str]:
     """Filter a list of nodes by the last dump time of the corresponding dumper.
 
     :param nodes: A list of node identifiers, which can be either UUIDs (str) or IDs (int).
@@ -163,16 +183,12 @@ def filter_by_last_dump_time(nodes: list[str | int], last_dump_time: datetime) -
     """
 
     # TODO: Possibly directly use QueryBuilder filter. Though, `nodes` directly accessible from orm.Group.nodes
-
     if not nodes or last_dump_time is None:
         return nodes
 
-    key = 'uuid' if isinstance(nodes[0], str) else 'id'
-    return _get_filtered_nodes(
-        nodes=nodes,
-        last_dump_time=last_dump_time,
-        key=key,
-    )
+    qb = orm.QueryBuilder().append(orm.Node, filters={'uuid': {'in': nodes}})
+    nodes_orm: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True))
+    return [node.uuid for node in nodes_orm if node.mtime > last_dump_time]
 
 
 def extend_calculations(profile_dump_config, calculations, workflows):
diff --git a/tests/tools/dumping/test_collection.py b/tests/tools/dumping/test_collection.py
index 6b79dd1195..7dc309e788 100644
--- a/tests/tools/dumping/test_collection.py
+++ b/tests/tools/dumping/test_collection.py
@@ -115,7 +115,7 @@ def test_resolve_collection_nodes(self, setup_add_group, generate_calculation_no
         assert set(nodes) == set([add_nodes[0].uuid, cj_node1.uuid])
 
         # Filtering by time should work -> Now, only cj_node2 gets returned
-        add_dumper.base_dumper.last_dump_time = datetime.now().astimezone()
+        add_dumper.base_dump_config.last_dump_time = datetime.now().astimezone()
         cj_node2 = generate_calculation_node_add()
         add_group.add_nodes([cj_node2])
 
@@ -162,7 +162,7 @@ def test_dump_calculations_add(self, setup_add_group, tmp_path):
 
         add_dumper = CollectionDumper(collection=add_group, output_path=add_group_path)
 
-        add_dumper._dump_calculations(add_dumper._get_processes_to_dump().calculations)
+        add_dumper._dump_processes(add_dumper._get_processes_to_dump().calculations)
 
         expected_tree = {
             'calculations': {
@@ -185,7 +185,7 @@ def test_dump_calculations_multiply_add(self, setup_multiply_add_group, tmp_path
         multiply_add_dumper = CollectionDumper(collection=multiply_add_group, output_path=multiply_add_group_path)
 
         # No calculations to dump when deduplication is enabled
-        multiply_add_dumper._dump_calculations(multiply_add_dumper._get_processes_to_dump().calculations)
+        multiply_add_dumper._dump_processes(multiply_add_dumper._get_processes_to_dump().calculations)
         assert not (multiply_add_group_path / 'calculations').exists()
 
         # Now, disable de-duplication -> Should dump calculations
@@ -193,9 +193,7 @@ def test_dump_calculations_multiply_add(self, setup_multiply_add_group, tmp_path
             collection=multiply_add_group, output_path=multiply_add_group_path, deduplicate=False
         )
 
-        multiply_add_dumper_no_dedup._dump_calculations(
-            multiply_add_dumper_no_dedup._get_processes_to_dump().calculations
-        )
+        multiply_add_dumper_no_dedup._dump_processes(multiply_add_dumper_no_dedup._get_processes_to_dump().calculations)
 
         expected_tree_no_dedup = {
             'calculations': {
@@ -249,7 +247,7 @@ def test_dump_calculations_multiply_add(self, setup_multiply_add_group, tmp_path
     #     assert len(nodes) == 2
 
    #     # Filtering by time should work
-    #     collection_dumper.base_dumper.last_dump_time = datetime.now().astimezone()
+    #     collection_dumper.base_dump_config.last_dump_time = datetime.now().astimezone()
    #     cj_node2 = generate_calculation_node_add()
    #     add_group.add_nodes([cj_node2])
 
diff --git a/tests/tools/dumping/test_process.py b/tests/tools/dumping/test_process.py
index 56fb356054..b199ab1ade 100644
--- a/tests/tools/dumping/test_process.py
+++ b/tests/tools/dumping/test_process.py
@@ -15,7 +15,7 @@
 
 import pytest
 
-from aiida.tools.dumping.base import BaseDumper
+from aiida.tools.dumping.config import BaseDumpConfig
 from aiida.tools.dumping.process import ProcessDumper
 
 # Non-AiiDA variables
@@ -212,8 +212,8 @@ def test_dump_calculation_flat(tmp_path, generate_calculation_node_io):
 def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io):
     """Tests the ProcessDumper for the overwrite and incremental option."""
     dump_parent_path = tmp_path / 'cj-dump-test-overwrite'
-    base_dumper = BaseDumper(overwrite=False, incremental=False)
-    process_dumper = ProcessDumper(base_dumper=base_dumper)
+    base_dump_config = BaseDumpConfig(overwrite=False, incremental=False)
+    process_dumper = ProcessDumper(base_dump_config=base_dump_config)
     calculation_node = generate_calculation_node_io()
     calculation_node.seal()
     # Create safeguard file to mock existing dump directory
@@ -223,8 +223,8 @@ def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io):
     with pytest.raises(FileExistsError):
         process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)
     # With overwrite option true no error is raised and the dumping can run through.
-    base_dumper = BaseDumper(overwrite=True, incremental=False)
-    process_dumper = ProcessDumper(base_dumper=base_dumper)
+    base_dump_config = BaseDumpConfig(overwrite=True, incremental=False)
+    process_dumper = ProcessDumper(base_dump_config=base_dump_config)
     process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)
     assert (dump_parent_path / inputs_relpath / filename).is_file()
 
@@ -233,8 +233,8 @@ def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io):
     # Incremental also does work
     dump_parent_path.mkdir()
     (dump_parent_path / '.aiida_node_metadata.yaml').touch()
-    base_dumper = BaseDumper(overwrite=False, incremental=True)
-    process_dumper = ProcessDumper(base_dumper=base_dumper)
+    base_dump_config = BaseDumpConfig(overwrite=False, incremental=True)
+    process_dumper = ProcessDumper(base_dump_config=base_dump_config)
     process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)
     assert (dump_parent_path / inputs_relpath / filename).is_file()
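For orientation, a minimal usage sketch of the configuration wiring exercised by the test above; the `ProcessDumpConfig` keyword arguments and the node identifier are illustrative assumptions, not part of the patch:

from pathlib import Path

from aiida import load_profile, orm
from aiida.tools.dumping.config import BaseDumpConfig, ProcessDumpConfig
from aiida.tools.dumping.process import ProcessDumper

load_profile()

# Shared path/overwrite behaviour lives in BaseDumpConfig; per-process options in ProcessDumpConfig (assumed kwargs)
base_dump_config = BaseDumpConfig(overwrite=False, incremental=True)
process_dump_config = ProcessDumpConfig(include_inputs=True, include_outputs=False)
process_dumper = ProcessDumper(base_dump_config=base_dump_config, process_dump_config=process_dump_config)

# Hypothetical, sealed process node; any CalculationNode or WorkflowNode from the profile would do
process_node = orm.load_node('<process-node-uuid>')
process_dumper.dump(process_node=process_node, output_path=Path('process-dump'))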