Fix catalog._datasets access for KedroDataCatalog (#4438)
* Fixed catalog list for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Replaced solution

Signed-off-by: Elena Khaustova <[email protected]>

* Updated the solution and moved it to the catalog side

Signed-off-by: Elena Khaustova <[email protected]>

* Updated internal datasets access for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed __getattribute__

Signed-off-by: Elena Khaustova <[email protected]>

* Added test template

Signed-off-by: Elena Khaustova <[email protected]>

* Updated solution and test

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

---------

Signed-off-by: Elena Khaustova <[email protected]>
ElenaKhaustova authored Jan 27, 2025
1 parent 3d18096 commit a098829
Showing 4 changed files with 38 additions and 22 deletions.
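
The gist of the change, for context: KedroDataCatalog now keeps materialised datasets in a name-mangled __datasets dict alongside a separate _lazy_datasets dict, and a __getattribute__ override serves the legacy _datasets attribute from the merged datasets property, so code written against the old DataCatalog keeps working. A minimal, self-contained sketch of that shim (simplified names and helper methods, not the actual Kedro class):

from typing import Any


class SketchCatalog:
    """Simplified stand-in for KedroDataCatalog; illustration only."""

    def __init__(self) -> None:
        self.__datasets: dict[str, Any] = {}      # materialised datasets
        self._lazy_datasets: dict[str, Any] = {}  # not yet materialised

    @property
    def datasets(self) -> dict[str, Any]:
        # Merged view, mirroring the real `datasets` property in the diff below.
        return self._lazy_datasets | self.__datasets

    def __getattribute__(self, key: str) -> Any:
        # Legacy callers expect `catalog._datasets`; hand back the merged view.
        if key == "_datasets":
            return self.datasets
        return super().__getattribute__(key)

    def add(self, name: str, dataset: Any, lazy: bool = False) -> None:
        # Hypothetical helper for this example; the real class uses __setitem__/add.
        (self._lazy_datasets if lazy else self.__datasets)[name] = dataset


catalog = SketchCatalog()
catalog.add("cars", object())
catalog.add("boats", object(), lazy=True)
assert set(catalog._datasets) == {"cars", "boats"}  # old access pattern still works

The new test_dataset_property test in the diff below asserts exactly this equivalence between catalog._datasets and catalog.datasets.
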
8 changes: 2 additions & 6 deletions .secrets.baseline
@@ -90,10 +90,6 @@
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_baseline_file",
"filename": ".secrets.baseline"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
@@ -215,9 +211,9 @@
"filename": "tests/io/test_kedro_data_catalog.py",
"hashed_secret": "15dd2c9ccec914f1470b4dccb45789844e49cf70",
"is_verified": false,
"line_number": 491
"line_number": 499
}
]
},
"generated_at": "2025-01-20T11:39:50Z"
"generated_at": "2025-01-27T18:47:13Z"
}
1 change: 1 addition & 0 deletions RELEASE.md
@@ -17,6 +17,7 @@
* Moved `kedro-catalog` JSON schema to `kedro-datasets`.
* Updated `Partitioned dataset lazy saving` docs page.
* Fixed `KedroDataCatalog` mutation after pipeline run.
* Made `KedroDataCatalog._datasets` compatible with `DataCatalog._datasets`.

## Breaking changes to the API
## Documentation changes
43 changes: 27 additions & 16 deletions kedro/io/kedro_data_catalog.py
@@ -10,7 +10,6 @@

from __future__ import annotations

import copy
import difflib
import logging
import re
@@ -98,7 +97,8 @@ def __init__(
>>> catalog = KedroDataCatalog(datasets={"cars": cars})
"""
self._config_resolver = config_resolver or CatalogConfigResolver()
self._datasets: dict[str, AbstractDataset] = datasets or {}
# TODO: rename back to _datasets when removing old catalog
self.__datasets: dict[str, AbstractDataset] = datasets or {}
self._lazy_datasets: dict[str, _LazyDataset] = {}
self._load_versions, self._save_version = _validate_versions(
datasets, load_versions or {}, save_version
@@ -116,7 +116,7 @@ def __init__(
@property
def datasets(self) -> dict[str, Any]:
# TODO: remove when removing old catalog
return copy.copy(self._datasets)
return self._lazy_datasets | self.__datasets

@datasets.setter
def datasets(self, value: Any) -> None:
@@ -125,36 +125,45 @@ def datasets(self, value: Any) -> None:
"Operation not allowed. Please use KedroDataCatalog.add() instead."
)

def __getattribute__(self, key: str) -> Any:
# Needed for compatibility with the old catalog interface, since we now
# distinguish between _datasets and _lazy_datasets
# TODO: remove when removing old catalog
if key == "_datasets":
return self.datasets
else:
return super().__getattribute__(key)

@property
def config_resolver(self) -> CatalogConfigResolver:
return self._config_resolver

def __repr__(self) -> str:
return repr(self._lazy_datasets | self._datasets)
return repr(self._lazy_datasets | self.__datasets)

def __contains__(self, dataset_name: str) -> bool:
"""Check if an item is in the catalog as a materialised dataset or pattern."""
return (
dataset_name in self._datasets
dataset_name in self.__datasets
or dataset_name in self._lazy_datasets
or self._config_resolver.match_pattern(dataset_name) is not None
)

def __eq__(self, other) -> bool: # type: ignore[no-untyped-def]
"""Compares two catalogs based on materialised datasets and datasets patterns."""
return (
self._datasets,
self.__datasets,
self._lazy_datasets,
self._config_resolver.list_patterns(),
) == (
other._datasets,
other.__datasets,
other._lazy_datasets,
other.config_resolver.list_patterns(),
)

def keys(self) -> List[str]: # noqa: UP006
"""List all dataset names registered in the catalog."""
return list(self._lazy_datasets.keys()) + list(self._datasets.keys())
return list(self._lazy_datasets.keys()) + list(self.__datasets.keys())

def values(self) -> List[AbstractDataset]: # noqa: UP006
"""List all datasets registered in the catalog."""
@@ -218,18 +227,18 @@ def __setitem__(self, key: str, value: Any) -> None:
>>>
>>> assert catalog.load("data_csv_dataset").equals(df)
"""
if key in self._datasets:
if key in self.__datasets:
self._logger.warning("Replacing dataset '%s'", key)
if isinstance(value, AbstractDataset):
self._load_versions, self._save_version = _validate_versions(
{key: value}, self._load_versions, self._save_version
)
self._datasets[key] = value
self.__datasets[key] = value
elif isinstance(value, _LazyDataset):
self._lazy_datasets[key] = value
else:
self._logger.info(f"Adding input data as a MemoryDataset - {key}")
self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract]
self.__datasets[key] = MemoryDataset(data=value) # type: ignore[abstract]

def __len__(self) -> int:
return len(self.keys())
@@ -250,7 +259,7 @@ def get(
Returns:
An instance of AbstractDataset.
"""
if key not in self._datasets and key not in self._lazy_datasets:
if key not in self.__datasets and key not in self._lazy_datasets:
ds_config = self._config_resolver.resolve_pattern(key)
if ds_config:
self._add_from_config(key, ds_config)
@@ -259,7 +268,7 @@ def get(
if lazy_dataset:
self[key] = lazy_dataset.materialize()

dataset = self._datasets.get(key, None)
dataset = self.__datasets.get(key, None)

return dataset or default

@@ -425,7 +434,7 @@ def to_config(
credentials.update(unresolved_credentials)
load_versions[ds_name] = self._load_versions.get(ds_name, None)

for ds_name, ds in self._datasets.items(): # type: ignore[assignment]
for ds_name, ds in self.__datasets.items(): # type: ignore[assignment]
if _is_memory_dataset(ds): # type: ignore[arg-type]
continue
resolved_config = ds.to_config() # type: ignore[attr-defined]
@@ -505,7 +514,7 @@ def get_dataset(
# Flag to turn on/off fuzzy-matching which can be time consuming and
# slow down plugins like `kedro-viz`
if suggest:
matches = difflib.get_close_matches(ds_name, self._datasets.keys())
matches = difflib.get_close_matches(ds_name, self.keys())
if matches:
suggestions = ", ".join(matches)
error_msg += f" - did you mean one of these instead: {suggestions}"
@@ -532,7 +541,9 @@ def add(
) -> None:
# TODO: remove when removing old catalog
"""Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``."""
if ds_name in self._datasets and not replace:
if (
ds_name in self.__datasets or ds_name in self._lazy_datasets
) and not replace:
raise DatasetAlreadyExistsError(
f"Dataset '{ds_name}' has already been registered"
)
8 changes: 8 additions & 0 deletions tests/io/test_kedro_data_catalog.py
@@ -304,6 +304,14 @@ def test_release(self, data_catalog):
"""Test release is called without errors"""
data_catalog.release("test")

def test_dataset_property(self, data_catalog_from_config):
"""Test _dataset attribute returns the same result as dataset property"""
# Catalog includes only lazy dataset, we get boats dataset to materialize it
_ = data_catalog_from_config["boats"]
assert data_catalog_from_config.datasets == data_catalog_from_config._datasets
for ds_name in data_catalog_from_config.list():
assert ds_name in data_catalog_from_config._datasets

class TestKedroDataCatalogToConfig:
def test_to_config(self, correct_config_versioned, dataset, filepath):
"""Test dumping catalog config"""
