
Caching of file-set hashes by local path and mtimes #700

Merged Mar 17, 2024 (36 commits)

Changes from 1 commit

Commits (36)
45117ef  added code to handle "locally-persistent-ids" (tclose, Feb 24, 2024)
2b7ca50  implemented persistent hash cache to avoid rehashing files (tclose, Feb 24, 2024)
04b95ff  touched up persistent_hash_cache test (tclose, Feb 24, 2024)
0c865f4  replaced Cache({}) with Cache() to match new proper class (tclose, Feb 24, 2024)
3b3fdb7  upped resolution of mtime to nanoseconds (tclose, Feb 24, 2024)
81a5108  added sleep to various tests to ensure file mtimes are different (tclose, Feb 24, 2024)
0c4b179  added more sleeps to ensure mtimes of input files are different in tests (tclose, Feb 24, 2024)
615d590  debugged setting hash cache via env var and added clean up of directory (tclose, Feb 24, 2024)
55b660e  mock mtime writing instead of adding sleeps to ensure mtimes are diff… (tclose, Feb 24, 2024)
5d51736  undid overzealous black (tclose, Feb 24, 2024)
0421f85  added missing import (tclose, Feb 24, 2024)
a864b32  Adds platformdirs dependency and use it to store the hash cache within (tclose, Feb 24, 2024)
05ca695  added unittests to hit exceptions in persistentcache init (tclose, Feb 24, 2024)
52ef03f  added mkdir to location converter (tclose, Feb 24, 2024)
0216236  debugged mkdir of persistent cache (tclose, Feb 24, 2024)
bad261b  bug fixes in persistentcache location init (tclose, Feb 24, 2024)
2fbee2b  Revert "mock mtime writing instead of adding sleeps to ensure mtimes … (tclose, Feb 24, 2024)
91948f0  skip lock files in directory clean up (tclose, Feb 24, 2024)
e058408  fixed clean-up bug (tclose, Feb 24, 2024)
f1ded7a  added mock import (tclose, Feb 24, 2024)
bb11067  added another sleep to trigger atime change (tclose, Feb 24, 2024)
a031ea5  implementing @effigies suggestions (tclose, Feb 29, 2024)
f2f70a6  added comments and doc strings to explain the use of the persistent c… (tclose, Feb 29, 2024)
191aa9c  touched up comments (tclose, Feb 29, 2024)
3076fea  another comment touch up (tclose, Feb 29, 2024)
a094fbc  touch up comments again (tclose, Feb 29, 2024)
d27201f  Merge branch 'master' into local-cache-ids (tclose, Mar 8, 2024)
291f29f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 8, 2024)
0a10f6c  added in @djarecka's test for moving file cache locations (tclose, Mar 8, 2024)
311e3dd  updated cache initialisation (tclose, Mar 8, 2024)
4827365  switched to use blake2b instead of blake2s (tclose, Mar 8, 2024)
b6799b6  [skip ci] deleted already commented-out code (tclose, Mar 8, 2024)
2bb86fe  additional doc strings for hash cache objects (tclose, Mar 8, 2024)
1f601e1  added test to see that persistent cache is used in the running of tasks (tclose, Mar 16, 2024)
7e60c41  moved persistent hash cache within "hash_cache" subdirectory of the p… (tclose, Mar 17, 2024)
921979c  fixed import issue (tclose, Mar 17, 2024)
Changes from commit 615d59075a57972aa8bb67addcb0089e3961ef99: "debugged setting hash cache via env var and added clean up of directory" (committed by tclose, Feb 24, 2024)
pydra/engine/submitter.py: 2 changes (2 additions, 0 deletions)
@@ -6,6 +6,7 @@
 from .workers import WORKERS
 from .core import is_workflow
 from .helpers import get_open_loop, load_and_run_async
+from ..utils.hash import PersistentCache
 
 import logging
 
@@ -43,6 +44,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None
         self.loop.run_until_complete(
             self.submit_from_call(runnable, rerun, environment)
         )
+        PersistentCache.clean_up()
         return runnable.result()
 
     async def submit_from_call(self, runnable, rerun, environment):
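
To make the new PersistentCache.clean_up() hook concrete: it prunes cache files by last access time, so entries for inputs that are still being hashed regularly survive. A minimal, self-contained sketch of the same expiry rule (the helper name and arguments here are illustrative, not pydra API):

    from datetime import datetime
    from pathlib import Path

    def expire_stale_entries(cache_dir: Path, max_age_days: int = 30) -> None:
        """Delete cache files whose last access time (atime) is older than max_age_days."""
        now = datetime.now()
        for entry in cache_dir.iterdir():
            age_days = (now - datetime.fromtimestamp(entry.lstat().st_atime)).days
            if age_days > max_age_days:
                entry.unlink()

Calling this from Submitter.__call__ (rather than, say, an atexit handler) means the cache is pruned once per top-level run, right after the event loop finishes.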
pydra/utils/hash.py: 75 changes (48 additions, 27 deletions)
@@ -1,9 +1,8 @@
 """Generic object hashing dispatch"""
 
 import os
-
-# import stat
 import struct
+from datetime import datetime
 import tempfile
 import typing as ty
 from pathlib import Path
@@ -78,7 +77,7 @@ def location_validator(self, _, location):
                 f"Persistent cache location '{location}' is not a directory"
             )
 
-    def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash:
+    def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash:
         """Check whether key is present in the persistent cache store and return it if so.
         Otherwise use `calculate_hash` to generate the hash and save it in the persistent
         store.
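
The docstring describes a read-through cache. A minimal file-backed version of the same get-or-calculate pattern (names are illustrative; pydra's actual CacheKey is a tuple, not a string):

    import hashlib
    import typing as ty
    from pathlib import Path

    def get_or_calculate(store: Path, key: str, calculate: ty.Callable[[], bytes]) -> bytes:
        """Return the value cached under `key`, computing and saving it on a miss."""
        key_path = store / hashlib.blake2b(key.encode(), digest_size=16).hexdigest()
        if key_path.exists():
            return key_path.read_bytes()  # hit: reuse the stored hash
        value = calculate()               # miss: compute the hash...
        key_path.write_bytes(value)       # ...and persist it for next time
        return value

The rename from get_hash to get_or_calculate_hash makes the write-on-miss side effect visible at the call site.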
@@ -108,22 +107,52 @@ def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash:
             key_path.write_bytes(hsh)
         return Hash(hsh)
 
-
-def persistent_cache_converter(
-    path: ty.Union[Path, str, PersistentCache, None]
-) -> PersistentCache:
-    if isinstance(path, PersistentCache):
-        return path
-    if path is None:
-        path = tempfile.mkdtemp()
-    return PersistentCache(Path(path))
+    @classmethod
+    def clean_up(cls):
+        """Cleans up old hash caches that haven't been accessed in the last 30 days"""
+        now = datetime.now()
+        for path in cls.DEFAULT_LOCATION.iterdir():
+            days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days
+            if days > cls.CLEAN_UP_PERIOD:
+                path.unlink()
+
+    @classmethod
+    def from_path(
+        cls, path: ty.Union[Path, str, "PersistentCache", None]
+    ) -> "PersistentCache":
+        if isinstance(path, PersistentCache):
+            return path
+        if path is None:
+            path = Path(tempfile.mkdtemp())
+        else:
+            path = Path(path)
+        if not path.exists():
+            try:
+                Path(path).mkdir()
+            except Exception as e:
+                raise RuntimeError(
+                    f"Could not create cache directory for file hashes at {path}, "
+                    "please use 'PYDRA_HASH_CACHE' environment variable to control "
+                    "where it is created by default"
+                ) from e
+        return PersistentCache(path)
+
+    # FIXME: Ideally the default location would be inside one of the cache_locations
+    # but can't think of a clean way to pass the cache_locations down to this part
+    # of the code, so just dumping in the home directory instead by default. In any case,
+    # this needs to be documented
+    DEFAULT_LOCATION = Path(
+        os.environ.get("PYDRA_HASH_CACHE", Path("~").expanduser() / ".pydra-hash-cache")
+    )
+    # Set the period after which old hash cache files are cleaned up in days
+    CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEAN_UP_PERIOD", 30)
 
 
 @attrs.define
 class Cache:
     persistent: ty.Optional[PersistentCache] = attrs.field(
         default=None,
-        converter=persistent_cache_converter,
+        converter=PersistentCache.from_path,  # type: ignore[misc]
     )
     _hashes: ty.Dict[int, Hash] = attrs.field(factory=dict)
 
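A rough usage sketch of the new class-level API (paths are hypothetical; the environment variable must be set before pydra.utils.hash is imported, because DEFAULT_LOCATION is evaluated when the class body runs):

    import os

    os.environ["PYDRA_HASH_CACHE"] = "/tmp/pydra-hash-cache"  # hypothetical location

    from pydra.utils.hash import PersistentCache

    pc = PersistentCache.from_path(None)                     # falls back to a fresh temp dir
    pc = PersistentCache.from_path("/tmp/pydra-hash-cache")  # creates the directory if needed
    pc = PersistentCache.from_path(pc)                       # existing instances pass through

One caveat worth noting: os.environ.get returns a string, so if PYDRA_HASH_CACHE_CLEAN_UP_PERIOD is set, the `days > cls.CLEAN_UP_PERIOD` comparison in clean_up would compare an int with a str; coercing the lookup with int(...) would make the override safe.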
@@ -141,22 +170,14 @@ class UnhashableError(ValueError):
     """Error for objects that cannot be hashed"""
 
 
-def hash_function(obj, persistent_cache: ty.Optional[Path] = None):
+def hash_function(obj, **kwargs):
     """Generate hash of object."""
-    if persistent_cache is None:
-        # FIXME: Ideally the default location would be inside one of the cache_locations
-        # but can't think of a clean way to pass the cache_locations down to this part
-        # of the code, so just dumping in the home directory instead
-        persistent_cache = (Path("~") / ".pydra-hash-cache").expanduser()
-    try:
-        if not persistent_cache.exists():
-            persistent_cache.mkdir()
-    except Exception:
-        persistent_cache = None
-    return hash_object(obj, persistent_cache=persistent_cache).hex()
+    return hash_object(obj, **kwargs).hex()
 
 
-def hash_object(obj: object, persistent_cache: ty.Optional[Path] = None) -> Hash:
+def hash_object(
+    obj: object, persistent_cache: ty.Optional[Path] = PersistentCache.DEFAULT_LOCATION
+) -> Hash:
     """Hash an object
 
     Constructs a byte string that uniquely identifies the object,
@@ -201,7 +222,7 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash:
             tp.__module__,
             tp.__name__,
         ) + first
-        hsh = cache.persistent.get_hash(key, calc_hash)
+        hsh = cache.persistent.get_or_calculate_hash(key, calc_hash)
     else:
         hsh = calc_hash(first=first)
     logger.debug("Hash of %s object is %s", obj, hsh)
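
Taken together, the intended user-facing behaviour can be sketched as follows (the file path is illustrative, and this assumes file-type inputs take the persistent-cache branch, which per the PR title keys hashes by local path and mtime):

    from pathlib import Path
    from pydra.utils.hash import hash_function

    data = Path("/tmp/example.txt")  # hypothetical input file
    data.write_text("hello")

    h1 = hash_function(data)  # hash computed and stored in the persistent cache
    h2 = hash_function(data)  # same path and mtime: the stored hash is reused
    # h1 == h2, without re-reading the file's contents

    data.write_text("changed")  # new mtime (and contents): the cache key misses
    h3 = hash_function(data)    # recomputed, and differs because the contents changed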