From cfad0d516f7a98190eace0463e76a0a83d93df33 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 18 Jan 2022 18:32:01 -0500 Subject: [PATCH] Add options to the File constructor to set the data alignment --- docs/high/file.rst | 25 ++++++++ h5py/_hl/files.py | 25 +++++++- h5py/tests/test_file_alignment.py | 103 ++++++++++++++++++++++++++++++ news/enable_file_alignment.rst | 31 +++++++++ 4 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 h5py/tests/test_file_alignment.py create mode 100644 news/enable_file_alignment.rst diff --git a/docs/high/file.rst b/docs/high/file.rst index fcfd17310..fcc827153 100644 --- a/docs/high/file.rst +++ b/docs/high/file.rst @@ -367,6 +367,24 @@ chunk cache*. Chunks and caching are described in greater detail in the `HDF5 documentation `_. +.. _file_alignment: + +Data alignment +-------------- + +When creating datasets within files, it may be advantageous to align the offset +within the file itself. This can help optimize read and write times if the data +become aligned with the underlying hardware, or may help with parallelism with +MPI. Unfortunately, aligning small variables to large blocks can leave a lot of +empty space in a file. To this effect, application developers are left with two +options to tune the alignment of data within their file. The two variables +``alignment_threshold`` and ``alignment_interval`` in the :class:`File` +constructor help control the threshold in bytes where the data alignment policy +takes effect and the alignment in bytes within the file. The alignment is +measured from the end of the user block. + +For more information, see the official HDF5 documentation `H5P_SET_ALIGNMENT +<https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT>`_. Reference --------- @@ -418,6 +436,13 @@ Reference :param fs_threshold: The smallest free-space section size that the free space manager will track. Only allowed when creating a new file. The default is 1. 
+ :param alignment_threshold: Together with ``alignment_interval``, this + property ensures that any file object greater than or equal + in size to the alignment threshold (in bytes) will be + aligned on an address which is a multiple of the alignment interval. + :param alignment_interval: This property should be used in conjunction with + ``alignment_threshold``. See the description above. For more + details, see :ref:`file_alignment`. :param kwds: Driver-specific keywords; see :ref:`file_driver`. .. method:: __bool__() diff --git a/h5py/_hl/files.py b/h5py/_hl/files.py index 16bf08a3e..3bc787937 100644 --- a/h5py/_hl/files.py +++ b/h5py/_hl/files.py @@ -109,7 +109,8 @@ def registered_drivers(): def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking, - page_buf_size, min_meta_keep, min_raw_keep, **kwds): + page_buf_size, min_meta_keep, min_raw_keep, + alignment_threshold, alignment_interval, **kwds): """ Set up a file access property list """ plist = h5p.create(h5p.FILE_ACCESS) @@ -123,6 +124,7 @@ def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking, # we default to earliest low, high = h5f.LIBVER_EARLIEST, h5f.LIBVER_LATEST plist.set_libver_bounds(low, high) + plist.set_alignment(alignment_threshold, alignment_interval) cache_settings = list(plist.get_cache()) if rdcc_nslots is not None: @@ -354,7 +356,8 @@ def swmr_mode(self, value): def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None, swmr=False, rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, track_order=None, fs_strategy=None, fs_persist=False, fs_threshold=1, fs_page_size=None, - page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None, **kwds): + page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None, + alignment_threshold=1, alignment_interval=1, **kwds): """Create a new file object. See the h5py user guide for a detailed explanation of the options. 
@@ -449,6 +452,19 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None Warning: The HDF5_USE_FILE_LOCKING environment variable can override this parameter. Only available with HDF5 >= 1.12.1 or 1.10.x >= 1.10.7. + + alignment_threshold + Together with ``alignment_interval``, this property ensures that + any file object greater than or equal in size to the alignment + threshold (in bytes) will be aligned on an address which is a + multiple of the alignment interval. + + alignment_interval + This property should be used in conjunction with + ``alignment_threshold``. See the description above. For more + details, see + https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT + Additional keywords Passed on to the selected file driver. """ @@ -500,7 +516,10 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None with phil: fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, - locking, page_buf_size, min_meta_keep, min_raw_keep, **kwds) + locking, page_buf_size, min_meta_keep, min_raw_keep, + alignment_threshold=alignment_threshold, + alignment_interval=alignment_interval, + **kwds) fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy, fs_persist=fs_persist, fs_threshold=fs_threshold, fs_page_size=fs_page_size) diff --git a/h5py/tests/test_file_alignment.py b/h5py/tests/test_file_alignment.py new file mode 100644 index 000000000..59d7758cf --- /dev/null +++ b/h5py/tests/test_file_alignment.py @@ -0,0 +1,103 @@ +import h5py +from .common import TestCase + + +def is_aligned(dataset, offset=4096): + # Here we check if the dataset is aligned + return dataset.id.get_offset() % offset == 0 + + +def dataset_name(i): + return f"data{i:03}" + + +class TestFileAlignment(TestCase): + """ + Ensure that setting the file alignment has the desired effect + in the internal structure. 
+ """ + def test_no_alignment_set(self): + fname = self.mktemp() + # 881 is a prime number, so hopefully this helps randomize the alignment + # enough + # A nice even number might give a pathological case where, + # while we don't want the data to be aligned, it ends up aligned... + shape = (881,) + + with h5py.File(fname, 'w') as h5file: + # Create up to 1000 datasets + # At least one of them should be misaligned. + # While this isn't perfect, it seems sufficient in practice. + # The case where 1000 datasets get created is one where the data + # is aligned. Therefore, during correct operation, this test is + # expected to finish quickly + for i in range(1000): + dataset = h5file.create_dataset( + dataset_name(i), shape, dtype='uint8') + # Assign data so that the dataset is instantiated in + # the file + dataset[...] = i + if not is_aligned(dataset): + # Break early asserting that the file is not aligned + break + else: + raise RuntimeError("Data was all found to be aligned to 4096") + + def test_alignment_set_above_threshold(self): + # 2022/01/19 hmaarrfk + # UnitTest (TestCase) doesn't play well with pytest parametrization. + alignment_threshold = 1000 + alignment_interval = 4096 + + for shape in [ + (1033,), # A prime number above the threshold + (1000,), # Exactly equal to the threshold + (1001,), # one above the threshold + ]: + fname = self.mktemp() + with h5py.File(fname, 'w', + alignment_threshold=alignment_threshold, + alignment_interval=alignment_interval) as h5file: + # Create up to 1000 datasets + # They are all expected to be aligned + for i in range(1000): + dataset = h5file.create_dataset( + dataset_name(i), shape, dtype='uint8') + # Assign data so that the dataset is instantiated in + # the file + dataset[...] = i + assert is_aligned(dataset, offset=alignment_interval) + + def test_alignment_set_below_threshold(self): + # 2022/01/19 hmaarrfk + # UnitTest (TestCase) doesn't play well with pytest parametrization. 
+ alignment_threshold = 1000 + alignment_interval = 1024 + + for shape in [ + (881,), # A prime number below the threshold + (999,), # Exactly one below the threshold + ]: + fname = self.mktemp() + with h5py.File(fname, 'w', + alignment_threshold=alignment_threshold, + alignment_interval=alignment_interval) as h5file: + # Create up to 1000 datasets + # At least one of them should be misaligned. + # While this isn't perfect, it seems sufficient in practice. + # The case where 1000 datasets get created is one where the + # data is aligned. Therefore, during correct operation, this + # test is expected to finish quickly + for i in range(1000): + dataset = h5file.create_dataset( + dataset_name(i), shape, dtype='uint8') + # Assign data so that the dataset is instantiated in + # the file + dataset[...] = i + if not is_aligned(dataset, offset=alignment_interval): + # Break early asserting that the file is not aligned + break + else: + raise RuntimeError( + "Data was all found to be aligned to " + f"{alignment_interval}. This is highly unlikely.") diff --git a/news/enable_file_alignment.rst b/news/enable_file_alignment.rst new file mode 100644 index 000000000..98308d51a --- /dev/null +++ b/news/enable_file_alignment.rst @@ -0,0 +1,31 @@ +New features +------------ + +* The ``File`` constructor contains two new parameters ``alignment_threshold``, + and ``alignment_interval`` controlling the data alignment within the HDF5 + file. + +Deprecations +------------ + +* + +Exposing HDF5 functions +----------------------- + +* + +Bug fixes +--------- + +* + +Building h5py +------------- + +* + +Development +----------- + +*