Merge pull request h5py#2040 from ramonaoptics/file_alignment

Support setting the data alignment within the HDF5 file
geojunky · Feb 10, 2022 · adf2f11 · adf2f11
2 parents d570af6 + cfad0d5
commit adf2f11
Show file tree

Hide file tree

Showing 4 changed files with 181 additions and 3 deletions.
diff --git a/docs/high/file.rst b/docs/high/file.rst
@@ -367,6 +367,24 @@ chunk cache*.
 Chunks and caching are described in greater detail in the `HDF5 documentation
 <https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5>`_.
 
+.. _file_alignment:
+
+Data alignment
+--------------
+
+When creating datasets within files, it may be advantageous to align the offset
+within the file itself. This can help optimize read and write times if the data
+become aligned with the underlying hardware, or may help with parallelism with
+MPI. Unfortunately, aligning small variables to large blocks can leave a lot of
+empty space in a file. To this end, application developers are left with two
+options to tune the alignment of data within their file. The two parameters
+``alignment_threshold`` and ``alignment_interval`` in the :class:`File`
+constructor control the threshold in bytes where the data alignment policy
+takes effect and the alignment in bytes within the file. The alignment is
+measured from the end of the user block.
+
+For more information, see the official HDF5 documentation `H5P_SET_ALIGNMENT
+<https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT>`_.
 
 Reference
 ---------
@@ -418,6 +436,13 @@ Reference
     :param fs_threshold: The smallest free-space section size that the free
             space manager will track. Only allowed when creating a new file.
             The default is 1.
+    :param alignment_threshold: Together with ``alignment_interval``, this
+            property ensures that any file object greater than or equal
+            in size to the alignment threshold (in bytes) will be
+            aligned on an address which is a multiple of the alignment interval.
+    :param alignment_interval: This property should be used in conjunction with
+            ``alignment_threshold``. See the description above. For more
+            details, see :ref:`file_alignment`.
     :param kwds:    Driver-specific keywords; see :ref:`file_driver`.
 
     .. method:: __bool__()

diff --git a/h5py/_hl/files.py b/h5py/_hl/files.py
@@ -109,7 +109,8 @@ def registered_drivers():
 
 
 def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking,
-              page_buf_size, min_meta_keep, min_raw_keep, **kwds):
+              page_buf_size, min_meta_keep, min_raw_keep,
+              alignment_threshold, alignment_interval, **kwds):
     """ Set up a file access property list """
     plist = h5p.create(h5p.FILE_ACCESS)
 
@@ -123,6 +124,7 @@ def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking,
         # we default to earliest
         low, high = h5f.LIBVER_EARLIEST, h5f.LIBVER_LATEST
     plist.set_libver_bounds(low, high)
+    plist.set_alignment(alignment_threshold, alignment_interval)
 
     cache_settings = list(plist.get_cache())
     if rdcc_nslots is not None:
@@ -354,7 +356,8 @@ def swmr_mode(self, value):
     def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None, swmr=False,
                  rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, track_order=None,
                  fs_strategy=None, fs_persist=False, fs_threshold=1, fs_page_size=None,
-                 page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None, **kwds):
+                 page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None,
+                 alignment_threshold=1, alignment_interval=1, **kwds):
         """Create a new file object.
 
         See the h5py user guide for a detailed explanation of the options.
@@ -449,6 +452,19 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None
             Warning: The HDF5_USE_FILE_LOCKING environment variable can override
             this parameter.
             Only available with HDF5 >= 1.12.1 or 1.10.x >= 1.10.7.
+
+        alignment_threshold
+            Together with ``alignment_interval``, this property ensures that
+            any file object greater than or equal in size to the alignment
+            threshold (in bytes) will be aligned on an address which is a
+            multiple of the alignment interval.
+
+        alignment_interval
+            This property should be used in conjunction with
+            ``alignment_threshold``. See the description above. For more
+            details, see
+            https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT
+
         Additional keywords
             Passed on to the selected file driver.
         """
@@ -500,7 +516,10 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None
 
             with phil:
                 fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
-                                 locking, page_buf_size, min_meta_keep, min_raw_keep, **kwds)
+                                 locking, page_buf_size, min_meta_keep, min_raw_keep,
+                                 alignment_threshold=alignment_threshold,
+                                 alignment_interval=alignment_interval,
+                                 **kwds)
                 fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
                                  fs_persist=fs_persist, fs_threshold=fs_threshold,
                                  fs_page_size=fs_page_size)

diff --git a/h5py/tests/test_file_alignment.py b/h5py/tests/test_file_alignment.py
@@ -0,0 +1,103 @@
+import h5py
+from .common import TestCase
+
+
+def is_aligned(dataset, offset=4096):
+    # Here we check if the dataset is aligned
+    return dataset.id.get_offset() % offset == 0
+
+
+def dataset_name(i):
+    return f"data{i:03}"
+
+
+class TestFileAlignment(TestCase):
+    """
+        Ensure that setting the file alignment has the desired effect
+        in the internal structure.
+    """
+    def test_no_alignment_set(self):
+        fname = self.mktemp()
+        # 881 is a prime number, so hopefully this helps randomize the
+        # alignment enough.
+        # A nice even number might give a pathological case where,
+        # while we don't want the data to be aligned, it ends up aligned...
+        shape = (881,)
+
+        with h5py.File(fname, 'w') as h5file:
+            # Create up to 1000 datasets.
+            # At least one of them should be misaligned.
+            # While this isn't perfect, the only case where all 1000
+            # datasets get created is one where the data is always
+            # aligned. Therefore, during correct operation, this test is
+            # expected to finish quickly.
+            for i in range(1000):
+                dataset = h5file.create_dataset(
+                    dataset_name(i), shape, dtype='uint8')
+                # Assign data so that the dataset is instantiated in
+                # the file
+                dataset[...] = i
+                if not is_aligned(dataset):
+                    # Break early asserting that the file is not aligned
+                    break
+            else:
+                raise RuntimeError("Data was all found to be aligned to 4096")
+
+    def test_alignment_set_above_threshold(self):
+        # 2022/01/19 hmaarrfk
+        # UnitTest (TestCase) doesn't play well with pytest parametrization.
+        alignment_threshold = 1000
+        alignment_interval = 4096
+
+        for shape in [
+            (1033,),  # A prime number above the threshold
+            (1000,),  # Exactly equal to the threshold
+            (1001,),  # one above the threshold
+        ]:
+            fname = self.mktemp()
+            with h5py.File(fname, 'w',
+                           alignment_threshold=alignment_threshold,
+                           alignment_interval=alignment_interval) as h5file:
+                # Create up to 1000 datasets
+                # They are all expected to be aligned
+                for i in range(1000):
+                    dataset = h5file.create_dataset(
+                        dataset_name(i), shape, dtype='uint8')
+                    # Assign data so that the dataset is instantiated in
+                    # the file
+                    dataset[...] = i
+                    assert is_aligned(dataset, offset=alignment_interval)
+
+    def test_alignment_set_below_threshold(self):
+        # 2022/01/19 hmaarrfk
+        # UnitTest (TestCase) doesn't play well with pytest parametrization.
+        alignment_threshold = 1000
+        alignment_interval = 1024
+
+        for shape in [
+            (881,),  # A prime number below the threshold
+            (999,),  # Exactly one below the threshold
+        ]:
+            fname = self.mktemp()
+            with h5py.File(fname, 'w',
+                           alignment_threshold=alignment_threshold,
+                           alignment_interval=alignment_interval) as h5file:
+                # Create up to 1000 datasets.
+                # At least one of them should be misaligned.
+                # While this isn't perfect, the only case where all 1000
+                # datasets get created is one where the data is always
+                # aligned. Therefore, during correct operation, this
+                # test is expected to finish quickly.
+                for i in range(1000):
+                    dataset = h5file.create_dataset(
+                        dataset_name(i), shape, dtype='uint8')
+                    # Assign data so that the dataset is instantiated in
+                    # the file
+                    dataset[...] = i
+                    if not is_aligned(dataset, offset=alignment_interval):
+                        # Break early asserting that the file is not aligned
+                        break
+                else:
+                    raise RuntimeError(
+                        "Data was all found to be aligned to "
+                        f"{alignment_interval}. This is highly unlikely.")
diff --git a/news/enable_file_alignment.rst b/news/enable_file_alignment.rst
@@ -0,0 +1,31 @@
+New features
+------------
+
+* The ``File`` constructor contains two new parameters, ``alignment_threshold``
+  and ``alignment_interval``, controlling the data alignment within the HDF5
+  file.
+
+Deprecations
+------------
+
+* <news item>
+
+Exposing HDF5 functions
+-----------------------
+
+* <news item>
+
+Bug fixes
+---------
+
+* <news item>
+
+Building h5py
+-------------
+
+* <news item>
+
+Development
+-----------
+
+* <news item>