Skip to content

Commit

Permalink
Merge pull request h5py#2040 from ramonaoptics/file_alignment
Browse files Browse the repository at this point in the history
Support setting the data alignment within the HDF5 file
  • Loading branch information
takluyver authored Feb 10, 2022
2 parents d570af6 + cfad0d5 commit adf2f11
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 3 deletions.
25 changes: 25 additions & 0 deletions docs/high/file.rst
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,24 @@ chunk cache*.
Chunks and caching are described in greater detail in the `HDF5 documentation
<https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5>`_.

.. _file_alignment:

Data alignment
--------------

When creating datasets within files, it may be advantageous to align the offset
within the file itself. This can help optimize read and write times if the data
become aligned with the underlying hardware, or may help with parallelism with
MPI. Unfortunately, aligning small variables to large blocks can leave a lot of
empty space in a file. To balance this trade-off, application developers have
two options to tune the alignment of data within their file. The two parameters
``alignment_threshold`` and ``alignment_interval`` in the :class:`File`
constructor control, respectively, the minimum size in bytes at which the data
alignment policy takes effect and the alignment in bytes within the file. The
alignment is measured from the end of the user block.

For more information, see the official HDF5 documentation `H5P_SET_ALIGNMENT
<https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT>`_.

Reference
---------
Expand Down Expand Up @@ -418,6 +436,13 @@ Reference
:param fs_threshold: The smallest free-space section size that the free
space manager will track. Only allowed when creating a new file.
The default is 1.
:param alignment_threshold: Together with ``alignment_interval``, this
property ensures that any file object greater than or equal
in size to the alignment threshold (in bytes) will be
aligned on an address which is a multiple of the alignment interval.
:param alignment_interval: This property should be used in conjunction with
``alignment_threshold``. See the description above. For more
details, see :ref:`file_alignment`.
:param kwds: Driver-specific keywords; see :ref:`file_driver`.

.. method:: __bool__()
Expand Down
25 changes: 22 additions & 3 deletions h5py/_hl/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ def registered_drivers():


def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking,
page_buf_size, min_meta_keep, min_raw_keep, **kwds):
page_buf_size, min_meta_keep, min_raw_keep,
alignment_threshold, alignment_interval, **kwds):
""" Set up a file access property list """
plist = h5p.create(h5p.FILE_ACCESS)

Expand All @@ -123,6 +124,7 @@ def make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0, locking,
# we default to earliest
low, high = h5f.LIBVER_EARLIEST, h5f.LIBVER_LATEST
plist.set_libver_bounds(low, high)
plist.set_alignment(alignment_threshold, alignment_interval)

cache_settings = list(plist.get_cache())
if rdcc_nslots is not None:
Expand Down Expand Up @@ -354,7 +356,8 @@ def swmr_mode(self, value):
def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None, swmr=False,
rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, track_order=None,
fs_strategy=None, fs_persist=False, fs_threshold=1, fs_page_size=None,
page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None, **kwds):
page_buf_size=None, min_meta_keep=0, min_raw_keep=0, locking=None,
alignment_threshold=1, alignment_interval=1, **kwds):
"""Create a new file object.
See the h5py user guide for a detailed explanation of the options.
Expand Down Expand Up @@ -449,6 +452,19 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None
Warning: The HDF5_USE_FILE_LOCKING environment variable can override
this parameter.
Only available with HDF5 >= 1.12.1 or 1.10.x >= 1.10.7.
alignment_threshold
Together with ``alignment_interval``, this property ensures that
any file object greater than or equal in size to the alignement
threshold (in bytes) will be aligned on an address which is a
multiple of alignment interval.
alignment_interval
This property should be used in conjunction with
``alignment_threshold``. See the description above. For more
details, see
https://portal.hdfgroup.org/display/HDF5/H5P_SET_ALIGNMENT
Additional keywords
Passed on to the selected file driver.
"""
Expand Down Expand Up @@ -500,7 +516,10 @@ def __init__(self, name, mode='r', driver=None, libver=None, userblock_size=None

with phil:
fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
locking, page_buf_size, min_meta_keep, min_raw_keep, **kwds)
locking, page_buf_size, min_meta_keep, min_raw_keep,
alignment_threshold=alignment_threshold,
alignment_interval=alignment_interval,
**kwds)
fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
fs_persist=fs_persist, fs_threshold=fs_threshold,
fs_page_size=fs_page_size)
Expand Down
103 changes: 103 additions & 0 deletions h5py/tests/test_file_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import h5py
from .common import TestCase


def is_aligned(dataset, offset=4096):
    """Report whether the dataset's offset in the file is a multiple of ``offset``.

    The offset is read from ``dataset.id.get_offset()``.
    """
    position = dataset.id.get_offset()
    remainder = position % offset
    return remainder == 0


def dataset_name(i):
    """Return the name of the ``i``-th test dataset, zero-padded to three digits."""
    return "data{:03}".format(i)


class TestFileAlignment(TestCase):
    """
    Ensure that setting the file alignment has the desired effect
    in the internal structure.
    """

    @staticmethod
    def _create_filled_dataset(h5file, index, shape):
        """Create the ``index``-th uint8 dataset in ``h5file`` and write to it.

        Returns the newly created dataset.
        """
        dataset = h5file.create_dataset(
            dataset_name(index), shape, dtype='uint8')
        # Assign data so that the dataset is instantiated in the file.
        # The value is wrapped into the uint8 range: ``index`` goes up to
        # 999, and NumPy rejects out-of-bound Python integers for integer
        # dtypes (deprecated in NumPy 1.24, an error in NumPy 2).
        dataset[...] = index % 256
        return dataset

    def test_no_alignment_set(self):
        """Without alignment settings, some dataset should be misaligned."""
        fname = self.mktemp()
        # 881 is a prime number, which hopefully randomizes the dataset
        # offsets enough. A nice even number might give a pathological case
        # where the data ends up aligned although no alignment was requested.
        shape = (881,)

        with h5py.File(fname, 'w') as h5file:
            # Create up to 1000 datasets; at least one of them should be
            # misaligned. This check isn't perfect, but all 1000 datasets
            # only get created when every one of them happens to be aligned,
            # so during correct operation this test finishes quickly.
            for i in range(1000):
                dataset = self._create_filled_dataset(h5file, i, shape)
                if not is_aligned(dataset):
                    # Found a misaligned dataset: nothing more to check
                    break
            else:
                # AssertionError so this is reported as a test failure
                raise AssertionError(
                    "Data was all found to be aligned to 4096")

    def test_alignment_set_above_threshold(self):
        """Datasets at or above the threshold must all be aligned."""
        # 2022/01/19 hmaarrfk
        # UnitTest (TestCase) doesn't play well with pytest parametrization.
        alignment_threshold = 1000
        alignment_interval = 4096

        for shape in [
            (1033,),  # A prime number above the threshold
            (1000,),  # Exactly equal to the threshold
            (1001,),  # One above the threshold
        ]:
            fname = self.mktemp()
            with h5py.File(fname, 'w',
                           alignment_threshold=alignment_threshold,
                           alignment_interval=alignment_interval) as h5file:
                # Create 1000 datasets; they are all expected to be aligned
                for i in range(1000):
                    dataset = self._create_filled_dataset(h5file, i, shape)
                    assert is_aligned(dataset, offset=alignment_interval), (
                        f"{dataset_name(i)} with shape {shape} is not "
                        f"aligned to {alignment_interval}")

    def test_alignment_set_below_threshold(self):
        """Datasets below the threshold should not all end up aligned."""
        # 2022/01/19 hmaarrfk
        # UnitTest (TestCase) doesn't play well with pytest parametrization.
        alignment_threshold = 1000
        alignment_interval = 1024

        for shape in [
            (881,),  # A prime number below the threshold
            (999,),  # Exactly one below the threshold
        ]:
            fname = self.mktemp()
            with h5py.File(fname, 'w',
                           alignment_threshold=alignment_threshold,
                           alignment_interval=alignment_interval) as h5file:
                # Create up to 1000 datasets; at least one of them should be
                # misaligned. This check isn't perfect, but all 1000 datasets
                # only get created when every one of them happens to be
                # aligned, so during correct operation this test finishes
                # quickly.
                for i in range(1000):
                    dataset = self._create_filled_dataset(h5file, i, shape)
                    if not is_aligned(dataset, offset=alignment_interval):
                        # Found a misaligned dataset: nothing more to check
                        break
                else:
                    # AssertionError so this is reported as a test failure
                    raise AssertionError(
                        "Data was all found to be aligned to "
                        f"{alignment_interval}. This is highly unlikely.")
31 changes: 31 additions & 0 deletions news/enable_file_alignment.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
New features
------------

* The ``File`` constructor accepts two new parameters, ``alignment_threshold``
and ``alignment_interval``, controlling the data alignment within the HDF5
file.

Deprecations
------------

* <news item>

Exposing HDF5 functions
-----------------------

* <news item>

Bug fixes
---------

* <news item>

Building h5py
-------------

* <news item>

Development
-----------

* <news item>

0 comments on commit adf2f11

Please sign in to comment.