Merge pull request h5py#1444 from takluyver/select-mshape
Attempt to fix performance problems reading/writing with integer indexing
takluyver authored Dec 14, 2019
2 parents 2ecafd9 + 967f936 commit 21cd476
Showing 5 changed files with 148 additions and 56 deletions.
29 changes: 29 additions & 0 deletions benchmarks/benchmarks.py
@@ -26,3 +26,32 @@ def time_many_small_reads(self):
ds = self.f['a']
for i in range(10000):
arr = ds[i * 10:(i + 1) * 10]

class WritingTimeSuite:
"""Based on example in GitHub issue 492:
https://github.com/h5py/h5py/issues/492
"""
def setup(self):
self._td = TemporaryDirectory()
path = osp.join(self._td.name, 'test.h5')
self.f = h5py.File(path, 'w')
self.shape = shape = (128, 1024, 512)
self.f.create_dataset(
'a', shape=shape, dtype=np.float32, chunks=(1, shape[1], 64)
)

def teardown(self):
self.f.close()
self._td.cleanup()

def time_write_index_last_axis(self):
ds = self.f['a']
data = np.zeros(self.shape[:2])
for i in range(self.shape[2]):
ds[..., i] = data

def time_write_slice_last_axis(self):
ds = self.f['a']
data = np.zeros(self.shape[:2])
for i in range(self.shape[2]):
ds[..., i:i+1] = data[..., np.newaxis]
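
For a quick sanity check outside the benchmark harness, a throwaway script along these lines exercises the same two write patterns; the file name, shape and timing approach are illustrative, not part of the suite:

    # Rough standalone timing of the two patterns benchmarked above.
    import os.path as osp
    import time
    from tempfile import TemporaryDirectory

    import h5py
    import numpy as np

    with TemporaryDirectory() as td:
        with h5py.File(osp.join(td, 'bench.h5'), 'w') as f:
            shape = (128, 1024, 512)
            ds = f.create_dataset('a', shape=shape, dtype=np.float32,
                                  chunks=(1, shape[1], 64))
            data = np.zeros(shape[:2], dtype=np.float32)

            t0 = time.perf_counter()
            for i in range(shape[2]):
                ds[..., i] = data                       # integer index on the last axis
            t1 = time.perf_counter()
            for i in range(shape[2]):
                ds[..., i:i+1] = data[..., np.newaxis]  # equivalent length-1 slice
            t2 = time.perf_counter()

            print('integer index: %.2fs  slice: %.2fs' % (t1 - t0, t2 - t1))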
34 changes: 11 additions & 23 deletions h5py/_hl/dataset.py
@@ -693,22 +693,16 @@ def __getitem__(self, args, new_dtype=None):
selection = sel.select(self.shape, args, dsid=self.id)

if selection.nselect == 0:
return numpy.ndarray(selection.mshape, dtype=new_dtype)
return numpy.ndarray(selection.array_shape, dtype=new_dtype)

# Up-converting to (1,) so that numpy.ndarray correctly creates
# np.void rows in case of multi-field dtype. (issue 135)
single_element = selection.mshape == ()
mshape = (1,) if single_element else selection.mshape
arr = numpy.ndarray(mshape, new_dtype, order='C')

# HDF5 has a bug where if the memory shape has a different rank
# than the dataset, the read is very slow
if len(mshape) < len(self.shape):
# pad with ones
mshape = (1,)*(len(self.shape)-len(mshape)) + mshape
single_element = selection.array_shape == ()
arr_shape = (1,) if single_element else selection.array_shape
arr = numpy.ndarray(arr_shape, new_dtype, order='C')

# Perform the actual read
mspace = h5s.create_simple(mshape)
mspace = h5s.create_simple(selection.mshape)
fspace = selection.id
self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
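
The read path above now keeps two shapes apart: array_shape (scalar-indexed dimensions dropped; the shape of the NumPy array returned to the caller) and mshape (same rank as the dataset; used for the HDF5 memory dataspace, which avoids the slow rank-mismatch read). A small sketch poking at the private selections helper, purely for illustration; the file name is hypothetical:

    # Inspect the two shapes the new __getitem__ distinguishes (private API).
    import h5py
    import numpy as np
    from h5py._hl import selections as sel

    with h5py.File('shapes_demo.h5', 'w') as f:
        ds = f.create_dataset('a', shape=(10, 5, 4, 2), dtype='f4')
        s = sel.select(ds.shape, np.s_[..., 0], dsid=ds.id)
        print(s.array_shape)     # (10, 5, 4)    -> shape of the returned array
        print(s.mshape)          # (10, 5, 4, 1) -> full-rank memory dataspace shape
        print(ds[..., 0].shape)  # (10, 5, 4)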

@@ -840,26 +834,20 @@ def __setitem__(self, args, val):
# memory. In any case, if we cannot afford to create an intermediate
# array of the same size as the dataset chunk size, the user program has
# little hope to go much further. Solves h5py issue #1067
if mshape == () and selection.mshape != ():
if mshape == () and selection.array_shape != ():
if self.dtype.subdtype is not None:
raise TypeError("Scalar broadcasting is not supported for array dtypes")
if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float) >= \
numpy.prod(selection.mshape, dtype=numpy.float)):
val2 = numpy.empty(selection.mshape, dtype=val.dtype)
if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float) >=
numpy.prod(selection.array_shape, dtype=numpy.float)):
val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
else:
val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
val2[...] = val
val = val2
mshape = val.shape

# Perform the write, with broadcasting
# Be careful to pad memory shape with ones to avoid HDF5 chunking
# glitch, which kicks in for mismatched memory/file selections
if len(mshape) < len(self.shape):
mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
else:
mshape_pad = mshape
mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
mspace = h5s.create_simple(selection.expand_shape(mshape))
for fspace in selection.broadcast(mshape):
self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
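
The scalar-broadcast branch above (the issue #1067 workaround) fills an intermediate buffer sized either to the whole selection or only to its last axis, depending on how that compares with the chunk size, and then repeats it over the selection. A minimal user-level sketch of the pattern it serves, with a hypothetical file name and shapes:

    # Writing a Python scalar into a rectangular selection.
    import h5py
    import numpy as np

    with h5py.File('scalar_fill.h5', 'w') as f:
        ds = f.create_dataset('a', shape=(100, 100), dtype='f4', chunks=(10, 10))
        ds[20:30, 40:60] = 3.5   # scalar broadcast over a (10, 20) selection
        assert np.all(ds[20:30, 40:60] == 3.5)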

109 changes: 77 additions & 32 deletions h5py/_hl/selections.py
@@ -149,9 +149,20 @@ def mshape(self):
""" Shape of selection (always 1-D for this class) """
return (self.nselect,)

def broadcast(self, target_shape):
@property
def array_shape(self):
"""Shape of array to read/write (always 1-D for this class)"""
return self.mshape

# expand_shape and broadcast only really make sense for SimpleSelection
def expand_shape(self, source_shape):
if product(source_shape) != self.nselect:
raise TypeError("Broadcasting is not supported for point-wise selections")
return source_shape

def broadcast(self, source_shape):
""" Get an iterable for broadcasting """
if np.product(target_shape) != self.nselect:
if product(source_shape) != self.nselect:
raise TypeError("Broadcasting is not supported for point-wise selections")
yield self._id
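
For point-wise selections, the new expand_shape applies the same rule broadcast already enforced: the source must contain exactly nselect elements. A user-level sketch with a boolean mask (hypothetical file name):

    # Boolean-mask indexing produces a point-wise selection; the value written
    # must hold exactly as many elements as there are selected points.
    import h5py
    import numpy as np

    with h5py.File('points_demo.h5', 'w') as f:
        ds = f.create_dataset('a', data=np.zeros((4, 4), dtype='f4'))
        mask = np.eye(4, dtype=bool)            # selects 4 points
        ds[mask] = np.arange(4, dtype='f4')     # OK: 4 values for 4 points
        # ds[mask] = np.zeros(8, dtype='f4')    # TypeError: not supported for
        #                                       # point-wise selections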

@@ -213,13 +224,17 @@ class SimpleSelection(Selection):
@property
def mshape(self):
""" Shape of current selection """
return self._mshape
return self._sel[1]

@property
def array_shape(self):
return self._array_shape

def __init__(self, shape, *args, **kwds):
super(SimpleSelection, self).__init__(shape, *args, **kwds)
rank = len(self.shape)
self._sel = ((0,)*rank, self.shape, (1,)*rank, (False,)*rank)
self._mshape = self.shape
self._array_shape = self.shape

def __getitem__(self, args):

@@ -238,47 +253,67 @@ def __getitem__(self, args):

self._sel = (start, count, step, scalar)

self._mshape = tuple(x for x, y in zip(count, scalar) if not y)
# array shape drops dimensions where a scalar index was selected
self._array_shape = tuple(x for x, y in zip(count, scalar) if not y)

return self

def broadcast(self, target_shape):
""" Return an iterator over target dataspaces for broadcasting.
Follows the standard NumPy broadcasting rules against the current
selection shape (self.mshape).
"""
if self.shape == ():
if np.product(target_shape) != 1:
raise TypeError("Can't broadcast %s to scalar" % target_shape)
self._id.select_all()
yield self._id
return
def expand_shape(self, source_shape):
"""Match the dimensions of an array to be broadcast to the selection
The returned shape describes an array of the same size as the input
shape, but its dimensions are expanded to the rank of the selection,
with 1s inserted for scalar axes and for leading axes the source lacks.
E.g. with a dataset shape (10, 5, 4, 2), writing like this::
ds[..., 0] = np.ones((5, 4))
The source shape (5, 4) will expand to (1, 5, 4, 1).
Then the broadcast method below repeats that chunk 10
times to write to an effective shape of (10, 5, 4, 1).
"""
start, count, step, scalar = self._sel

rank = len(count)
target = list(target_shape)
remaining_src_dims = list(source_shape)

tshape = []
for idx in range(1,rank+1):
if len(target) == 0 or scalar[-idx]: # Skip scalar axes
tshape.append(1)
eshape = []
for idx in range(1, rank + 1):
if len(remaining_src_dims) == 0 or scalar[-idx]: # Skip scalar axes
eshape.append(1)
else:
t = target.pop()
t = remaining_src_dims.pop()
if t == 1 or count[-idx] == t:
tshape.append(t)
eshape.append(t)
else:
raise TypeError("Can't broadcast %s -> %s" % (target_shape, self.mshape))
raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape)) # array shape

if any([n > 1 for n in target]):
if any([n > 1 for n in remaining_src_dims]):
# All dimensions from target_shape should either have been popped
# to match the selection shape, or be 1.
raise TypeError("Can't broadcast %s -> %s" % (target_shape, self.mshape))
raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape)) # array shape

# We have built eshape backwards, so now reverse it
return tuple(eshape[::-1])

tshape.reverse()
tshape = tuple(tshape)

def broadcast(self, source_shape):
""" Return an iterator over target dataspaces for broadcasting.
Follows the standard NumPy broadcasting rules against the current
selection shape (self.mshape).
"""
if self.shape == ():
if product(source_shape) != 1:
raise TypeError("Can't broadcast %s to scalar" % source_shape)
self._id.select_all()
yield self._id
return

start, count, step, scalar = self._sel

rank = len(count)
tshape = self.expand_shape(source_shape)

chunks = tuple(x//y for x, y in zip(count, tshape))
nchunks = product(chunks)
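
The docstring example can be checked against the new helpers directly; a sketch using the private selection object (illustrative only, hypothetical file name):

    # Reproduce the docstring example: a (5, 4) source expands to (1, 5, 4, 1),
    # and broadcast() then yields one file dataspace per outer index.
    import h5py
    import numpy as np
    from h5py._hl import selections as sel

    with h5py.File('bcast_demo.h5', 'w') as f:
        ds = f.create_dataset('a', shape=(10, 5, 4, 2), dtype='f4')
        s = sel.select(ds.shape, np.s_[..., 0], dsid=ds.id)
        print(s.expand_shape((5, 4)))               # (1, 5, 4, 1)
        print(sum(1 for _ in s.broadcast((5, 4))))  # 10
        ds[..., 0] = np.ones((5, 4), dtype='f4')    # the write this supports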
@@ -310,9 +345,13 @@ class FancySelection(Selection):
def mshape(self):
return self._mshape

@property
def array_shape(self):
return self._array_shape

def __init__(self, shape, *args, **kwds):
super(FancySelection, self).__init__(shape, *args, **kwds)
self._mshape = self.shape
self._mshape = self._array_shape = self.shape

def __getitem__(self, args):

@@ -385,10 +424,16 @@ def __getitem__(self, args):
elif scalar[idx]:
mshape[idx] = -1

self._mshape = tuple(x for x in mshape if x >= 0)
self._mshape = tuple(abs(x) for x in mshape) # Convert -1 back to 1
self._array_shape = tuple(x for x in mshape if x >= 0)

def expand_shape(self, source_shape):
if not source_shape == self.array_shape:
raise TypeError("Broadcasting is not supported for complex selections")
return source_shape

def broadcast(self, target_shape):
if not target_shape == self.mshape:
def broadcast(self, source_shape):
if not source_shape == self.array_shape:
raise TypeError("Broadcasting is not supported for complex selections")
yield self._id
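
Fancy (list-index) selections stay strict: both expand_shape and broadcast require the source to match array_shape exactly, so NumPy-style broadcasting is rejected. A user-level sketch (hypothetical file name and shapes):

    # List-index ("fancy") selections accept only an exactly matching source.
    import h5py
    import numpy as np

    with h5py.File('fancy_demo.h5', 'w') as f:
        ds = f.create_dataset('a', shape=(6, 8), dtype='f4')
        ds[[0, 2, 5], :] = np.ones((3, 8), dtype='f4')  # OK: matches the selection
        # ds[[0, 2, 5], :] = np.ones(8, dtype='f4')     # TypeError: broadcasting is
        #                                               # not supported for complex
        #                                               # selections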

2 changes: 1 addition & 1 deletion h5py/_hl/vds.py
@@ -97,7 +97,7 @@ def __init__(self, path_or_dataset, name=None,

@property
def shape(self):
return self.sel.mshape
return self.sel.array_shape

def __getitem__(self, key):
tmp = copy(self)
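
With this change a sliced VirtualSource reports array_shape, i.e. the shape a read of that selection would produce. A brief sketch; the source file and dataset names are hypothetical:

    # Scalar-indexed axes are dropped from the sliced VirtualSource's shape,
    # mirroring what Dataset.__getitem__ would return for the same key.
    import h5py

    vs = h5py.VirtualSource('raw.h5', 'data', shape=(100, 10))
    print(vs.shape)         # (100, 10)
    print(vs[:, 0].shape)   # (100,)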
30 changes: 30 additions & 0 deletions news/select-mshape.rst
@@ -0,0 +1,30 @@
New features
------------

* <news item>

Deprecations
------------

* <news item>

Exposing HDF5 functions
-----------------------

* <news item>

Bug fixes
---------

* Fix pathologically slow reading/writing in certain conditions with integer
indexing (:issue:`492`).

Building h5py
-------------

* <news item>

Development
-----------

* <news item>
