diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 43c1e12a5..acc1e6efa 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -26,3 +26,32 @@ def time_many_small_reads(self): ds = self.f['a'] for i in range(10000): arr = ds[i * 10:(i + 1) * 10] + +class WritingTimeSuite: + """Based on example in GitHub issue 492: + https://github.com/h5py/h5py/issues/492 + """ + def setup(self): + self._td = TemporaryDirectory() + path = osp.join(self._td.name, 'test.h5') + self.f = h5py.File(path, 'w') + self.shape = shape = (128, 1024, 512) + self.f.create_dataset( + 'a', shape=shape, dtype=np.float32, chunks=(1, shape[1], 64) + ) + + def teardown(self): + self.f.close() + self._td.cleanup() + + def time_write_index_last_axis(self): + ds = self.f['a'] + data = np.zeros(self.shape[:2]) + for i in range(self.shape[2]): + ds[..., i] = data + + def time_write_slice_last_axis(self): + ds = self.f['a'] + data = np.zeros(self.shape[:2]) + for i in range(self.shape[2]): + ds[..., i:i+1] = data[..., np.newaxis] diff --git a/h5py/_hl/dataset.py b/h5py/_hl/dataset.py index c84460c8b..b94a48c7a 100644 --- a/h5py/_hl/dataset.py +++ b/h5py/_hl/dataset.py @@ -693,22 +693,16 @@ def __getitem__(self, args, new_dtype=None): selection = sel.select(self.shape, args, dsid=self.id) if selection.nselect == 0: - return numpy.ndarray(selection.mshape, dtype=new_dtype) + return numpy.ndarray(selection.array_shape, dtype=new_dtype) # Up-converting to (1,) so that numpy.ndarray correctly creates # np.void rows in case of multi-field dtype. 
(issue 135) - single_element = selection.mshape == () - mshape = (1,) if single_element else selection.mshape - arr = numpy.ndarray(mshape, new_dtype, order='C') - - # HDF5 has a bug where if the memory shape has a different rank - # than the dataset, the read is very slow - if len(mshape) < len(self.shape): - # pad with ones - mshape = (1,)*(len(self.shape)-len(mshape)) + mshape + single_element = selection.array_shape == () + arr_shape = (1,) if single_element else selection.array_shape + arr = numpy.ndarray(arr_shape, new_dtype, order='C') # Perform the actual read - mspace = h5s.create_simple(mshape) + mspace = h5s.create_simple(selection.mshape) fspace = selection.id self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) @@ -840,26 +834,20 @@ def __setitem__(self, args, val): # memory. In any case, if we cannot afford to create an intermediate # array of the same size as the dataset chunk size, the user program has # little hope to go much further. Solves h5py isue #1067 - if mshape == () and selection.mshape != (): + if mshape == () and selection.array_shape != (): if self.dtype.subdtype is not None: raise TypeError("Scalar broadcasting is not supported for array dtypes") - if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float) >= \ - numpy.prod(selection.mshape, dtype=numpy.float)): - val2 = numpy.empty(selection.mshape, dtype=val.dtype) + if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float) >= + numpy.prod(selection.array_shape, dtype=numpy.float)): + val2 = numpy.empty(selection.array_shape, dtype=val.dtype) else: - val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype) + val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) val2[...] 
= val val = val2 mshape = val.shape # Perform the write, with broadcasting - # Be careful to pad memory shape with ones to avoid HDF5 chunking - # glitch, which kicks in for mismatched memory/file selections - if len(mshape) < len(self.shape): - mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape - else: - mshape_pad = mshape - mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad)) + mspace = h5s.create_simple(selection.expand_shape(mshape)) for fspace in selection.broadcast(mshape): self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) diff --git a/h5py/_hl/selections.py b/h5py/_hl/selections.py index a730d3465..505ed2719 100644 --- a/h5py/_hl/selections.py +++ b/h5py/_hl/selections.py @@ -149,9 +149,20 @@ def mshape(self): """ Shape of selection (always 1-D for this class) """ return (self.nselect,) - def broadcast(self, target_shape): + @property + def array_shape(self): + """Shape of array to read/write (always 1-D for this class)""" + return self.mshape + + # expand_shape and broadcast only really make sense for SimpleSelection + def expand_shape(self, source_shape): + if product(source_shape) != self.nselect: + raise TypeError("Broadcasting is not supported for point-wise selections") + return source_shape + + def broadcast(self, source_shape): """ Get an iterable for broadcasting """ - if np.product(target_shape) != self.nselect: + if product(source_shape) != self.nselect: raise TypeError("Broadcasting is not supported for point-wise selections") yield self._id @@ -213,13 +224,17 @@ class SimpleSelection(Selection): @property def mshape(self): """ Shape of current selection """ - return self._mshape + return self._sel[1] + + @property + def array_shape(self): + return self._array_shape def __init__(self, shape, *args, **kwds): super(SimpleSelection, self).__init__(shape, *args, **kwds) rank = len(self.shape) self._sel = ((0,)*rank, self.shape, (1,)*rank, (False,)*rank) - self._mshape = self.shape + self._array_shape = self.shape 
def __getitem__(self, args): @@ -238,47 +253,67 @@ def __getitem__(self, args): self._sel = (start, count, step, scalar) - self._mshape = tuple(x for x, y in zip(count, scalar) if not y) + # array shape drops dimensions where a scalar index was selected + self._array_shape = tuple(x for x, y in zip(count, scalar) if not y) return self + def expand_shape(self, source_shape): + """Match the dimensions of an array to be broadcast to the selection - def broadcast(self, target_shape): - """ Return an iterator over target dataspaces for broadcasting. + The returned shape describes an array of the same size as the input + shape, but its dimensions are padded with length-1 axes so its rank matches the selection. - Follows the standard NumPy broadcasting rules against the current - selection shape (self.mshape). - """ - if self.shape == (): - if np.product(target_shape) != 1: - raise TypeError("Can't broadcast %s to scalar" % target_shape) - self._id.select_all() - yield self._id - return + E.g. with a dataset shape (10, 5, 4, 2), writing like this:: + + ds[..., 0] = np.ones((5, 4)) + The source shape (5, 4) will expand to (1, 5, 4, 1). + Then the broadcast method below repeats that chunk 10 + times to write to an effective shape of (10, 5, 4, 1).
+ """ start, count, step, scalar = self._sel rank = len(count) - target = list(target_shape) + remaining_src_dims = list(source_shape) - tshape = [] - for idx in range(1,rank+1): - if len(target) == 0 or scalar[-idx]: # Skip scalar axes - tshape.append(1) + eshape = [] + for idx in range(1, rank + 1): + if len(remaining_src_dims) == 0 or scalar[-idx]: # Skip scalar axes + eshape.append(1) else: - t = target.pop() + t = remaining_src_dims.pop() if t == 1 or count[-idx] == t: - tshape.append(t) + eshape.append(t) else: - raise TypeError("Can't broadcast %s -> %s" % (target_shape, self.mshape)) + raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape)) # array shape - if any([n > 1 for n in target]): + if any([n > 1 for n in remaining_src_dims]): # All dimensions from target_shape should either have been popped # to match the selection shape, or be 1. - raise TypeError("Can't broadcast %s -> %s" % (target_shape, self.mshape)) + raise TypeError("Can't broadcast %s -> %s" % (source_shape, self.array_shape)) # array shape + + # We have built eshape backwards, so now reverse it + return tuple(eshape[::-1]) - tshape.reverse() - tshape = tuple(tshape) + + def broadcast(self, source_shape): + """ Return an iterator over target dataspaces for broadcasting. + + Follows the standard NumPy broadcasting rules against the current + selection shape (self.mshape). 
+ """ + if self.shape == (): + if product(source_shape) != 1: + raise TypeError("Can't broadcast %s to scalar" % source_shape) + self._id.select_all() + yield self._id + return + + start, count, step, scalar = self._sel + + rank = len(count) + tshape = self.expand_shape(source_shape) chunks = tuple(x//y for x, y in zip(count, tshape)) nchunks = product(chunks) @@ -310,9 +345,13 @@ class FancySelection(Selection): def mshape(self): return self._mshape + @property + def array_shape(self): + return self._array_shape + def __init__(self, shape, *args, **kwds): super(FancySelection, self).__init__(shape, *args, **kwds) - self._mshape = self.shape + self._mshape = self._array_shape = self.shape def __getitem__(self, args): @@ -385,10 +424,16 @@ def __getitem__(self, args): elif scalar[idx]: mshape[idx] = -1 - self._mshape = tuple(x for x in mshape if x >= 0) + self._mshape = tuple(abs(x) for x in mshape) # Convert -1 back to 1 + self._array_shape = tuple(x for x in mshape if x >= 0) + + def expand_shape(self, source_shape): + if not source_shape == self.array_shape: + raise TypeError("Broadcasting is not supported for complex selections") + return source_shape - def broadcast(self, target_shape): - if not target_shape == self.mshape: + def broadcast(self, source_shape): + if not source_shape == self.array_shape: raise TypeError("Broadcasting is not supported for complex selections") yield self._id diff --git a/h5py/_hl/vds.py b/h5py/_hl/vds.py index f3b84ed0e..41e7264f0 100644 --- a/h5py/_hl/vds.py +++ b/h5py/_hl/vds.py @@ -97,7 +97,7 @@ def __init__(self, path_or_dataset, name=None, @property def shape(self): - return self.sel.mshape + return self.sel.array_shape def __getitem__(self, key): tmp = copy(self) diff --git a/news/select-mshape.rst b/news/select-mshape.rst new file mode 100644 index 000000000..1d7cbd457 --- /dev/null +++ b/news/select-mshape.rst @@ -0,0 +1,30 @@ +New features +------------ + +* + +Deprecations +------------ + +* + +Exposing HDF5 functions 
+----------------------- + +* + +Bug fixes +--------- + +* Fix pathologically slow reading/writing in certain conditions with integer + indexing (:issue:`492`). + +Building h5py +------------- + +* + +Development +----------- + +*