From e60e3e0dea0e900978d2d1c1fbcbbd4374b5a4a8 Mon Sep 17 00:00:00 2001 From: zxdawn Date: Fri, 28 May 2021 21:24:17 +0800 Subject: [PATCH 1/2] set bin without data to fill_value --- pyresample/bucket/__init__.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index ad0cf22e9..66c5296c0 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -139,13 +139,17 @@ def _get_indices(self): target_shape = self.target_area.shape self.idxs = self.y_idxs * target_shape[1] + self.x_idxs - def get_sum(self, data, skipna=True): + def get_sum(self, data, fill_value=np.nan, skipna=True): """Calculate sums for each bin with drop-in-a-bucket resampling. Parameters ---------- data : Numpy or Dask array Data to be binned and summed. + fill_value : float + Fill value to mark missing/invalid values in the input data, + as well as in the binned and averaged output data. + Default: np.nan skipna : boolean (optional) If True, skips NaN values for the sum calculation (similarly to Numpy's `nansum`). Buckets containing only NaN are set to zero. @@ -180,6 +184,10 @@ def get_sum(self, data, skipna=True): # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, sums) + # set bin without data to fill_value if fill_value exists + if ~np.isnan(fill_value): + sums = da.where(sums == 0, fill_value, sums) + return sums.reshape(self.target_area.shape) def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic): @@ -190,7 +198,7 @@ def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic): statistic = da.where(nan_bins > 0, np.nan, statistic) return statistic - def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None): + def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=np.nan, skipna=None): """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling.""" import dask.dataframe as dd import pandas as pd @@ -236,8 +244,8 @@ def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, s # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed statistics = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, statistics) - # set bin without data to fill value - statistics = da.where(counts == 0, fill_value, statistics) + # set bin without data to fill_value + statistics = da.where(np.in1d(counts, [0, fill_value]), fill_value, statistics) return statistics.reshape(self.target_area.shape) From f2e171b3692996497fb023d378ebe08a2ac2c1de Mon Sep 17 00:00:00 2001 From: zxdawn Date: Mon, 31 May 2021 11:04:27 +0800 Subject: [PATCH 2/2] use np.isin which supports dask --- pyresample/bucket/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index 66c5296c0..1f39cc0b6 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -245,7 +245,7 @@ def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=np.nan, statistics = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, statistics) # set bin without data to fill_value - statistics = da.where(np.in1d(counts, [0, fill_value]), fill_value, statistics) + statistics = da.where(np.isin(counts, [0, fill_value]), fill_value, statistics) return statistics.reshape(self.target_area.shape)