OpenMined · dvadym · Oct 17, 2024 · Oct 16, 2024 · Oct 17, 2024 · RamSaw
diff --git a/pipeline_dp/dataset_histograms/histogram_error_estimator.py b/pipeline_dp/dataset_histograms/histogram_error_estimator.py
@@ -20,21 +20,25 @@
 import bisect
 
 
-class CountErrorEstimator:
+class ErrorEstimator:
     """Estimator of the error from DP pipeline from DatasetHistograms.
 
     The recommended way to create this object is to use create_error_estimator.
-    It works only for COUNT and PRIVACY_ID_COUNT.
 
     Partition selection error is not implemented yet. Now only contribution
     bounding and noise error are taken into consideration.
     """
 
-    def __init__(self, epsilon: float, delta: Optional[float],
-                 metric: pipeline_dp.Metric, noise: pipeline_dp.NoiseKind,
-                 l0_ratios_dropped: Sequence[Tuple[int, float]],
-                 linf_ratios_dropped: Sequence[Tuple[int, float]],
-                 partition_histogram: hist.Histogram):
+    def __init__(
+        self,
+        epsilon: float,
+        delta: Optional[float],
+        metric: pipeline_dp.Metric,
+        noise: pipeline_dp.NoiseKind,
+        l0_ratios_dropped: Sequence[Tuple[int, float]],
+        linf_ratios_dropped: Sequence[Tuple[int, float]],
+        partition_histogram: hist.Histogram,
+    ):
         self._base_std = self._get_stddev_for_dp_mechanism(
             epsilon, delta, noise)
         self._metric = metric
@@ -62,7 +66,8 @@ def _get_stddev_for_dp_mechanism(
 
     def estimate_rmse(self,
                       l0_bound: int,
-                      linf_bound: Optional[int] = None) -> float:
+                      linf_bound: Optional[int] = None,
+                      to_print=False) -> float:
         """Estimates RMSE error for given l0 and linf bounds.
 
         Estimation algorithm is the following:
@@ -84,18 +89,22 @@ def estimate_rmse(self,
             linf_bound: linf contribution bound, AKA for COUNT as
               max_contributions_per_partition. This parameter is ignored for
               PRIVACY_ID_COUNT
+
         Returns:
             the estimated error.
         """
-        if self._metric == pipeline_dp.Metrics.COUNT:
+        if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
             if linf_bound is None:
                 raise ValueError("linf must be given for COUNT")
         ratio_dropped_l0 = self.get_ratio_dropped_l0(l0_bound)
         ratio_dropped_linf = 0
-        if self._metric == pipeline_dp.Metrics.COUNT:
+        if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
             ratio_dropped_linf = self.get_ratio_dropped_linf(linf_bound)
         ratio_dropped = 1 - (1 - ratio_dropped_l0) * (1 - ratio_dropped_linf)
         stddev = self._get_stddev(l0_bound, linf_bound)
+        if to_print:
+            print(f"{ratio_dropped_l0=},"
+                  f" {ratio_dropped_linf=} {ratio_dropped=} {stddev=}")
         return _estimate_rmse_impl(ratio_dropped, stddev,
                                    self._partition_histogram)
 
@@ -133,23 +142,29 @@ def _get_stddev(self,
         return self._base_std * math.sqrt(l0_bound) * linf_bound
 
 
-def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
-                           delta: Optional[float], metric: pipeline_dp.Metric,
-                           noise: pipeline_dp.NoiseKind) -> CountErrorEstimator:
+def create_estimator_for_count_privacy_id_count(
+    histograms: hist.DatasetHistograms,
+    epsilon: float,
+    delta: Optional[float],
+    metric: pipeline_dp.Metric,
+    noise: pipeline_dp.NoiseKind,
+) -> ErrorEstimator:
     """Creates histogram based error estimator for COUNT or PRIVACY_ID_COUNT.
 
     Args:
         histograms: dataset histograms.
         epsilon: epsilon parameter of the DP mechanism for adding noise.
-        delta: delta parameter of the DP mechanism for adding noise (must be
-            None for Laplace noise).
+        delta: delta parameter of the DP mechanism for adding noise (must be None
+          for Laplace noise).
         metric: DP aggregation, COUNT or PRIVACY_ID_COUNT.
         noise: type of DP noise.
+
     Returns:
         Error estimator.
     """
     if metric not in [
-            pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
+            pipeline_dp.Metrics.COUNT,
+            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
     ]:
         raise ValueError(
             f"Only COUNT and PRIVACY_ID_COUNT are supported, but metric={metric}"
@@ -162,8 +177,15 @@ def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
         partition_histogram = histograms.count_per_partition_histogram
     else:
         partition_histogram = histograms.count_privacy_id_per_partition
-    return CountErrorEstimator(epsilon, delta, metric, noise, l0_ratios_dropped,
-                               linf_ratios_dropped, partition_histogram)
+    return ErrorEstimator(
+        epsilon,
+        delta,
+        metric,
+        noise,
+        l0_ratios_dropped,
+        linf_ratios_dropped,
+        partition_histogram,
+    )
 
 
 def _estimate_rmse_impl(ratio_dropped: float, std: float,
@@ -176,3 +198,45 @@ def _estimate_rmse_impl(ratio_dropped: float, std: float,
                          std**2)
         sum_rmse += bin.count * rmse
     return sum_rmse / num_partitions
+
+
+def create_estimator_for_sum(histograms: hist.DatasetHistograms,
+                             epsilon: float,
+                             delta: Optional[float],
+                             noise: pipeline_dp.NoiseKind,
+                             sum_index: int = 0) -> ErrorEstimator:
+    """Creates histogram based error estimator for SUM.
+
+    Args:
+        histograms: dataset histograms.
+        epsilon: epsilon parameter of the DP mechanism for adding noise.
+        delta: delta parameter of the DP mechanism for adding noise (must be None
+          for Laplace noise).
+        noise: type of DP noise.
+        sum_index: index of the SUM to use; ignored if there is a single SUM.
+
+    Returns:
+        Error estimator.
+    """
+    l0_ratios_dropped = hist.compute_ratio_dropped(
+        histograms.l0_contributions_histogram)
+    if isinstance(histograms.linf_sum_contributions_histogram, hist.Histogram):
+        # single SUM aggregation
+        linf_sum_histograms = histograms.linf_sum_contributions_histogram
+        partition_histogram = histograms.sum_per_partition_histogram
+    else:  # multiple SUM aggregations
+        linf_sum_histograms = histograms.linf_sum_contributions_histogram[
+            sum_index]
+        partition_histogram = histograms.sum_per_partition_histogram[sum_index]
+
+    linf_ratios_dropped = hist.compute_ratio_dropped(linf_sum_histograms)
+
+    return ErrorEstimator(
+        epsilon,
+        delta,
+        pipeline_dp.Metrics.SUM,
+        noise,
+        l0_ratios_dropped,
+        linf_ratios_dropped,
+        partition_histogram,
+    )
diff --git a/tests/dataset_histograms/histogram_error_estimator_test.py b/tests/dataset_histograms/histogram_error_estimator_test.py
@@ -49,9 +49,18 @@ def _get_estimator(
         epsilon: float = 2**0.5 / 2,
         delta: Optional[float] = None,
     ):
-        return histogram_error_estimator.create_error_estimator(
+        return histogram_error_estimator.create_estimator_for_count_privacy_id_count(
             self._get_histograms(), epsilon, delta, metric, noise_kind)
 
+    def _get_estimator_for_sum(
+        self,
+        noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
+        epsilon: float = 2**0.5 / 2,
+        delta: Optional[float] = None,
+    ):
+        return histogram_error_estimator.create_estimator_for_sum(
+            self._get_histograms(), epsilon, delta, noise_kind)
+
     @parameterized.named_parameters(
         dict(testcase_name='count_gaussian',
              metric=pipeline_dp.Metrics.COUNT,
@@ -112,6 +121,15 @@ def test_get_ratio_dropped_l0(self, l0_bound, expected):
         self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
                                expected)
 
+    @parameterized.parameters((0, 1), (1, 9 / 11), (2, 8 / 11), (3, 7 / 11),
+                              (9, 1 / 11), (10, 0), (20, 0))
+    # There are 11 (privacy_id, partition) pairs (from 2 privacy units); when
+    # l0_bound=1, 9 of them are dropped (from 1 privacy unit).
+    def test_get_ratio_dropped_l0_for_sum(self, l0_bound, expected):
+        estimator = self._get_estimator_for_sum()
+        self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
+                               expected)
+
     @parameterized.parameters((0, 1), (1, 19 / 30), (2, 18 / 30), (10, 10 / 30),
                               (20, 0), (21, 0))
     # there are 30 rows (from 2 privacy units), when linf_bound=1, 19 are
@@ -121,6 +139,17 @@ def test_get_ratio_dropped_linf(self, linf_bound, expected):
         self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
                                expected)
 
+    @parameterized.parameters((0, 1), (0.5, 0.89), (1, 0.78), (2, 0.76),
+                              (40, 0))
+    # There is 1 contribution of 40 and 10 contributions of 1, so the total
+    # contribution is 1*40 + 10*1 = 50.
+    # When linf_bound = 0.5, 11*0.5 = 5.5 is left after contribution bounding,
+    # i.e. the dropped ratio is (50-5.5)/50 = 0.89.
+    def test_get_ratio_dropped_linf_for_sum(self, linf_bound, expected):
+        estimator = self._get_estimator_for_sum()
+        self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
+                               expected)
+
     @parameterized.parameters((1, 1, 3.9565310998335823),
                               (1, 2, 5.683396971098993),
                               (10, 10, 200.01249625055996))
@@ -142,6 +171,10 @@ def test_estimate_rmse_count(self, l0_bound, linf_bound, expected):
         self.assertAlmostEqual(estimator.estimate_rmse(l0_bound, linf_bound),
                                expected)
 
+    def test_estimate_rmse_sum(self):
+        estimator = self._get_estimator_for_sum()
+        self.assertAlmostEqual(estimator.estimate_rmse(1, 1), 5.93769917)
+
 
 if __name__ == '__main__':
     absltest.main()