Skip to content

Commit 3a7a0ff

Browse files
authored
Create error estimator for SUM (#528)
1 parent 4cf6a56 commit 3a7a0ff

File tree

2 files changed

+122
-27
lines changed

2 files changed

+122
-27
lines changed

pipeline_dp/dataset_histograms/histogram_error_estimator.py

Lines changed: 77 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,25 @@
2020
import bisect
2121

2222

23-
class CountErrorEstimator:
23+
class ErrorEstimator:
2424
"""Estimates the error of a DP pipeline based on DatasetHistograms.
2525
2626
The recommended way to create this object is to use create_estimator_for_count_and_privacy_id_count or create_estimator_for_sum.
27-
It works only for COUNT and PRIVACY_ID_COUNT.
2827
2928
Partition selection error is not implemented yet. Now only contribution
3029
bounding and noise error are taken into consideration.
3130
"""
3231

33-
def __init__(self, epsilon: float, delta: Optional[float],
34-
metric: pipeline_dp.Metric, noise: pipeline_dp.NoiseKind,
35-
l0_ratios_dropped: Sequence[Tuple[int, float]],
36-
linf_ratios_dropped: Sequence[Tuple[int, float]],
37-
partition_histogram: hist.Histogram):
32+
def __init__(
33+
self,
34+
epsilon: float,
35+
delta: Optional[float],
36+
metric: pipeline_dp.Metric,
37+
noise: pipeline_dp.NoiseKind,
38+
l0_ratios_dropped: Sequence[Tuple[int, float]],
39+
linf_ratios_dropped: Sequence[Tuple[int, float]],
40+
partition_histogram: hist.Histogram,
41+
):
3842
self._base_std = self._get_stddev_for_dp_mechanism(
3943
epsilon, delta, noise)
4044
self._metric = metric
@@ -84,15 +88,16 @@ def estimate_rmse(self,
8488
linf_bound: linf contribution bound; for COUNT this is
8589
max_contributions_per_partition. This parameter is ignored for
8690
PRIVACY_ID_COUNT
91+
8792
Returns:
8893
the estimated error.
8994
"""
90-
if self._metric == pipeline_dp.Metrics.COUNT:
95+
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
9196
if linf_bound is None:
9297
raise ValueError("linf must be given for COUNT")
9398
ratio_dropped_l0 = self.get_ratio_dropped_l0(l0_bound)
9499
ratio_dropped_linf = 0
95-
if self._metric == pipeline_dp.Metrics.COUNT:
100+
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
96101
ratio_dropped_linf = self.get_ratio_dropped_linf(linf_bound)
97102
ratio_dropped = 1 - (1 - ratio_dropped_l0) * (1 - ratio_dropped_linf)
98103
stddev = self._get_stddev(l0_bound, linf_bound)
@@ -133,23 +138,29 @@ def _get_stddev(self,
133138
return self._base_std * math.sqrt(l0_bound) * linf_bound
134139

135140

136-
def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
137-
delta: Optional[float], metric: pipeline_dp.Metric,
138-
noise: pipeline_dp.NoiseKind) -> CountErrorEstimator:
141+
def create_estimator_for_count_and_privacy_id_count(
142+
histograms: hist.DatasetHistograms,
143+
epsilon: float,
144+
delta: Optional[float],
145+
metric: pipeline_dp.Metric,
146+
noise: pipeline_dp.NoiseKind,
147+
) -> ErrorEstimator:
139148
"""Creates histogram based error estimator for COUNT or PRIVACY_ID_COUNT.
140149
141150
Args:
142151
histograms: dataset histograms.
143152
epsilon: epsilon parameter of the DP mechanism for adding noise.
144-
delta: delta parameter of the DP mechanism for adding noise (must be
145-
None for Laplace noise).
153+
delta: delta parameter of the DP mechanism for adding noise (must be None
154+
for Laplace noise).
146155
metric: DP aggregation, COUNT or PRIVACY_ID_COUNT.
147156
noise: type of DP noise.
157+
148158
Returns:
149159
Error estimator.
150160
"""
151161
if metric not in [
152-
pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
162+
pipeline_dp.Metrics.COUNT,
163+
pipeline_dp.Metrics.PRIVACY_ID_COUNT,
153164
]:
154165
raise ValueError(
155166
f"Only COUNT and PRIVACY_ID_COUNT are supported, but metric={metric}"
@@ -162,8 +173,15 @@ def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
162173
partition_histogram = histograms.count_per_partition_histogram
163174
else:
164175
partition_histogram = histograms.count_privacy_id_per_partition
165-
return CountErrorEstimator(epsilon, delta, metric, noise, l0_ratios_dropped,
166-
linf_ratios_dropped, partition_histogram)
176+
return ErrorEstimator(
177+
epsilon,
178+
delta,
179+
metric,
180+
noise,
181+
l0_ratios_dropped,
182+
linf_ratios_dropped,
183+
partition_histogram,
184+
)
167185

168186

169187
def _estimate_rmse_impl(ratio_dropped: float, std: float,
@@ -176,3 +194,45 @@ def _estimate_rmse_impl(ratio_dropped: float, std: float,
176194
std**2)
177195
sum_rmse += bin.count * rmse
178196
return sum_rmse / num_partitions
197+
198+
199+
def create_estimator_for_sum(histograms: hist.DatasetHistograms,
200+
epsilon: float,
201+
delta: Optional[float],
202+
noise: pipeline_dp.NoiseKind,
203+
sum_index: int = 0) -> ErrorEstimator:
204+
"""Creates histogram based error estimator for SUM.
205+
206+
Args:
207+
histograms: dataset histograms.
208+
epsilon: epsilon parameter of the DP mechanism for adding noise.
209+
delta: delta parameter of the DP mechanism for adding noise (must be None
210+
for Laplace noise).
211+
noise: type of DP noise.
212+
sum_index: the index of the sum for the case of multi-aggregations.
213+
214+
Returns:
215+
Error estimator.
216+
"""
217+
l0_ratios_dropped = hist.compute_ratio_dropped(
218+
histograms.l0_contributions_histogram)
219+
if isinstance(histograms.linf_sum_contributions_histogram, hist.Histogram):
220+
# 1 sum
221+
linf_sum_histograms = histograms.linf_sum_contributions_histogram
222+
partition_histogram = histograms.sum_per_partition_histogram
223+
else: # multiple SUM aggregations
224+
linf_sum_histograms = histograms.linf_sum_contributions_histogram[
225+
sum_index]
226+
partition_histogram = histograms.sum_per_partition_histogram[sum_index]
227+
228+
linf_ratios_dropped = hist.compute_ratio_dropped(linf_sum_histograms)
229+
230+
return ErrorEstimator(
231+
epsilon,
232+
delta,
233+
pipeline_dp.Metrics.SUM,
234+
noise,
235+
l0_ratios_dropped,
236+
linf_ratios_dropped,
237+
partition_histogram,
238+
)

tests/dataset_histograms/histogram_error_estimator_test.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,25 @@ def _get_histograms(self) -> hist.DatasetHistograms:
4242
computing_histograms.compute_dataset_histograms(
4343
dataset, data_extractors, pipeline_dp.LocalBackend()))[0]
4444

45-
def _get_estimator(
45+
def _get_estimator_for_count_and_privacy_id_count(
4646
self,
4747
metric: pipeline_dp.Metric,
4848
noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
4949
epsilon: float = 2**0.5 / 2,
5050
delta: Optional[float] = None,
5151
):
52-
return histogram_error_estimator.create_error_estimator(
52+
return histogram_error_estimator.create_estimator_for_count_and_privacy_id_count(
5353
self._get_histograms(), epsilon, delta, metric, noise_kind)
5454

55+
def _get_estimator_for_sum(
56+
self,
57+
noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
58+
epsilon: float = 2**0.5 / 2,
59+
delta: Optional[float] = None,
60+
):
61+
return histogram_error_estimator.create_estimator_for_sum(
62+
self._get_histograms(), epsilon, delta, noise_kind)
63+
5564
@parameterized.named_parameters(
5665
dict(testcase_name='count_gaussian',
5766
metric=pipeline_dp.Metrics.COUNT,
@@ -90,25 +99,34 @@ def test_count_get_sigma(self, metric: pipeline_dp.Metric, epsilon: float,
9099
delta: Optional[float],
91100
noise_kind: pipeline_dp.NoiseKind, l0: float,
92101
linf: float, expected: float):
93-
estimator = self._get_estimator(metric=metric,
94-
epsilon=epsilon,
95-
delta=delta,
96-
noise_kind=noise_kind)
102+
estimator = self._get_estimator_for_count_and_privacy_id_count(
103+
metric=metric, epsilon=epsilon, delta=delta, noise_kind=noise_kind)
97104
self.assertAlmostEqual(estimator._get_stddev(l0, linf),
98105
expected,
99106
delta=1e-10)
100107

101108
def test_sum_not_supported(self):
102109
with self.assertRaisesRegex(
103110
ValueError, "Only COUNT and PRIVACY_ID_COUNT are supported"):
104-
self._get_estimator(pipeline_dp.Metrics.SUM)
111+
self._get_estimator_for_count_and_privacy_id_count(
112+
pipeline_dp.Metrics.SUM)
105113

106114
@parameterized.parameters((0, 1), (1, 9 / 11), (2, 8 / 11), (3, 7 / 11),
107115
(9, 1 / 11), (10, 0), (20, 0))
108116
# there are 11 (privacy_id, partition) pairs (from 2 privacy units), when
109117
# l0_bound=1, 9 are dropped (from 1 privacy unit).
110118
def test_get_ratio_dropped_l0(self, l0_bound, expected):
111-
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
119+
estimator = self._get_estimator_for_count_and_privacy_id_count(
120+
pipeline_dp.Metrics.COUNT)
121+
self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
122+
expected)
123+
124+
@parameterized.parameters((0, 1), (1, 9 / 11), (2, 8 / 11), (3, 7 / 11),
125+
(9, 1 / 11), (10, 0), (20, 0))
126+
# there are 11 (privacy_id, partition) pairs (from 2 privacy units), when
127+
# l0_bound=1, 9 are dropped (from 1 privacy unit).
128+
def test_get_ratio_dropped_l0_for_sum(self, l0_bound, expected):
129+
estimator = self._get_estimator_for_sum()
112130
self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
113131
expected)
114132

@@ -117,7 +135,19 @@ def test_get_ratio_dropped_l0(self, l0_bound, expected):
117135
# there are 30 rows (from 2 privacy units), when linf_bound=1, 19 are
118136
# dropped (from 1 privacy unit, which contributes 20 to 1 partition).
119137
def test_get_ratio_dropped_linf(self, linf_bound, expected):
120-
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
138+
estimator = self._get_estimator_for_count_and_privacy_id_count(
139+
pipeline_dp.Metrics.COUNT)
140+
self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
141+
expected)
142+
143+
@parameterized.parameters((0, 1), (0.5, 0.89), (1, 0.78), (2, 0.76),
144+
(40, 0))
145+
# there is 1 contribution of 40 and 10 contributions of 1.
146+
# total contribution = 1*40+10*1 = 50
147+
# when linf_bound = 0.5, left after contribution bounding 11*0.5=5.5, i.e.
148+
# dropped (50-5.5)/50 = 0.89
149+
def test_get_ratio_dropped_linf_for_sum(self, linf_bound, expected):
150+
estimator = self._get_estimator_for_sum()
121151
self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
122152
expected)
123153

@@ -138,10 +168,15 @@ def test_get_ratio_dropped_linf(self, linf_bound, expected):
138168
# rmse2 = sqrt(21*total_ratio_dropped + noise_stddev**2) ~= 19.70177
139169
# rmse = (9*rmse1+rmse2)/10.
140170
def test_estimate_rmse_count(self, l0_bound, linf_bound, expected):
141-
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
171+
estimator = self._get_estimator_for_count_and_privacy_id_count(
172+
pipeline_dp.Metrics.COUNT)
142173
self.assertAlmostEqual(estimator.estimate_rmse(l0_bound, linf_bound),
143174
expected)
144175

176+
def test_estimate_rmse_sum(self):
177+
estimator = self._get_estimator_for_sum()
178+
self.assertAlmostEqual(estimator.estimate_rmse(1, 1), 5.93769917)
179+
145180

146181
if __name__ == '__main__':
147182
absltest.main()

0 commit comments

Comments
 (0)