Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 77 additions & 17 deletions pipeline_dp/dataset_histograms/histogram_error_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,25 @@
import bisect


class CountErrorEstimator:
class ErrorEstimator:
"""Estimator of the error from DP pipeline from DatasetHistograms.

The recommended way to create this object is to use create_error_estimator.
It works only for COUNT and PRIVACY_ID_COUNT.

Partition selection error is not implemented yet. Now only contribution
bounding and noise error are taken into consideration.
"""

def __init__(self, epsilon: float, delta: Optional[float],
metric: pipeline_dp.Metric, noise: pipeline_dp.NoiseKind,
l0_ratios_dropped: Sequence[Tuple[int, float]],
linf_ratios_dropped: Sequence[Tuple[int, float]],
partition_histogram: hist.Histogram):
def __init__(
self,
epsilon: float,
delta: Optional[float],
metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind,
l0_ratios_dropped: Sequence[Tuple[int, float]],
linf_ratios_dropped: Sequence[Tuple[int, float]],
partition_histogram: hist.Histogram,
):
self._base_std = self._get_stddev_for_dp_mechanism(
epsilon, delta, noise)
self._metric = metric
Expand Down Expand Up @@ -84,15 +88,16 @@ def estimate_rmse(self,
linf_bound: linf contribution bound, AKA for COUNT as
max_contributions_per_partition. This parameter is ignored for
PRIVACY_ID_COUNT

Returns:
the estimated error.
"""
if self._metric == pipeline_dp.Metrics.COUNT:
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
if linf_bound is None:
raise ValueError("linf must be given for COUNT")
ratio_dropped_l0 = self.get_ratio_dropped_l0(l0_bound)
ratio_dropped_linf = 0
if self._metric == pipeline_dp.Metrics.COUNT:
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
ratio_dropped_linf = self.get_ratio_dropped_linf(linf_bound)
ratio_dropped = 1 - (1 - ratio_dropped_l0) * (1 - ratio_dropped_linf)
stddev = self._get_stddev(l0_bound, linf_bound)
Expand Down Expand Up @@ -133,23 +138,29 @@ def _get_stddev(self,
return self._base_std * math.sqrt(l0_bound) * linf_bound


def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
delta: Optional[float], metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind) -> CountErrorEstimator:
def create_estimator_for_count_and_privacy_id_count(
histograms: hist.DatasetHistograms,
epsilon: float,
delta: Optional[float],
metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind,
) -> ErrorEstimator:
"""Creates histogram based error estimator for COUNT or PRIVACY_ID_COUNT.

Args:
histograms: dataset histograms.
epsilon: epsilon parameter of the DP mechanism for adding noise.
delta: delta parameter of the DP mechanism for adding noise (must be
None for Laplace noise).
delta: delta parameter of the DP mechanism for adding noise (must be None
for Laplace noise).
metric: DP aggregation, COUNT or PRIVACY_ID_COUNT.
noise: type of DP noise.

Returns:
Error estimator.
"""
if metric not in [
pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
pipeline_dp.Metrics.COUNT,
pipeline_dp.Metrics.PRIVACY_ID_COUNT,
]:
raise ValueError(
f"Only COUNT and PRIVACY_ID_COUNT are supported, but metric={metric}"
Expand All @@ -162,8 +173,15 @@ def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
partition_histogram = histograms.count_per_partition_histogram
else:
partition_histogram = histograms.count_privacy_id_per_partition
return CountErrorEstimator(epsilon, delta, metric, noise, l0_ratios_dropped,
linf_ratios_dropped, partition_histogram)
return ErrorEstimator(
epsilon,
delta,
metric,
noise,
l0_ratios_dropped,
linf_ratios_dropped,
partition_histogram,
)


def _estimate_rmse_impl(ratio_dropped: float, std: float,
Expand All @@ -176,3 +194,45 @@ def _estimate_rmse_impl(ratio_dropped: float, std: float,
std**2)
sum_rmse += bin.count * rmse
return sum_rmse / num_partitions


def create_estimator_for_sum(histograms: hist.DatasetHistograms,
                             epsilon: float,
                             delta: Optional[float],
                             noise: pipeline_dp.NoiseKind,
                             sum_index: int = 0) -> ErrorEstimator:
    """Builds a histogram-based ErrorEstimator configured for SUM.

    Args:
      histograms: dataset histograms; the l0 contributions, linf sum
        contributions and sum-per-partition histograms are read from it.
      epsilon: epsilon parameter of the DP mechanism for adding noise.
      delta: delta parameter of the DP mechanism for adding noise (must be
        None for Laplace noise).
      noise: type of DP noise.
      sum_index: which sum to use when the histograms hold several SUM
        aggregations. It is not consulted when
        histograms.linf_sum_contributions_histogram is a single Histogram.

    Returns:
      An ErrorEstimator created with the SUM metric.
    """
    l0_dropped = hist.compute_ratio_dropped(
        histograms.l0_contributions_histogram)

    linf_source = histograms.linf_sum_contributions_histogram
    if isinstance(linf_source, hist.Histogram):
        # A single SUM aggregation: the histograms are used as they are.
        chosen_linf = linf_source
        chosen_partition = histograms.sum_per_partition_histogram
    else:
        # Several SUM aggregations: pick the one addressed by sum_index.
        chosen_linf = linf_source[sum_index]
        chosen_partition = histograms.sum_per_partition_histogram[sum_index]

    linf_dropped = hist.compute_ratio_dropped(chosen_linf)

    return ErrorEstimator(
        epsilon=epsilon,
        delta=delta,
        metric=pipeline_dp.Metrics.SUM,
        noise=noise,
        l0_ratios_dropped=l0_dropped,
        linf_ratios_dropped=linf_dropped,
        partition_histogram=chosen_partition,
    )
55 changes: 45 additions & 10 deletions tests/dataset_histograms/histogram_error_estimator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,25 @@ def _get_histograms(self) -> hist.DatasetHistograms:
computing_histograms.compute_dataset_histograms(
dataset, data_extractors, pipeline_dp.LocalBackend()))[0]

def _get_estimator(
def _get_estimator_for_count_and_privacy_id_count(
self,
metric: pipeline_dp.Metric,
noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
epsilon: float = 2**0.5 / 2,
delta: Optional[float] = None,
):
return histogram_error_estimator.create_error_estimator(
return histogram_error_estimator.create_estimator_for_count_and_privacy_id_count(
self._get_histograms(), epsilon, delta, metric, noise_kind)

def _get_estimator_for_sum(
    self,
    noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
    epsilon: float = 2**0.5 / 2,
    delta: Optional[float] = None,
):
    """Returns a SUM error estimator built from the test histograms."""
    histograms = self._get_histograms()
    return histogram_error_estimator.create_estimator_for_sum(
        histograms,
        epsilon,
        delta,
        noise_kind,
    )

@parameterized.named_parameters(
dict(testcase_name='count_gaussian',
metric=pipeline_dp.Metrics.COUNT,
Expand Down Expand Up @@ -90,25 +99,34 @@ def test_count_get_sigma(self, metric: pipeline_dp.Metric, epsilon: float,
delta: Optional[float],
noise_kind: pipeline_dp.NoiseKind, l0: float,
linf: float, expected: float):
estimator = self._get_estimator(metric=metric,
epsilon=epsilon,
delta=delta,
noise_kind=noise_kind)
estimator = self._get_estimator_for_count_and_privacy_id_count(
metric=metric, epsilon=epsilon, delta=delta, noise_kind=noise_kind)
self.assertAlmostEqual(estimator._get_stddev(l0, linf),
expected,
delta=1e-10)

def test_sum_not_supported(self):
with self.assertRaisesRegex(
ValueError, "Only COUNT and PRIVACY_ID_COUNT are supported"):
self._get_estimator(pipeline_dp.Metrics.SUM)
self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.SUM)

@parameterized.parameters((0, 1), (1, 9 / 11), (2, 8 / 11), (3, 7 / 11),
(9, 1 / 11), (10, 0), (20, 0))
# there are 11 (privacy_id, partition) pairs (from 2 privacy units), when
# l0_bound=1, 9 are dropped (from 1 privacy unit).
def test_get_ratio_dropped_l0(self, l0_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
expected)

@parameterized.parameters(
    (0, 1),
    (1, 9 / 11),
    (2, 8 / 11),
    (3, 7 / 11),
    (9, 1 / 11),
    (10, 0),
    (20, 0),
)
# There are 11 (privacy_id, partition) pairs (from 2 privacy units); when
# l0_bound=1, 9 of them are dropped (from 1 privacy unit).
def test_get_ratio_dropped_l0_for_sum(self, l0_bound, expected):
    """Checks the l0 dropped ratio reported by the SUM estimator."""
    sum_estimator = self._get_estimator_for_sum()
    actual_ratio = sum_estimator.get_ratio_dropped_l0(l0_bound)
    self.assertAlmostEqual(actual_ratio, expected)

Expand All @@ -117,7 +135,19 @@ def test_get_ratio_dropped_l0(self, l0_bound, expected):
# there are 30 rows (from 2 privacy units), when linf_bound=1, 19 are
# dropped (from 1 privacy unit, which contributes 20 to 1 partition).
def test_get_ratio_dropped_linf(self, linf_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
expected)

@parameterized.parameters((0, 1), (0.5, 0.89), (1, 0.78), (2, 0.76),
(40, 0))
# there is 1 contribution of 40 and 10 contributions of 1.
# total contribution = 1*40+10*1 = 50
# when linf_bound = 0.5, left after contribution bounding 11*0.5=5.5, i.e.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How can linf be a double and not an integer?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

linf_bound is the maximum contribution per partition, which means:

max_contributions_per_partition for COUNT

max_sum_per_partition for SUM (which can be a double)

# dropped (50-5.5)/50 = 0.89
def test_get_ratio_dropped_linf_for_sum(self, linf_bound, expected):
estimator = self._get_estimator_for_sum()
self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
expected)

Expand All @@ -138,10 +168,15 @@ def test_get_ratio_dropped_linf(self, linf_bound, expected):
# rmse2 = sqrt(21*total_ratio_dropped + noise_stddev**2) ~= 19.70177
# rmse = (9*rmse1+rmse2)/10.
def test_estimate_rmse_count(self, l0_bound, linf_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.estimate_rmse(l0_bound, linf_bound),
expected)

def test_estimate_rmse_sum(self):
    """Checks the RMSE estimated for SUM with l0_bound=1 and linf_bound=1."""
    estimated_rmse = self._get_estimator_for_sum().estimate_rmse(1, 1)
    self.assertAlmostEqual(estimated_rmse, 5.93769917)


if __name__ == '__main__':
absltest.main()
Loading