From 94bacde1fa4753ae65ec610f36c4b37cb9881ac2 Mon Sep 17 00:00:00 2001
From: Vadym Doroshenko <53558779+dvadym@users.noreply.github.com>
Date: Fri, 3 Jun 2022 17:36:15 +0300
Subject: [PATCH] Improving aggregate_params validation (#290)

---
 pipeline_dp/aggregate_params.py | 82 ++++++++++++++++++---------------
 pipeline_dp/dp_engine.py        |  2 +
 tests/dp_engine_test.py         | 60 +++++++++++++++++++-----
 3 files changed, 94 insertions(+), 50 deletions(-)

diff --git a/pipeline_dp/aggregate_params.py b/pipeline_dp/aggregate_params.py
index 87f42753..1f9c9b74 100644
--- a/pipeline_dp/aggregate_params.py
+++ b/pipeline_dp/aggregate_params.py
@@ -112,39 +112,29 @@ def __post_init__(self):
         needs_min_max_value = Metrics.SUM in self.metrics \
                               or Metrics.MEAN in self.metrics \
                               or Metrics.VARIANCE in self.metrics
-        if not isinstance(self.max_partitions_contributed,
-                          int) or self.max_partitions_contributed <= 0:
-            raise ValueError(
-                "params.max_partitions_contributed must be set "
-                "to a positive integer")
-        if not isinstance(self.max_contributions_per_partition,
-                          int) or self.max_contributions_per_partition <= 0:
-            raise ValueError(
-                "params.max_contributions_per_partition must be set "
-                "to a positive integer")
         if needs_min_max_value and (self.min_value is None or
                                     self.max_value is None):
             raise ValueError(
-                "params.min_value and params.max_value must be set")
+                "AggregateParams: min_value and max_value must be set")
         if needs_min_max_value and (_not_a_proper_number(self.min_value) or
                                     _not_a_proper_number(self.max_value)):
             raise ValueError(
-                "params.min_value and params.max_value must be both finite numbers"
-            )
+                "AggregateParams: min_value and max_value must be both "
+                "finite numbers")
         if needs_min_max_value and self.max_value < self.min_value:
             raise ValueError(
-                "params.max_value must be equal to or greater than params.min_value"
-            )
+                "AggregateParams: max_value must be equal to or greater than"
+                " min_value")
         if Metrics.VECTOR_SUM in self.metrics and \
            (Metrics.SUM in self.metrics or \
             Metrics.MEAN in self.metrics or \
             Metrics.VARIANCE in self.metrics):
             raise ValueError(
-                "Vector sum can not be computed together with scalar metrics, like sum, mean etc"
-            )
+                "AggregateParams: vector sum can not be computed together "
+                "with scalar metrics, like sum, mean etc")
         if self.contribution_bounds_already_enforced and Metrics.PRIVACY_ID_COUNT in self.metrics:
             raise ValueError(
-                "Cannot calculate PRIVACY_ID_COUNT when "
+                "AggregateParams: Cannot calculate PRIVACY_ID_COUNT when "
                 "contribution_bounds_already_enforced is set to True.")
         if self.custom_combiners:
             logging.warning("Warning: custom combiners are used. This is an "
@@ -160,23 +150,31 @@ def __post_init__(self):
             raise ValueError(
                 "AggregateParams.public_partitions is deprecated. Please use public_partitions argument in DPEngine.aggregate insead."
             )
-        if self.max_contributions is not None and self.max_contributions > 0:
+        if self.max_contributions is not None:
+            _check_is_positive_int(self.max_contributions, "max_contributions")
             if ((self.max_partitions_contributed is not None) or
                 (self.max_contributions_per_partition is not None)):
                 raise ValueError(
-                    "Only one in params.max_contributions or "
-                    "(params.max_partitions_contributed and "
-                    "params.max_contributions_per_partition) must be set")
-        else:
-            if ((self.max_partitions_contributed is None or
-                 self.max_partitions_contributed <= 0) and
-                (self.max_contributions_per_partition is None or
-                 self.max_contributions_per_partition <= 0)):
+                    "AggregateParams: only one in max_contributions or "
+                    "both max_partitions_contributed and "
+                    "max_contributions_per_partition must be set")
+        else:  # self.max_contributions is None
+            n_not_none = _count_not_none(self.max_partitions_contributed,
+                                         self.max_contributions_per_partition)
+            if n_not_none == 0:
+                raise ValueError(
+                    "AggregateParams: either max_contributions must be set or "
+                    "both max_partitions_contributed and "
+                    "max_contributions_per_partition must be set.")
+            elif n_not_none == 1:
                 raise ValueError(
-                    "One amongst params.max_contributions or "
-                    "(params.max_partitions_contributed or "
-                    "params.max_contributions_per_partition) must be set to a "
-                    "positive value.")
+                    "AggregateParams: either none or both from "
+                    "max_partitions_contributed and "
+                    " max_contributions_per_partition must be set.")
+            _check_is_positive_int(self.max_partitions_contributed,
+                                   "max_partitions_contributed")
+            _check_is_positive_int(self.max_contributions_per_partition,
+                                   "max_contributions_per_partition")
 
     def __str__(self):
         if self.custom_combiners:
@@ -373,13 +371,21 @@ class PrivacyIdCountParams:
     def __post_init__(self):
         if self.public_partitions:
             raise ValueError(
-                "PrivacyIdCountParams.public_partitions is deprecated. Please read API documentation for anonymous PrivacyIdCountParams transform."
-            )
+                "PrivacyIdCountParams.public_partitions is deprecated. Please "
+                "read API documentation for anonymous PrivacyIdCountParams "
+                "transform.")
 
 
-def _not_a_proper_number(num):
-    """
-    Returns:
-        true if num is inf or NaN, false otherwise.
-    """
+def _not_a_proper_number(num: Any) -> bool:
+    """Returns true if num is inf or NaN, false otherwise."""
     return math.isnan(num) or math.isinf(num)
+
+
+def _check_is_positive_int(num: Any, field_name: str) -> bool:
+    if not (isinstance(num, int) and num > 0):
+        raise ValueError(
+            f"{field_name} has to be positive integer, but {num} given.")
+
+
+def _count_not_none(*args):
+    return sum([1 for arg in args if arg is not None])
diff --git a/pipeline_dp/dp_engine.py b/pipeline_dp/dp_engine.py
index c69c8985..c445a95e 100644
--- a/pipeline_dp/dp_engine.py
+++ b/pipeline_dp/dp_engine.py
@@ -466,6 +466,8 @@ def divide_and_round_up(a, b):
 
 def _check_aggregate_params(col, params: pipeline_dp.AggregateParams,
                             data_extractors: DataExtractors):
+    if params.max_contributions is not None:
+        raise NotImplementedError("max_contributions is not supported yet.")
     if col is None or not col:
         raise ValueError("col must be non-empty")
     if params is None:
diff --git a/tests/dp_engine_test.py b/tests/dp_engine_test.py
index 3397acc8..fd1b4921 100644
--- a/tests/dp_engine_test.py
+++ b/tests/dp_engine_test.py
@@ -103,7 +103,6 @@ def test_user_contribution_bounding_applied(self):
                 input_col,
                 max_contributions=max_contributions,
                 aggregator_fn=DpEngineTest.aggregator_fn))
-        print(bound_result)
         self.assertLen(bound_result, 5)
 
     def test_contribution_bounding_empty_col(self):
@@ -198,77 +197,113 @@ def test_aggregate_none(self):
         with self.assertRaises(Exception):
             pipeline_dp.DPEngine(None, None).aggregate(None, None, None)
 
+    # TODO: move these tests to aggregate_params_tests.py
     @parameterized.named_parameters(
         dict(testcase_name='negative max_partitions_contributed',
-             error_msg='negative max_partitions_contributed',
+             error_msg='max_partitions_contributed has to be positive integer',
              min_value=None,
              max_value=None,
              max_partitions_contributed=-1,
              max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT]),
         dict(testcase_name='negative max_contributions_per_partition',
-             error_msg='negative max_contributions_per_partition',
+             error_msg=
+             'max_contributions_per_partition has to be positive integer',
              min_value=None,
              max_value=None,
              max_partitions_contributed=1,
              max_contributions_per_partition=-1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT]),
         dict(testcase_name='float max_partitions_contributed',
-             error_msg='float max_partitions_contributed',
+             error_msg='max_partitions_contributed has to be positive integer',
              min_value=None,
              max_value=None,
              max_partitions_contributed=1.5,
             max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT]),
         dict(testcase_name='float max_contributions_per_partition',
-             error_msg='float max_contributions_per_partition',
+             error_msg=
+             'max_contributions_per_partition has to be positive integer',
              min_value=None,
              max_value=None,
              max_partitions_contributed=1,
              max_contributions_per_partition=1.5,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT]),
         dict(testcase_name='unspecified min_value SUM',
-             error_msg='unspecified min_value',
+             error_msg='min_value and max_value must be set',
              min_value=None,
              max_value=1,
              max_partitions_contributed=1,
             max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.SUM]),
         dict(testcase_name='unspecified max_value SUM',
-             error_msg='unspecified max_value',
+             error_msg='min_value and max_value must be set',
              min_value=1,
              max_value=None,
             max_partitions_contributed=1,
             max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.SUM]),
         dict(testcase_name='unspecified min_value MEAN',
-             error_msg='unspecified min_value',
+             error_msg='min_value and max_value must be set',
              min_value=None,
              max_value=1,
             max_partitions_contributed=1,
             max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.MEAN]),
         dict(testcase_name='unspecified max_value MEAN',
-             error_msg='unspecified max_value',
+             error_msg='min_value and max_value must be set',
              min_value=1,
              max_value=None,
             max_partitions_contributed=1,
             max_contributions_per_partition=1,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.MEAN]),
         dict(testcase_name='min_value > max_value',
-             error_msg='min_value > max_value',
+             error_msg='must be equal to or greater',
              min_value=2,
              max_value=1,
             max_partitions_contributed=1,
             max_contributions_per_partition=1,
+             max_contributions=None,
+             metrics=[pipeline_dp.Metrics.SUM]),
+        dict(testcase_name='max_contrib and max_partitions are set',
+             error_msg='only one in max_contributions or',
+             min_value=0,
+             max_value=1,
+             max_partitions_contributed=1,
+             max_contributions_per_partition=1,
+             max_contributions=1,
+             metrics=[pipeline_dp.Metrics.SUM]),
+        dict(testcase_name='max_partitions_contributed not set',
+             error_msg='either none or both',
+             min_value=0,
+             max_value=1,
+             max_partitions_contributed=None,
+             max_contributions_per_partition=1,
+             max_contributions=None,
+             metrics=[pipeline_dp.Metrics.SUM]),
+        dict(testcase_name='contributions not set',
+             error_msg='either max_contributions must',
+             min_value=0,
+             max_value=1,
+             max_partitions_contributed=None,
+             max_contributions_per_partition=None,
+             max_contributions=None,
              metrics=[pipeline_dp.Metrics.SUM]),
     )
     def test_check_invalid_bounding_params(self, error_msg, min_value,
                                            max_value,
                                            max_partitions_contributed,
                                            max_contributions_per_partition,
-                                           metrics):
-        with self.assertRaises(Exception, msg=error_msg):
+                                           max_contributions, metrics):
+        with self.assertRaisesRegex(ValueError, error_msg):
             budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                       total_delta=1e-10)
             engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
@@ -282,6 +317,7 @@ def test_check_invalid_bounding_params(self, error_msg, min_value,
                     max_contributions_per_partition,
                     min_value=min_value,
                     max_value=max_value,
+                    max_contributions=max_contributions,
                     metrics=metrics),
                 self._get_default_extractors())
 
     def test_check_aggregate_params(self):
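
Note (editor's addition, not part of the commit above): the sketch below illustrates how the tightened AggregateParams validation behaves for callers. It assumes a pipeline_dp build with this patch applied and reuses the same constructor keywords as the tests in the patch; Metrics.COUNT is used only because it does not require min_value/max_value.

# Illustrative sketch only; assumes pipeline_dp with this patch applied.
import pipeline_dp

# Accepted: max_contributions unset, both per-partition bounds are positive ints.
pipeline_dp.AggregateParams(metrics=[pipeline_dp.Metrics.COUNT],
                            max_partitions_contributed=3,
                            max_contributions_per_partition=2,
                            min_value=None,
                            max_value=None,
                            max_contributions=None)

# Rejected: max_contributions combined with the per-partition bounds.
try:
    pipeline_dp.AggregateParams(metrics=[pipeline_dp.Metrics.COUNT],
                                max_partitions_contributed=3,
                                max_contributions_per_partition=2,
                                min_value=None,
                                max_value=None,
                                max_contributions=5)
except ValueError as e:
    print(e)  # "AggregateParams: only one in max_contributions or ..."

# Rejected: only one of the two per-partition bounds is set.
try:
    pipeline_dp.AggregateParams(metrics=[pipeline_dp.Metrics.COUNT],
                                max_partitions_contributed=3,
                                max_contributions_per_partition=None,
                                min_value=None,
                                max_value=None,
                                max_contributions=None)
except ValueError as e:
    print(e)  # "AggregateParams: either none or both from ..."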