From 5d0ea0baaa7494172f0396ddb6c78f82c78429cf Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:21:20 -0600 Subject: [PATCH] [#167] Add a normal distribution to randomized parameter search --- .../model_exploration/link_step_train_test_models.py | 10 ++++++++-- hlink/tests/model_exploration_test.py | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 988ed8b..452bcc1 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -704,13 +704,19 @@ def _choose_randomized_parameters( # the parameter should be sampled. elif isinstance(value, collections.abc.Mapping): distribution = value["distribution"] - low = value["low"] - high = value["high"] if distribution == "randint": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.randint(low, high) elif distribution == "uniform": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.uniform(low, high) + elif distribution == "normal": + mean = value["mean"] + stdev = value["standard_deviation"] + parameter_choices[key] = rng.normalvariate(mean, stdev) else: raise ValueError("unknown distribution") # All other types (including strings) are passed through unchanged. diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 3af04da..8f31aaa 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -385,6 +385,11 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio "type": "decision_tree", "maxDepth": {"distribution": "randint", "low": 1, "high": 20}, "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0}, + "minWeightFractionPerNode": { + "distribution": "normal", + "mean": 10.0, + "standard_deviation": 2.5, + }, } ] @@ -396,6 +401,10 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio assert parameter_choice["type"] == "decision_tree" assert 1 <= parameter_choice["maxDepth"] <= 20 assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 + # Technically a normal distribution can return any value, even ones very + # far from its mean. So we can't assert on the value returned here. But + # there definitely should be a value of some sort in the dictionary. + assert "minWeightFractionPerNode" in parameter_choice def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):