diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index e700285..909309a 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -766,17 +766,25 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
         rng = random.Random(seed)
         return_parameters = []
 
+        # These keys are special and should not be sampled or modified. All
+        # other keys are hyper-parameters to the model and should be sampled.
+        frozen_keys = {"type", "threshold", "threshold_ratio"}
         for _ in range(num_samples):
             parameter_spec = rng.choice(model_parameters)
-            model_type = parameter_spec["type"]
-            sample_parameters = dict(
-                (key, value)
+            sample_parameters = {
+                key: value
                 for (key, value) in parameter_spec.items()
-                if key != "type"
-            )
+                if key not in frozen_keys
+            }
+            frozen_parameters = {
+                key: value
+                for (key, value) in parameter_spec.items()
+                if key in frozen_keys
+            }
+
             randomized = _choose_randomized_parameters(rng, sample_parameters)
-            randomized["type"] = model_type
-            return_parameters.append(randomized)
+            result = {**frozen_parameters, **randomized}
+            return_parameters.append(result)
 
         return return_parameters
     else:
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 1aeef9c..b58bfd1 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -520,6 +520,34 @@ def test_get_model_parameters_search_strategy_randomized_unknown_distribution(
         _get_model_parameters(training_conf["training"])
 
 
+def test_get_model_parameters_search_strategy_randomized_thresholds(training_conf):
+    """
+    Even when the model parameters are selected with strategy "randomized", the
+    thresholds are still treated with a "grid" strategy.
+    _get_model_parameters() is not in charge of creating the threshold matrix,
+    so it passes the threshold and threshold_ratio through unchanged.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 25,
+    }
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": [1, 10, 100],
+            "threshold": [0.3, 0.5, 0.7, 0.8, 0.9],
+            "threshold_ratio": 1.2,
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    for parameter_choice in model_parameters:
+        assert parameter_choice["type"] == "random_forest"
+        assert parameter_choice["threshold"] == [0.3, 0.5, 0.7, 0.8, 0.9]
+        assert parameter_choice["threshold_ratio"] == 1.2
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------
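Note for reviewers (not part of the patch): below is a minimal, self-contained sketch of the frozen-key handling the first hunk introduces. The `pick` helper is a hypothetical stand-in for `_choose_randomized_parameters`; the only point illustrated is that keys in `frozen_keys` bypass randomized sampling and are copied through unchanged, which is what the new test asserts for `threshold` and `threshold_ratio`.

```python
import random

# Keys that pass through unchanged; other keys are hyper-parameters to sample.
frozen_keys = {"type", "threshold", "threshold_ratio"}


def pick(rng: random.Random, sample_parameters: dict) -> dict:
    # Hypothetical stand-in for _choose_randomized_parameters: choose one
    # value from each list-valued hyper-parameter, pass scalars through.
    return {
        key: rng.choice(value) if isinstance(value, list) else value
        for key, value in sample_parameters.items()
    }


rng = random.Random(2024)
parameter_spec = {
    "type": "random_forest",
    "maxDepth": [1, 10, 100],
    "threshold": [0.3, 0.5, 0.7, 0.8, 0.9],
    "threshold_ratio": 1.2,
}

# Split the spec the same way the patched loop does.
sample_parameters = {k: v for k, v in parameter_spec.items() if k not in frozen_keys}
frozen_parameters = {k: v for k, v in parameter_spec.items() if k in frozen_keys}
result = {**frozen_parameters, **pick(rng, sample_parameters)}

# maxDepth is sampled (one of 1, 10, 100), while the frozen keys are untouched:
# the threshold list and threshold_ratio pass through for the later
# threshold-matrix step.
print(result)
```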