[#167] Don't randomize threshold or threshold_ratio

Only the model's hyper-parameters should be affected by training.model_parameter_search.strategy. The threshold and threshold_ratio values should be passed through unchanged on each model.
ipums · Dec 2, 2024 · 0f99e1b · 0f99e1b
1 parent 943fc0a
commit 0f99e1b
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 7 deletions.
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -766,17 +766,25 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
             rng = random.Random(seed)
 
             return_parameters = []
+            # These keys are special and should not be sampled or modified. All
+            # other keys are hyper-parameters to the model and should be sampled.
+            frozen_keys = {"type", "threshold", "threshold_ratio"}
             for _ in range(num_samples):
                 parameter_spec = rng.choice(model_parameters)
-                model_type = parameter_spec["type"]
-                sample_parameters = dict(
-                    (key, value)
+                sample_parameters = {
+                    key: value
                     for (key, value) in parameter_spec.items()
-                    if key != "type"
-                )
+                    if key not in frozen_keys
+                }
+                frozen_parameters = {
+                    key: value
+                    for (key, value) in parameter_spec.items()
+                    if key in frozen_keys
+                }
+
                 randomized = _choose_randomized_parameters(rng, sample_parameters)
-                randomized["type"] = model_type
-                return_parameters.append(randomized)
+                result = {**frozen_parameters, **randomized}
+                return_parameters.append(result)
 
             return return_parameters
         else:

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
@@ -520,6 +520,34 @@ def test_get_model_parameters_search_strategy_randomized_unknown_distribution(
         _get_model_parameters(training_conf["training"])
 
 
+def test_get_model_parameters_search_strategy_randomized_thresholds(training_conf):
+    """
+    Even when the model parameters are selected with strategy "randomized", the
+    thresholds are still treated with a "grid" strategy.
+    _get_model_parameters() is not in charge of creating the threshold matrix,
+    so it passes the threshold and threshold_ratio through unchanged.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 25,
+    }
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": [1, 10, 100],
+            "threshold": [0.3, 0.5, 0.7, 0.8, 0.9],
+            "threshold_ratio": 1.2,
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    for parameter_choice in model_parameters:
+        assert parameter_choice["type"] == "random_forest"
+        assert parameter_choice["threshold"] == [0.3, 0.5, 0.7, 0.8, 0.9]
+        assert parameter_choice["threshold_ratio"] == 1.2
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------