[#167] Support "pinned" parameters with model_parameter_search strategy randomized

This lets users set some parameters to a particular value, and only sample others. It's mostly a convenience because previously you could get the same behavior by passing the parameter as a one-element list, like `maxDepth = [7]`. This commit introduces the extra convenience of just specifying the parameter as a value, like `maxDepth = 7`.

So now you can do something like this:

```
[[training.model_parameters]]
type = "random_forest"
maxDepth = 7
numTrees = [1, 10, 20]
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
```

maxDepth will always be 7, numTrees will be randomly sampled from the list 1, 10, 20, and subsamplingRate will be sampled uniformly from the range [0.1, 0.9].
ipums · Nov 27, 2024 · 1692c87 · 1692c87
1 parent 65cb5ff
commit 1692c87
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 3 deletions.
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
     parameter_choices = dict()
 
     for key, value in model_parameters.items():
-        # If it's a Sequence (usually list), choose one of the values at random.
-        if isinstance(value, collections.abc.Sequence):
+        # If it's a Sequence (usually list) but not a string, choose one of the values at random.
+        if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
             parameter_choices[key] = random.choice(value)
         # If it's a Mapping (usually dict), it defines a distribution from which
         # the parameter should be sampled.
@@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
                 parameter_choices[key] = random.uniform(low, high)
             else:
                 raise ValueError("unknown distribution")
+        # All other types (including strings) are passed through unchanged.
         else:
-            raise ValueError("can't handle value type")
+            parameter_choices[key] = value
 
     return parameter_choices
 

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
@@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
         assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0
 
 
+def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):
+    """
+    If a value is neither a list nor a table, the "randomized" strategy just passes
+    it along as a value. This lets the user easily pin some parameters to a particular
+    value and randomize others.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 25,
+    }
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": 7,
+            "impurity": "entropy",
+            "minInfoGain": 0.5,
+            "numTrees": {"distribution": "randint", "low": 10, "high": 100},
+            "subsamplingRate": [0.5, 1.0, 1.5],
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    assert len(model_parameters) == 25
+
+    for parameter_choice in model_parameters:
+        assert parameter_choice["type"] == "random_forest"
+        assert parameter_choice["maxDepth"] == 7
+        assert parameter_choice["impurity"] == "entropy"
+        assert parameter_choice["minInfoGain"] == 0.5
+        assert 10 <= parameter_choice["numTrees"] <= 100
+        assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------