From 1692c87452d984e48f12b51c82b24739c2360ffc Mon Sep 17 00:00:00 2001
From: rileyh
Date: Wed, 27 Nov 2024 14:51:07 -0600
Subject: [PATCH] [#167] Support "pinned" parameters with model_parameter_search
 strategy randomized

This lets users pin some parameters to a particular value and sample only
the others. It is mostly a convenience: previously you could get the same
behavior by passing the parameter as a one-element list, like
`maxDepth = [7]`. This commit adds the extra convenience of specifying the
parameter directly as a value, like `maxDepth = 7`. So now you can do
something like this:

```
[[training.model_parameters]]
type = "random_forest"
maxDepth = 7
numTrees = [1, 10, 20]
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
```

With this configuration, maxDepth is always 7, numTrees is chosen uniformly
at random from the list [1, 10, 20], and subsamplingRate is sampled
uniformly from the range [0.1, 0.9].
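For illustration, a single randomized draw from the table above behaves
roughly like the sketch below. `choose_one` is a hypothetical, simplified
stand-in for `_choose_randomized_parameters` in the diff that follows, and
it covers only the `randint` and `uniform` distributions that this patch's
tests exercise:

```
import collections.abc
import random
from typing import Any


def choose_one(model_parameters: dict[str, Any]) -> dict[str, Any]:
    choices = {}
    for key, value in model_parameters.items():
        if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
            # A list: pick one of its elements uniformly at random.
            choices[key] = random.choice(value)
        elif isinstance(value, collections.abc.Mapping):
            # A table: sample from the named distribution.
            if value["distribution"] == "randint":
                choices[key] = random.randint(value["low"], value["high"])
            elif value["distribution"] == "uniform":
                choices[key] = random.uniform(value["low"], value["high"])
            else:
                raise ValueError("unknown distribution")
        else:
            # Anything else (ints, floats, strings, ...) is pinned as-is.
            choices[key] = value
    return choices


print(choose_one({
    "maxDepth": 7,
    "numTrees": [1, 10, 20],
    "subsamplingRate": {"distribution": "uniform", "low": 0.1, "high": 0.9},
}))
# e.g. {'maxDepth': 7, 'numTrees': 20, 'subsamplingRate': 0.6523...}
```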
---
 .../link_step_train_test_models.py    |  7 ++--
 hlink/tests/model_exploration_test.py | 34 +++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 1c182ce..54a3115 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
 
     parameter_choices = dict()
     for key, value in model_parameters.items():
-        # If it's a Sequence (usually list), choose one of the values at random.
-        if isinstance(value, collections.abc.Sequence):
+        # If it's a Sequence (usually list) but not a string, choose one of the values at random.
+        if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
             parameter_choices[key] = random.choice(value)
         # If it's a Mapping (usually dict), it defines a distribution from which
         # the parameter should be sampled.
@@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
                 parameter_choices[key] = random.uniform(low, high)
             else:
                 raise ValueError("unknown distribution")
+        # All other types (including strings) are passed through unchanged.
         else:
-            raise ValueError("can't handle value type")
+            parameter_choices[key] = value
 
     return parameter_choices
 
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 51f648f..33ee240 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
         assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0
 
 
+def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):
+    """
+    If a value is neither a list nor a table, the "randomized" strategy just
+    passes it along as a value. This lets the user easily pin some parameters
+    to a particular value and randomize others.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 25,
+    }
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": 7,
+            "impurity": "entropy",
+            "minInfoGain": 0.5,
+            "numTrees": {"distribution": "randint", "low": 10, "high": 100},
+            "subsamplingRate": [0.5, 1.0, 1.5],
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    assert len(model_parameters) == 25
+
+    for parameter_choice in model_parameters:
+        assert parameter_choice["type"] == "random_forest"
+        assert parameter_choice["maxDepth"] == 7
+        assert parameter_choice["impurity"] == "entropy"
+        assert parameter_choice["minInfoGain"] == 0.5
+        assert 10 <= parameter_choice["numTrees"] <= 100
+        assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------