From c5f5b13c0125e17375978d244ab70eef4a8cc9f4 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:20:36 -0600 Subject: [PATCH 01/21] [#167] Pull _custom_param_grid_builder() out of the LinkStepTrainTestModels class --- .../link_step_train_test_models.py | 63 ++++++++++--------- hlink/tests/model_exploration_test.py | 6 +- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 8e391b8..347a9ad 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -244,35 +244,6 @@ def _get_splits( return splits - def _custom_param_grid_builder(self, conf: dict[str, Any]) -> list[dict[str, Any]]: - print("Building param grid for models") - given_parameters = conf[f"{self.task.training_conf}"]["model_parameters"] - new_params = [] - for run in given_parameters: - params = run.copy() - model_type = params.pop("type") - - # dropping thresholds to prep for scikitlearn model exploration refactor - threshold = params.pop("threshold", False) - threshold_ratio = params.pop("threshold_ratio", False) - - keys = params.keys() - values = params.values() - - params_exploded = [] - for prod in itertools.product(*values): - params_exploded.append(dict(zip(keys, prod))) - - for subdict in params_exploded: - subdict["type"] = model_type - if threshold: - subdict["threshold"] = threshold - if threshold_ratio: - subdict["threshold_ratio"] = threshold_ratio - - new_params.extend(params_exploded) - return new_params - def _capture_results( self, predictions: pyspark.sql.DataFrame, @@ -332,7 +303,7 @@ def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = conf[training_conf]["model_parameters"] if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = self._custom_param_grid_builder(conf) + model_parameters = _custom_param_grid_builder(training_conf, conf) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
@@ -691,3 +662,35 @@ def _create_desc_df() -> pd.DataFrame: "mcc_train_sd", ] ) + + +def _custom_param_grid_builder( + training_conf: str, conf: dict[str, Any] +) -> list[dict[str, Any]]: + print("Building param grid for models") + given_parameters = conf[training_conf]["model_parameters"] + new_params = [] + for run in given_parameters: + params = run.copy() + model_type = params.pop("type") + + # dropping thresholds to prep for scikitlearn model exploration refactor + threshold = params.pop("threshold", False) + threshold_ratio = params.pop("threshold_ratio", False) + + keys = params.keys() + values = params.values() + + params_exploded = [] + for prod in itertools.product(*values): + params_exploded.append(dict(zip(keys, prod))) + + for subdict in params_exploded: + subdict["type"] = model_type + if threshold: + subdict["threshold"] = threshold + if threshold_ratio: + subdict["threshold_ratio"] = threshold_ratio + + new_params.extend(params_exploded) + return new_params diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index e0cf593..42e1364 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -9,6 +9,7 @@ import hlink.linking.core.threshold as threshold_core from hlink.linking.model_exploration.link_step_train_test_models import ( LinkStepTrainTestModels, + _custom_param_grid_builder, ) @@ -121,7 +122,7 @@ def test_all( main.do_drop_all("") -def test_step_2_param_grid(spark, main, training_conf, model_exploration, fake_self): +def test_step_2_param_grid(main, training_conf): """Test matching step 2 training to see if the custom param grid builder is working""" training_conf["training"]["model_parameters"] = [ @@ -129,8 +130,7 @@ def test_step_2_param_grid(spark, main, training_conf, model_exploration, fake_s {"type": "probit", "threshold": [0.5, 0.7]}, ] - link_step = LinkStepTrainTestModels(model_exploration) - param_grid = link_step._custom_param_grid_builder(training_conf) + param_grid = _custom_param_grid_builder("training", training_conf) expected = [ {"maxDepth": 3, "numTrees": 50, "type": "random_forest"}, From 605369b93bd201970ed9b97f09a8953b3c456efa Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:29:28 -0600 Subject: [PATCH 02/21] [#167] Simplify the interface to _custom_param_grid_builder() We can just pass the list of model_parameters from the config file to this function. --- .../model_exploration/link_step_train_test_models.py | 6 +++--- hlink/tests/model_exploration_test.py | 12 ++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 347a9ad..7c03404 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -303,7 +303,7 @@ def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = conf[training_conf]["model_parameters"] if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = _custom_param_grid_builder(training_conf, conf) + model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
@@ -665,10 +665,10 @@ def _create_desc_df() -> pd.DataFrame: def _custom_param_grid_builder( - training_conf: str, conf: dict[str, Any] + model_parameters: list[dict[str, Any]] ) -> list[dict[str, Any]]: print("Building param grid for models") - given_parameters = conf[training_conf]["model_parameters"] + given_parameters = model_parameters new_params = [] for run in given_parameters: params = run.copy() diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 42e1364..03b53d5 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -122,15 +122,13 @@ def test_all( main.do_drop_all("") -def test_step_2_param_grid(main, training_conf): - """Test matching step 2 training to see if the custom param grid builder is working""" - - training_conf["training"]["model_parameters"] = [ +def test_custom_param_grid_builder(): + """Test matching step 2's custom param grid builder""" + model_parameters = [ {"type": "random_forest", "maxDepth": [3, 4, 5], "numTrees": [50, 100]}, {"type": "probit", "threshold": [0.5, 0.7]}, ] - - param_grid = _custom_param_grid_builder("training", training_conf) + param_grid = _custom_param_grid_builder(model_parameters) expected = [ {"maxDepth": 3, "numTrees": 50, "type": "random_forest"}, @@ -145,8 +143,6 @@ def test_step_2_param_grid(main, training_conf): assert len(param_grid) == len(expected) assert all(m in expected for m in param_grid) - main.do_drop_all("") - # ------------------------------------- # Tests that probably should be moved From 2204152a2f4252b19a029e8de156cce98513e369 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:44:03 -0600 Subject: [PATCH 03/21] [#167] Pull _get_model_parameters() out of the LinkStep class This will make this piece of code easier to understand and test. --- .../link_step_train_test_models.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 7c03404..9ef97ee 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -67,7 +67,7 @@ def _run(self) -> None: splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed) - model_parameters = self._get_model_parameters(config) + model_parameters = _get_model_parameters(training_conf, config) logger.info( f"There are {len(model_parameters)} sets of model parameters to explore; " @@ -298,18 +298,6 @@ def _capture_results( ) return pd.concat([results_df, new_results], ignore_index=True) - def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: - training_conf = str(self.task.training_conf) - - model_parameters = conf[training_conf]["model_parameters"] - if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) - elif model_parameters == []: - raise ValueError( - "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
- ) - return model_parameters - def _save_training_results( self, desc_df: pd.DataFrame, spark: pyspark.sql.SparkSession ) -> None: @@ -694,3 +682,16 @@ def _custom_param_grid_builder( new_params.extend(params_exploded) return new_params + + +def _get_model_parameters( + training_conf: str, conf: dict[str, Any] +) -> list[dict[str, Any]]: + model_parameters = conf[training_conf]["model_parameters"] + if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: + model_parameters = _custom_param_grid_builder(model_parameters) + elif model_parameters == []: + raise ValueError( + "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." + ) + return model_parameters From 7d483801edb104decc72ab979d9aca905535d4fb Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:05:35 -0600 Subject: [PATCH 04/21] [#167] Add a few tests for _get_model_parameters() --- hlink/tests/model_exploration_test.py | 57 +++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 03b53d5..e349500 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -10,6 +10,7 @@ from hlink.linking.model_exploration.link_step_train_test_models import ( LinkStepTrainTestModels, _custom_param_grid_builder, + _get_model_parameters, ) @@ -144,6 +145,62 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) +def test_get_model_parameters_no_param_grid_attribute(training_conf): + """ + When there's no training.param_grid attribute, the default is to use the "explicit" + strategy, testing each element of model_parameters in turn. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, + {"type": "probit", "threshold": 0.7}, + ] + assert "param_grid" not in training_conf["training"] + + model_parameters = _get_model_parameters("training", training_conf) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, + {"type": "probit", "threshold": 0.7}, + ] + + +def test_get_model_parameters_param_grid_false(training_conf): + """ + When training.param_grid is set to False, model exploration uses the "explicit" + strategy. The model_parameters are returned unchanged. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, + ] + training_conf["training"]["param_grid"] = False + + model_parameters = _get_model_parameters("training", training_conf) + + assert model_parameters == [ + {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, + ] + + +def test_get_model_parameters_param_grid_true(training_conf): + """ + When training.param_grid is set to True, model exploration uses the "grid" + strategy, exploding model_parameters. 
+ """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["training"]["param_grid"] = True + + model_parameters = _get_model_parameters("training", training_conf) + # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From bc0bf7d6d254c60f580ba3c192ac93a96b449660 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:23:47 -0600 Subject: [PATCH 05/21] [#167] Just pass the training section of the config to _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++------ hlink/tests/model_exploration_test.py | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 9ef97ee..47c0a8d 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -67,7 +67,7 @@ def _run(self) -> None: splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed) - model_parameters = _get_model_parameters(training_conf, config) + model_parameters = _get_model_parameters(config[training_conf]) logger.info( f"There are {len(model_parameters)} sets of model parameters to explore; " @@ -684,11 +684,9 @@ def _custom_param_grid_builder( return new_params -def _get_model_parameters( - training_conf: str, conf: dict[str, Any] -) -> list[dict[str, Any]]: - model_parameters = conf[training_conf]["model_parameters"] - if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: +def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: + model_parameters = training_config["model_parameters"] + if "param_grid" in training_config and training_config["param_grid"]: model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index e349500..facd03b 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -156,7 +156,7 @@ def test_get_model_parameters_no_param_grid_attribute(training_conf): ] assert "param_grid" not in training_conf["training"] - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) assert model_parameters == [ {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, @@ -174,7 +174,7 @@ def test_get_model_parameters_param_grid_false(training_conf): ] training_conf["training"]["param_grid"] = False - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) assert model_parameters == [ {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, @@ -196,7 +196,7 @@ def test_get_model_parameters_param_grid_true(training_conf): ] training_conf["training"]["param_grid"] = True - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings assert len(model_parameters) == 6 From 
8be8806839018db70296437ad9bed3a21050dced Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:44:47 -0600 Subject: [PATCH 06/21] [#167] Add a couple of tests for the new training.model_parameter_search setting One of these tests is failing because we haven't implemented this logic in the _get_model_parameters() function yet. --- hlink/tests/model_exploration_test.py | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index facd03b..9bf58c6 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -201,6 +201,51 @@ def test_get_model_parameters_param_grid_true(training_conf): assert len(model_parameters) == 6 +def test_get_model_parameters_search_strategy_explicit(training_conf): + """ + When training.model_parameter_search.strategy is set to "explicit", + model_parameters pass through unchanged. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "random_forest", "maxDepth": 15, "numTrees": 100, "threshold": 0.5}, + {"type": "probit", "threshold": 0.8, "threshold_ratio": 1.3}, + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "explicit", + } + assert "param_grid" not in training_conf["training"] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 15, "numTrees": 100, "threshold": 0.5}, + {"type": "probit", "threshold": 0.8, "threshold_ratio": 1.3}, + ] + + +def test_get_model_parameters_search_strategy_grid(training_conf): + """ + When training.model_parameter_search.strategy is set to "grid", + model_parameters are exploded. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["model_parameter_search"] = { + "strategy": "grid", + } + assert "param_grid" not in training_conf + + model_parameters = _get_model_parameters(training_conf["training"]) + # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From a939ec2f064ac7607d937e7ffce783b935c08164 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 16:00:24 -0600 Subject: [PATCH 07/21] [#167] Look for training.model_parameter_search in _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++++++++ hlink/tests/model_exploration_test.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 47c0a8d..3e9853a 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -686,8 +686,18 @@ def _custom_param_grid_builder( def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = training_config["model_parameters"] + model_parameter_search = training_config.get("model_parameter_search") + if "param_grid" in training_config and training_config["param_grid"]: model_parameters = _custom_param_grid_builder(model_parameters) + elif model_parameter_search is not None: + strategy = model_parameter_search["strategy"] + if strategy == "explicit": + 
return model_parameters + elif strategy == "grid": + return _custom_param_grid_builder(model_parameters) + else: + raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 9bf58c6..9a86526 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -236,7 +236,7 @@ def test_get_model_parameters_search_strategy_grid(training_conf): "threshold": 0.5, }, ] - training_conf["model_parameter_search"] = { + training_conf["training"]["model_parameter_search"] = { "strategy": "grid", } assert "param_grid" not in training_conf From 801582e0f629d2c250798e630730f4704bbf49a1 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 16:14:49 -0600 Subject: [PATCH 08/21] [#167] Make sure that model_parameter_search takes precedence over param_grid --- .../link_step_train_test_models.py | 6 +- hlink/tests/model_exploration_test.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 3e9853a..7ebe074 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -688,9 +688,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") - if "param_grid" in training_config and training_config["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) - elif model_parameter_search is not None: + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": return model_parameters @@ -698,6 +696,8 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") + elif "param_grid" in training_config and training_config["param_grid"]: + model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 9a86526..c560b19 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -145,16 +145,18 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) -def test_get_model_parameters_no_param_grid_attribute(training_conf): +def test_get_model_parameters_default_behavior(training_conf): """ - When there's no training.param_grid attribute, the default is to use the "explicit" - strategy, testing each element of model_parameters in turn. + When there's no training.param_grid attribute or + training.model_parameter_search attribute, the default is to use the + "explicit" strategy, testing each element of model_parameters in turn. 
""" training_conf["training"]["model_parameters"] = [ {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, {"type": "probit", "threshold": 0.7}, ] assert "param_grid" not in training_conf["training"] + assert "model_parameter_search" not in training_conf["training"] model_parameters = _get_model_parameters(training_conf["training"]) @@ -246,6 +248,56 @@ def test_get_model_parameters_search_strategy_grid(training_conf): assert len(model_parameters) == 6 +def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( + training_conf, +): + """ + When both model_parameter_search and param_grid are set, model_parameter_search + takes precedence. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": 10, + "numTrees": 75, + "threshold": 0.7, + } + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "explicit", + } + # model_parameter_search takes precedence over this + training_conf["training"]["param_grid"] = True + + model_parameters = _get_model_parameters(training_conf["training"]) + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 10, "numTrees": 75, "threshold": 0.7} + ] + + +def test_get_model_parameters_search_strategy_grid_with_param_grid_false(training_conf): + """ + When both model_parameter_search and param_grid are set, model_parameter_search + takes precedence. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "grid", + } + # model_parameter_search takes precedence over this + training_conf["training"]["param_grid"] = False + + model_parameters = _get_model_parameters(training_conf["training"]) + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From a47688477cfa7c04417c5548b66993f8ff82ae1f Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 09:25:55 -0600 Subject: [PATCH 09/21] [#167] Print a deprecation warning for training.param_grid The new training.model_parameter_search is a more flexible version of param_grid. We still support param_grid, but eventually we will want to completely switch over to model_parameter_search instead. --- .../link_step_train_test_models.py | 17 +++++++++++ hlink/tests/model_exploration_test.py | 30 ++++++++++++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 7ebe074..d6dce8f 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -7,6 +7,8 @@ import logging import math import re +import sys +from textwrap import dedent from time import perf_counter from typing import Any import numpy as np @@ -688,6 +690,21 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") + if "param_grid" in training_config: + print( + dedent( + """\ + Deprecation Warning: training.param_grid is deprecated. 
+ + Please use training.model_parameter_search instead by replacing + + `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or + `param_grid = False` with `model_parameter_search = {strategy = "explicit"}` + + [deprecated_in_version=4.0.0]""" + ), + file=sys.stderr, + ) if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index c560b19..5a6957e 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -166,10 +166,12 @@ def test_get_model_parameters_default_behavior(training_conf): ] -def test_get_model_parameters_param_grid_false(training_conf): +def test_get_model_parameters_param_grid_false(training_conf, capsys): """ When training.param_grid is set to False, model exploration uses the "explicit" strategy. The model_parameters are returned unchanged. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, @@ -182,11 +184,16 @@ def test_get_model_parameters_param_grid_false(training_conf): {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, ] + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + -def test_get_model_parameters_param_grid_true(training_conf): +def test_get_model_parameters_param_grid_true(training_conf, capsys): """ When training.param_grid is set to True, model exploration uses the "grid" strategy, exploding model_parameters. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ { @@ -202,6 +209,9 @@ def test_get_model_parameters_param_grid_true(training_conf): # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings assert len(model_parameters) == 6 + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + def test_get_model_parameters_search_strategy_explicit(training_conf): """ @@ -249,11 +259,13 @@ def test_get_model_parameters_search_strategy_grid(training_conf): def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( - training_conf, + training_conf, capsys ): """ When both model_parameter_search and param_grid are set, model_parameter_search takes precedence. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ { @@ -274,11 +286,18 @@ def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( {"type": "random_forest", "maxDepth": 10, "numTrees": 75, "threshold": 0.7} ] + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + -def test_get_model_parameters_search_strategy_grid_with_param_grid_false(training_conf): +def test_get_model_parameters_search_strategy_grid_with_param_grid_false( + training_conf, capsys +): """ When both model_parameter_search and param_grid are set, model_parameter_search takes precedence. + + This prints a deprecation warning because param_grid is deprecated. 
""" training_conf["training"]["model_parameters"] = [ { @@ -297,6 +316,9 @@ def test_get_model_parameters_search_strategy_grid_with_param_grid_false(trainin model_parameters = _get_model_parameters(training_conf["training"]) assert len(model_parameters) == 6 + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + # ------------------------------------- # Tests that probably should be moved From 8c724467738dd128124cabfd51bd377c2245ade1 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 10:38:20 -0600 Subject: [PATCH 10/21] [#167] Refactor _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index d6dce8f..99a929c 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -687,9 +687,6 @@ def _custom_param_grid_builder( def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: - model_parameters = training_config["model_parameters"] - model_parameter_search = training_config.get("model_parameter_search") - if "param_grid" in training_config: print( dedent( @@ -705,6 +702,11 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any ), file=sys.stderr, ) + + model_parameters = training_config["model_parameters"] + model_parameter_search = training_config.get("model_parameter_search") + use_param_grid = training_config.get("param_grid", False) + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": @@ -713,8 +715,8 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") - elif "param_grid" in training_config and training_config["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) + elif use_param_grid: + return _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
From 896ad67756782fd59fad6903f4c70925a9ccff4a Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 10:57:42 -0600 Subject: [PATCH 11/21] [#167] Improve an error condition in _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++++---- hlink/tests/model_exploration_test.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 99a929c..fad2429 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -707,6 +707,11 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameter_search = training_config.get("model_parameter_search") use_param_grid = training_config.get("param_grid", False) + if model_parameters == []: + raise ValueError( + "model_parameters is empty, so there are no models to evaluate" + ) + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": @@ -717,8 +722,5 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif use_param_grid: return _custom_param_grid_builder(model_parameters) - elif model_parameters == []: - raise ValueError( - "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." - ) + return model_parameters diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 5a6957e..a64805c 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -145,6 +145,17 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) +def test_get_model_parameters_error_if_list_empty(training_conf): + """ + It's an error if the model_parameters list is empty, since in that case there + aren't any models to evaluate. + """ + training_conf["training"]["model_parameters"] = [] + + with pytest.raises(ValueError, match="model_parameters is empty"): + _get_model_parameters(training_conf["training"]) + + def test_get_model_parameters_default_behavior(training_conf): """ When there's no training.param_grid attribute or From 46da4cb1ee66312a2e52e347d00092426f0744fa Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 11:28:57 -0600 Subject: [PATCH 12/21] [#167] Start supporting a randomized strategy which can randomly sample from lists --- .../link_step_train_test_models.py | 26 ++++++++++++++++ hlink/tests/model_exploration_test.py | 31 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index fad2429..e6d7437 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -6,6 +6,7 @@ import itertools import logging import math +import random import re import sys from textwrap import dedent @@ -686,6 +687,21 @@ def _custom_param_grid_builder( return new_params +def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]: + """ + Choose a randomized setting of parameters from the given specification. 
+ """ + parameter_choices = dict() + + for key, value in model_parameters.items(): + if key == "type": + parameter_choices[key] = value + else: + parameter_choices[key] = random.choice(value) + + return parameter_choices + + def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: if "param_grid" in training_config: print( @@ -718,6 +734,16 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return model_parameters elif strategy == "grid": return _custom_param_grid_builder(model_parameters) + elif strategy == "randomized": + num_samples = model_parameter_search["num_samples"] + + return_parameters = [] + for _ in range(num_samples): + parameter_spec = random.choice(model_parameters) + randomized = _choose_randomized_parameters(parameter_spec) + return_parameters.append(randomized) + + return return_parameters else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif use_param_grid: diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index a64805c..bb272be 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -331,6 +331,37 @@ def test_get_model_parameters_search_strategy_grid_with_param_grid_false( assert "Deprecation Warning: training.param_grid is deprecated" in output.err +def test_get_model_parameters_search_strategy_randomized_sample_from_lists( + training_conf, +): + """ + Strategy "randomized" accepts lists for parameter values, but it does not work + the same way as the "grid" strategy. It randomly samples values from the lists + num_samples times to create parameter combinations. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 37, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "maxDepth": [1, 5, 10, 20], + "maxBins": [10, 20, 40], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + # Note that if we used strategy grid, we would get a list of length 4 * 3 = 12 instead + assert len(model_parameters) == 37 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "decision_tree" + assert parameter_choice["maxDepth"] in {1, 5, 10, 20} + assert parameter_choice["maxBins"] in {10, 20, 40} + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 51b4144701c2ff79da1f4834dc5ed7a580e30763 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 11:55:45 -0600 Subject: [PATCH 13/21] [#167] Support some simple distributions for randomized parameter search - randint returns a random integer in an inclusive range - uniform returns a random float in an inclusive range --- .../link_step_train_test_models.py | 15 +++++++- hlink/tests/model_exploration_test.py | 35 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index e6d7437..fc9bbae 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -696,8 +696,21 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): if key == "type": parameter_choices[key] = value - else: + elif type(value) == list: parameter_choices[key] = 
random.choice(value) + elif type(value) == dict: + distribution = value["distribution"] + low = value["low"] + high = value["high"] + + if distribution == "randint": + parameter_choices[key] = random.randint(low, high) + elif distribution == "uniform": + parameter_choices[key] = random.uniform(low, high) + else: + raise ValueError("unknown distribution") + else: + raise ValueError("can't handle value type") return parameter_choices diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index bb272be..51f648f 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -362,6 +362,41 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_lists( assert parameter_choice["maxBins"] in {10, 20, 40} +def test_get_model_parameters_search_strategy_randomized_sample_from_distributions( + training_conf, +): + """ + The "randomized" strategy also accepts dictionary values for parameters. + These dictionaries define distributions from which the parameters should be + sampled. + + For example, {"distribution": "randint", "low": 1, "high": 20} means to + pick a random integer between 1 and 20, each integer with an equal chance. + And {"distribution": "uniform", "low": 0.0, "high": 100.0} means to pick a + random float between 0.0 and 100.0 with a uniform distribution. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 15, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "maxDepth": {"distribution": "randint", "low": 1, "high": 20}, + "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0}, + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert len(model_parameters) == 15 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "decision_tree" + assert 1 <= parameter_choice["maxDepth"] <= 20 + assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 907818e29ef2385e4e4e27c8b565e2913a93c733 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 13:52:39 -0600 Subject: [PATCH 14/21] [#167] Use isinstance instead of directly checking types This makes this code more flexible and easier to understand. It also handles a weird case where the toml library returns a subclass of dict in some situations, and built-in Python dicts in other situations. --- .../model_exploration/link_step_train_test_models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index fc9bbae..c975258 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -3,6 +3,7 @@ # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +import collections.abc import itertools import logging import math @@ -696,9 +697,12 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): if key == "type": parameter_choices[key] = value - elif type(value) == list: + # If it's a Sequence (usually list), choose one of the values at random. 
+ elif isinstance(value, collections.abc.Sequence): parameter_choices[key] = random.choice(value) - elif type(value) == dict: + # If it's a Mapping (usually dict), it defines a distribution from which + # the parameter should be sampled. + elif isinstance(value, collections.abc.Mapping): distribution = value["distribution"] low = value["low"] high = value["high"] From 65cb5ffb3ec28eb47ff8b9165132b871919bc95a Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 14:22:26 -0600 Subject: [PATCH 15/21] [#167] Pull the edge case logic for "type" out of _choose_randomized_parameters() --- .../link_step_train_test_models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index c975258..1c182ce 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -695,10 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices = dict() for key, value in model_parameters.items(): - if key == "type": - parameter_choices[key] = value # If it's a Sequence (usually list), choose one of the values at random. - elif isinstance(value, collections.abc.Sequence): + if isinstance(value, collections.abc.Sequence): parameter_choices[key] = random.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. @@ -757,7 +755,14 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return_parameters = [] for _ in range(num_samples): parameter_spec = random.choice(model_parameters) - randomized = _choose_randomized_parameters(parameter_spec) + model_type = parameter_spec["type"] + sample_parameters = dict( + (key, value) + for (key, value) in parameter_spec.items() + if key != "type" + ) + randomized = _choose_randomized_parameters(sample_parameters) + randomized["type"] = model_type return_parameters.append(randomized) return return_parameters From 1692c87452d984e48f12b51c82b24739c2360ffc Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 14:51:07 -0600 Subject: [PATCH 16/21] [#167] Support "pinned" parameters with model_parameter_search strategy randomized This lets users set some parameters to a particular value, and only sample others. It's mostly a convenience because previously you could get the same behavior by passing the parameter as a one-element list, like `maxDepth = [7]`. This commit introduces the extra convenience of just specifying the parameter as a value, like `maxDepth = 7`. So now you can do something like this: ``` [[training.model_parameters]] type = "random_forest" maxDepth = 7 numTrees = [1, 10, 20] subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9} ``` maxDepth will always be 7, numTrees will be randomly sampled from the list 1, 10, 20, and subsamplingRate will be sampled uniformly from the range [0.1, 0.9]. 
--- .../link_step_train_test_models.py | 7 ++-- hlink/tests/model_exploration_test.py | 34 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 1c182ce..54a3115 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices = dict() for key, value in model_parameters.items(): - # If it's a Sequence (usually list), choose one of the values at random. - if isinstance(value, collections.abc.Sequence): + # If it's a Sequence (usually list) but not a string, choose one of the values at random. + if isinstance(value, collections.abc.Sequence) and not isinstance(value, str): parameter_choices[key] = random.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. @@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices[key] = random.uniform(low, high) else: raise ValueError("unknown distribution") + # All other types (including strings) are passed through unchanged. else: - raise ValueError("can't handle value type") + parameter_choices[key] = value return parameter_choices diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 51f648f..33ee240 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 +def test_get_model_parameters_search_strategy_randomized_take_values(training_conf): + """ + If a value is neither a list nor a table, the "randomized" strategy just passes + it along as a value. This lets the user easily pin some parameters to a particular + value and randomize others. 
+ """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 25, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": 7, + "impurity": "entropy", + "minInfoGain": 0.5, + "numTrees": {"distribution": "randint", "low": 10, "high": 100}, + "subsamplingRate": [0.5, 1.0, 1.5], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert len(model_parameters) == 25 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "random_forest" + assert parameter_choice["maxDepth"] == 7 + assert parameter_choice["impurity"] == "entropy" + assert parameter_choice["minInfoGain"] == 0.5 + assert 10 <= parameter_choice["numTrees"] <= 100 + assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5} + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 0becd3234e69d9ceba4182fc9b776e9f492379b8 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:02:57 -0600 Subject: [PATCH 17/21] [#167] Respect training.seed when the search strategy is ""randomized" --- .../link_step_train_test_models.py | 16 ++++-- hlink/tests/model_exploration_test.py | 56 +++++++++++++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 54a3115..988ed8b 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -688,7 +688,9 @@ def _custom_param_grid_builder( return new_params -def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]: +def _choose_randomized_parameters( + rng: random.Random, model_parameters: dict[str, Any] +) -> dict[str, Any]: """ Choose a randomized setting of parameters from the given specification. """ @@ -697,7 +699,7 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): # If it's a Sequence (usually list) but not a string, choose one of the values at random. if isinstance(value, collections.abc.Sequence) and not isinstance(value, str): - parameter_choices[key] = random.choice(value) + parameter_choices[key] = rng.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. elif isinstance(value, collections.abc.Mapping): @@ -706,9 +708,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, high = value["high"] if distribution == "randint": - parameter_choices[key] = random.randint(low, high) + parameter_choices[key] = rng.randint(low, high) elif distribution == "uniform": - parameter_choices[key] = random.uniform(low, high) + parameter_choices[key] = rng.uniform(low, high) else: raise ValueError("unknown distribution") # All other types (including strings) are passed through unchanged. 
@@ -737,6 +739,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") + seed = training_config.get("seed") use_param_grid = training_config.get("param_grid", False) if model_parameters == []: @@ -752,17 +755,18 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) elif strategy == "randomized": num_samples = model_parameter_search["num_samples"] + rng = random.Random(seed) return_parameters = [] for _ in range(num_samples): - parameter_spec = random.choice(model_parameters) + parameter_spec = rng.choice(model_parameters) model_type = parameter_spec["type"] sample_parameters = dict( (key, value) for (key, value) in parameter_spec.items() if key != "type" ) - randomized = _choose_randomized_parameters(sample_parameters) + randomized = _choose_randomized_parameters(rng, sample_parameters) randomized["type"] = model_type return_parameters.append(randomized) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 33ee240..3af04da 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -2,6 +2,7 @@ # For copyright and licensing information, see the NOTICE and LICENSE files # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +from collections import Counter import pytest import pandas as pd @@ -431,6 +432,61 @@ def test_get_model_parameters_search_strategy_randomized_take_values(training_co assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5} +def test_get_model_parameters_search_strategy_randomized_multiple_models(training_conf): + """ + When there are multiple models for the "randomized" strategy, it randomly + samples the model before sampling the parameters for that model. Setting + the training.seed attribute lets us assert more precisely the counts for + each model type. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 100, + } + training_conf["training"]["seed"] = 101 + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "minInfoGain": {"distribution": "uniform", "low": 0.1, "high": 0.9}, + }, + {"type": "probit"}, + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + counter = Counter(parameter_choice["type"] for parameter_choice in model_parameters) + assert counter["random_forest"] == 47 + assert counter["probit"] == 53 + + +def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf): + """ + The "randomized" strategy uses training.seed to allow reproducible runs. 
+ """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 5, + } + training_conf["training"]["seed"] = 35830969 + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": {"distribution": "randint", "low": 1, "high": 10}, + "numTrees": [1, 10, 100, 1000], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 8, "numTrees": 100}, + {"type": "random_forest", "maxDepth": 2, "numTrees": 1}, + {"type": "random_forest", "maxDepth": 4, "numTrees": 100}, + {"type": "random_forest", "maxDepth": 9, "numTrees": 10}, + {"type": "random_forest", "maxDepth": 7, "numTrees": 100}, + ] + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 5d0ea0baaa7494172f0396ddb6c78f82c78429cf Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:21:20 -0600 Subject: [PATCH 18/21] [#167] Add a normal distribution to randomized parameter search --- .../model_exploration/link_step_train_test_models.py | 10 ++++++++-- hlink/tests/model_exploration_test.py | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 988ed8b..452bcc1 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -704,13 +704,19 @@ def _choose_randomized_parameters( # the parameter should be sampled. elif isinstance(value, collections.abc.Mapping): distribution = value["distribution"] - low = value["low"] - high = value["high"] if distribution == "randint": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.randint(low, high) elif distribution == "uniform": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.uniform(low, high) + elif distribution == "normal": + mean = value["mean"] + stdev = value["standard_deviation"] + parameter_choices[key] = rng.normalvariate(mean, stdev) else: raise ValueError("unknown distribution") # All other types (including strings) are passed through unchanged. diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 3af04da..8f31aaa 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -385,6 +385,11 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio "type": "decision_tree", "maxDepth": {"distribution": "randint", "low": 1, "high": 20}, "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0}, + "minWeightFractionPerNode": { + "distribution": "normal", + "mean": 10.0, + "standard_deviation": 2.5, + }, } ] @@ -396,6 +401,10 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio assert parameter_choice["type"] == "decision_tree" assert 1 <= parameter_choice["maxDepth"] <= 20 assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 + # Technically a normal distribution can return any value, even ones very + # far from its mean. So we can't assert on the value returned here. But + # there definitely should be a value of some sort in the dictionary. 
+ assert "minWeightFractionPerNode" in parameter_choice def test_get_model_parameters_search_strategy_randomized_take_values(training_conf): From 943fc0a56a5ae0985c2056164dba67d4ca706dfe Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:28:45 -0600 Subject: [PATCH 19/21] [#167] Improve the "unknown distribution" error message --- .../link_step_train_test_models.py | 4 +++- hlink/tests/model_exploration_test.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 452bcc1..e700285 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -718,7 +718,9 @@ def _choose_randomized_parameters( stdev = value["standard_deviation"] parameter_choices[key] = rng.normalvariate(mean, stdev) else: - raise ValueError("unknown distribution") + raise ValueError( + f"Unknown distribution '{distribution}'. Please choose one of 'randint', 'uniform', or 'normal'." + ) # All other types (including strings) are passed through unchanged. else: parameter_choices[key] = value diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 8f31aaa..1aeef9c 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -496,6 +496,30 @@ def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf ] +def test_get_model_parameters_search_strategy_randomized_unknown_distribution( + training_conf, +): + """ + Passing a distrbution other than "uniform", "randint", or "normal" is an error. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 10, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "minInfoGain": {"distribution": "laplace", "location": 0.0, "scale": 1.0}, + } + ] + + with pytest.raises( + ValueError, + match="Unknown distribution 'laplace'. Please choose one of 'randint', 'uniform', or 'normal'.", + ): + _get_model_parameters(training_conf["training"]) + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 0f99e1b0ad2adf8058081fa29a32b74b78cb37d9 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 12:48:13 -0600 Subject: [PATCH 20/21] [#167] Don't randomize threshold or threshold_ratio Only the hyper-parameters to the model should be affected by training.model_parameter_search.strategy. thresholds and threshold_ratios should be passed through unchanged on each model. --- .../link_step_train_test_models.py | 22 ++++++++++----- hlink/tests/model_exploration_test.py | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index e700285..909309a 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -766,17 +766,25 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any rng = random.Random(seed) return_parameters = [] + # These keys are special and should not be sampled or modified. All + # other keys are hyper-parameters to the model and should be sampled. 
+ frozen_keys = {"type", "threshold", "threshold_ratio"} for _ in range(num_samples): parameter_spec = rng.choice(model_parameters) - model_type = parameter_spec["type"] - sample_parameters = dict( - (key, value) + sample_parameters = { + key: value for (key, value) in parameter_spec.items() - if key != "type" - ) + if key not in frozen_keys + } + frozen_parameters = { + key: value + for (key, value) in parameter_spec.items() + if key in frozen_keys + } + randomized = _choose_randomized_parameters(rng, sample_parameters) - randomized["type"] = model_type - return_parameters.append(randomized) + result = {**frozen_parameters, **randomized} + return_parameters.append(result) return return_parameters else: diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 1aeef9c..b58bfd1 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -520,6 +520,34 @@ def test_get_model_parameters_search_strategy_randomized_unknown_distribution( _get_model_parameters(training_conf["training"]) +def test_get_model_parameters_search_strategy_randomized_thresholds(training_conf): + """ + Even when the model parameters are selected with strategy "randomized", the + thresholds are still treated with a "grid" strategy. + _get_model_parameters() is not in charge of creating the threshold matrix, + so it passes the threshold and threshold_ratio through unchanged. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 25, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [1, 10, 100], + "threshold": [0.3, 0.5, 0.7, 0.8, 0.9], + "threshold_ratio": 1.2, + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "random_forest" + assert parameter_choice["threshold"] == [0.3, 0.5, 0.7, 0.8, 0.9] + assert parameter_choice["threshold_ratio"] == 1.2 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 7fed016326d2017d7b4f4ab3e4aea643ebac2626 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 15:01:20 -0600 Subject: [PATCH 21/21] [#167] Add a test for the unknown strategy error condition --- .../link_step_train_test_models.py | 5 ++++- hlink/tests/model_exploration_test.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 909309a..cb3801e 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -788,7 +788,10 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return return_parameters else: - raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") + raise ValueError( + f"Unknown model_parameter_search strategy '{strategy}'. " + "Please choose one of 'explicit', 'grid', or 'randomized'." 
+ ) elif use_param_grid: return _custom_param_grid_builder(model_parameters) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index b58bfd1..a438995 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -548,6 +548,20 @@ def test_get_model_parameters_search_strategy_randomized_thresholds(training_con assert parameter_choice["threshold_ratio"] == 1.2 +def test_get_model_parameters_unknown_search_strategy(training_conf): + training_conf["training"]["model_parameter_search"] = { + "strategy": "something", + } + training_conf["training"]["model_parameters"] = [{"type": "probit"}] + + with pytest.raises( + ValueError, + match="Unknown model_parameter_search strategy 'something'. " + "Please choose one of 'explicit', 'grid', or 'randomized'.", + ): + _parameters = _get_model_parameters(training_conf["training"]) + + # ------------------------------------- # Tests that probably should be moved # -------------------------------------
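
For reference, the training.model_parameter_search setting added by this series could be configured roughly as in the sketch below. This is an illustrative example only, assembled from the commit messages and tests above; the strategies, distributions, and parameter names shown are the ones exercised in the patches, and the specific values are made up for illustration.

```
[training]
# Sample 20 hyper-parameter combinations at random. The other accepted
# strategies are "grid" and "explicit".
model_parameter_search = {strategy = "randomized", num_samples = 20}
# Used by the "randomized" strategy so that sampling is reproducible.
seed = 2024

[[training.model_parameters]]
type = "random_forest"
# Dict values define a distribution to sample from: "randint", "uniform", or "normal".
maxDepth = {distribution = "randint", low = 1, high = 20}
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
# List values are sampled by choosing one element at random.
numTrees = [25, 50, 100]
# threshold and threshold_ratio are not sampled; they pass through unchanged.
threshold = 0.8
threshold_ratio = 1.3
```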