From 1692c87452d984e48f12b51c82b24739c2360ffc Mon Sep 17 00:00:00 2001
From: rileyh
Date: Wed, 27 Nov 2024 14:51:07 -0600
Subject: [PATCH] [#167] Support "pinned" parameters with model_parameter_search
 strategy randomized

This lets users pin some parameters to a particular value and sample only
the others. It is mostly a convenience: previously you could get the same
behavior by passing the parameter as a one-element list, like
`maxDepth = [7]`. This commit adds the extra convenience of specifying the
parameter directly as a value, like `maxDepth = 7`. So now you can do
something like this:

```
[[training.model_parameters]]
type = "random_forest"
maxDepth = 7
numTrees = [1, 10, 20]
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
```

With this configuration, maxDepth is always 7, numTrees is chosen uniformly
at random from the list [1, 10, 20], and subsamplingRate is sampled
uniformly from the range [0.1, 0.9].
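For illustration, a single randomized draw from the table above behaves
roughly like the sketch below. `choose_one` is a hypothetical, simplified
stand-in for `_choose_randomized_parameters` in the diff that follows, and
it covers only the `randint` and `uniform` distributions that this patch's
tests exercise:

```
import collections.abc
import random
from typing import Any


def choose_one(model_parameters: dict[str, Any]) -> dict[str, Any]:
    choices = {}
    for key, value in model_parameters.items():
        if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
            # A list: pick one of its elements uniformly at random.
            choices[key] = random.choice(value)
        elif isinstance(value, collections.abc.Mapping):
            # A table: sample from the named distribution.
            if value["distribution"] == "randint":
                choices[key] = random.randint(value["low"], value["high"])
            elif value["distribution"] == "uniform":
                choices[key] = random.uniform(value["low"], value["high"])
            else:
                raise ValueError("unknown distribution")
        else:
            # Anything else (ints, floats, strings, ...) is pinned as-is.
            choices[key] = value
    return choices


print(choose_one({
    "maxDepth": 7,
    "numTrees": [1, 10, 20],
    "subsamplingRate": {"distribution": "uniform", "low": 0.1, "high": 0.9},
}))
# e.g. {'maxDepth': 7, 'numTrees': 20, 'subsamplingRate': 0.6523...}
```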
---
 .../link_step_train_test_models.py    |  7 ++--
 hlink/tests/model_exploration_test.py | 34 +++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 1c182ce..54a3115 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
 
     parameter_choices = dict()
     for key, value in model_parameters.items():
-        # If it's a Sequence (usually list), choose one of the values at random.
-        if isinstance(value, collections.abc.Sequence):
+        # If it's a Sequence (usually list) but not a string, choose one of the values at random.
+        if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
             parameter_choices[key] = random.choice(value)
         # If it's a Mapping (usually dict), it defines a distribution from which
         # the parameter should be sampled.
@@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
                 parameter_choices[key] = random.uniform(low, high)
             else:
                 raise ValueError("unknown distribution")
+        # All other types (including strings) are passed through unchanged.
         else:
-            raise ValueError("can't handle value type")
+            parameter_choices[key] = value
 
     return parameter_choices
 
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 51f648f..33ee240 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
         assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0
 
 
+def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):
+    """
+    If a value is neither a list nor a table, the "randomized" strategy just
+    passes it along as a value. This lets the user easily pin some parameters
+    to a particular value and randomize others.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 25,
+    }
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": 7,
+            "impurity": "entropy",
+            "minInfoGain": 0.5,
+            "numTrees": {"distribution": "randint", "low": 10, "high": 100},
+            "subsamplingRate": [0.5, 1.0, 1.5],
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    assert len(model_parameters) == 25
+
+    for parameter_choice in model_parameters:
+        assert parameter_choice["type"] == "random_forest"
+        assert parameter_choice["maxDepth"] == 7
+        assert parameter_choice["impurity"] == "entropy"
+        assert parameter_choice["minInfoGain"] == 0.5
+        assert 10 <= parameter_choice["numTrees"] <= 100
+        assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------