Skip to content

Commit

Permalink
[#167] Support "pinned" parameters with model_parameter_search strate…
Browse files Browse the repository at this point in the history
…gy randomized

This lets users set some parameters to a particular value, and only sample
others.  It's mostly a convenience because previously you could get the same
behavior by passing the parameter as a one-element list, like `maxDepth = [7]`.

This commit introduces the extra convenience of just specifying the parameter
as a value, like `maxDepth = 7`. So now you can do something like this:

```
[[training.model_parameters]]
type = "random_forest"
maxDepth = 7
numTrees = [1, 10, 20]
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
```

`maxDepth` will always be 7, `numTrees` will be randomly chosen from the list
[1, 10, 20], and `subsamplingRate` will be sampled uniformly from the range
[0.1, 0.9].
  • Loading branch information
riley-harper committed Nov 27, 2024
1 parent 65cb5ff commit 1692c87
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
parameter_choices = dict()

for key, value in model_parameters.items():
# If it's a Sequence (usually list), choose one of the values at random.
if isinstance(value, collections.abc.Sequence):
# If it's a Sequence (usually list) but not a string, choose one of the values at random.
if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
parameter_choices[key] = random.choice(value)
# If it's a Mapping (usually dict), it defines a distribution from which
# the parameter should be sampled.
Expand All @@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
parameter_choices[key] = random.uniform(low, high)
else:
raise ValueError("unknown distribution")
# All other types (including strings) are passed through unchanged.
else:
raise ValueError("can't handle value type")
parameter_choices[key] = value

return parameter_choices

Expand Down
34 changes: 34 additions & 0 deletions hlink/tests/model_exploration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0


def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):
    """
    Check that plain values are pinned by the "randomized" search strategy.

    A parameter given as a bare value (not a list and not a table) should come
    back unchanged in every sampled parameter set, while list-valued and
    table-valued parameters in the same entry are still sampled.
    """
    search_settings = {
        "strategy": "randomized",
        "num_samples": 25,
    }
    random_forest_spec = {
        "type": "random_forest",
        "maxDepth": 7,
        "impurity": "entropy",
        "minInfoGain": 0.5,
        "numTrees": {"distribution": "randint", "low": 10, "high": 100},
        "subsamplingRate": [0.5, 1.0, 1.5],
    }
    training_conf["training"]["model_parameter_search"] = search_settings
    training_conf["training"]["model_parameters"] = [random_forest_spec]

    sampled_parameters = _get_model_parameters(training_conf["training"])

    # One parameter set is expected per requested sample.
    assert len(sampled_parameters) == 25

    # Parameters given as bare values must come through untouched.
    pinned_values = {
        "type": "random_forest",
        "maxDepth": 7,
        "impurity": "entropy",
        "minInfoGain": 0.5,
    }
    for sample in sampled_parameters:
        for name, expected in pinned_values.items():
            assert sample[name] == expected
        # Sampled parameters must stay within what the config allows.
        assert 10 <= sample["numTrees"] <= 100
        assert sample["subsamplingRate"] in {0.5, 1.0, 1.5}


# -------------------------------------
# Tests that probably should be moved
# -------------------------------------
Expand Down

0 comments on commit 1692c87

Please sign in to comment.