Skip to content

Commit

Permalink
[#167] Don't randomize threshold or threshold_ratio
Browse files Browse the repository at this point in the history
Only the model's hyper-parameters should be affected by
training.model_parameter_search.strategy. The threshold and
threshold_ratio values should be passed through unchanged for each model.
  • Loading branch information
riley-harper committed Dec 2, 2024
1 parent 943fc0a commit 0f99e1b
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 7 deletions.
22 changes: 15 additions & 7 deletions hlink/linking/model_exploration/link_step_train_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,17 +766,25 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
rng = random.Random(seed)

return_parameters = []
# These keys are special and should not be sampled or modified. All
# other keys are hyper-parameters to the model and should be sampled.
frozen_keys = {"type", "threshold", "threshold_ratio"}
for _ in range(num_samples):
parameter_spec = rng.choice(model_parameters)
model_type = parameter_spec["type"]
sample_parameters = dict(
(key, value)
sample_parameters = {
key: value
for (key, value) in parameter_spec.items()
if key != "type"
)
if key not in frozen_keys
}
frozen_parameters = {
key: value
for (key, value) in parameter_spec.items()
if key in frozen_keys
}

randomized = _choose_randomized_parameters(rng, sample_parameters)
randomized["type"] = model_type
return_parameters.append(randomized)
result = {**frozen_parameters, **randomized}
return_parameters.append(result)

return return_parameters
else:
Expand Down
28 changes: 28 additions & 0 deletions hlink/tests/model_exploration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,34 @@ def test_get_model_parameters_search_strategy_randomized_unknown_distribution(
_get_model_parameters(training_conf["training"])


def test_get_model_parameters_search_strategy_randomized_thresholds(training_conf):
    """
    The "randomized" search strategy must leave thresholds alone.

    Building the threshold matrix is not the job of _get_model_parameters(),
    so the threshold and threshold_ratio entries of a model specification are
    expected to come back exactly as they were configured, in every sampled
    parameter set, alongside the unchanged model type.
    """
    random_forest_spec = {
        "type": "random_forest",
        "maxDepth": [1, 10, 100],
        "threshold": [0.3, 0.5, 0.7, 0.8, 0.9],
        "threshold_ratio": 1.2,
    }
    search_settings = {
        "strategy": "randomized",
        "num_samples": 25,
    }

    training_section = training_conf["training"]
    training_section["model_parameter_search"] = search_settings
    training_section["model_parameters"] = [random_forest_spec]

    sampled_choices = _get_model_parameters(training_conf["training"])

    # Every sample must carry the frozen keys through untouched.
    for choice in sampled_choices:
        assert choice["type"] == "random_forest"
        assert choice["threshold"] == [0.3, 0.5, 0.7, 0.8, 0.9]
        assert choice["threshold_ratio"] == 1.2


# -------------------------------------
# Tests that probably should be moved
# -------------------------------------
Expand Down

0 comments on commit 0f99e1b

Please sign in to comment.