From c5f5b13c0125e17375978d244ab70eef4a8cc9f4 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:20:36 -0600 Subject: [PATCH 01/21] [#167] Pull _custom_param_grid_builder() out of the LinkStepTrainTestModels class --- .../link_step_train_test_models.py | 63 ++++++++++--------- hlink/tests/model_exploration_test.py | 6 +- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 8e391b8..347a9ad 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -244,35 +244,6 @@ def _get_splits( return splits - def _custom_param_grid_builder(self, conf: dict[str, Any]) -> list[dict[str, Any]]: - print("Building param grid for models") - given_parameters = conf[f"{self.task.training_conf}"]["model_parameters"] - new_params = [] - for run in given_parameters: - params = run.copy() - model_type = params.pop("type") - - # dropping thresholds to prep for scikitlearn model exploration refactor - threshold = params.pop("threshold", False) - threshold_ratio = params.pop("threshold_ratio", False) - - keys = params.keys() - values = params.values() - - params_exploded = [] - for prod in itertools.product(*values): - params_exploded.append(dict(zip(keys, prod))) - - for subdict in params_exploded: - subdict["type"] = model_type - if threshold: - subdict["threshold"] = threshold - if threshold_ratio: - subdict["threshold_ratio"] = threshold_ratio - - new_params.extend(params_exploded) - return new_params - def _capture_results( self, predictions: pyspark.sql.DataFrame, @@ -332,7 +303,7 @@ def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = conf[training_conf]["model_parameters"] if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = self._custom_param_grid_builder(conf) + model_parameters = _custom_param_grid_builder(training_conf, conf) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
@@ -691,3 +662,35 @@ def _create_desc_df() -> pd.DataFrame: "mcc_train_sd", ] ) + + +def _custom_param_grid_builder( + training_conf: str, conf: dict[str, Any] +) -> list[dict[str, Any]]: + print("Building param grid for models") + given_parameters = conf[training_conf]["model_parameters"] + new_params = [] + for run in given_parameters: + params = run.copy() + model_type = params.pop("type") + + # dropping thresholds to prep for scikitlearn model exploration refactor + threshold = params.pop("threshold", False) + threshold_ratio = params.pop("threshold_ratio", False) + + keys = params.keys() + values = params.values() + + params_exploded = [] + for prod in itertools.product(*values): + params_exploded.append(dict(zip(keys, prod))) + + for subdict in params_exploded: + subdict["type"] = model_type + if threshold: + subdict["threshold"] = threshold + if threshold_ratio: + subdict["threshold_ratio"] = threshold_ratio + + new_params.extend(params_exploded) + return new_params diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index e0cf593..42e1364 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -9,6 +9,7 @@ import hlink.linking.core.threshold as threshold_core from hlink.linking.model_exploration.link_step_train_test_models import ( LinkStepTrainTestModels, + _custom_param_grid_builder, ) @@ -121,7 +122,7 @@ def test_all( main.do_drop_all("") -def test_step_2_param_grid(spark, main, training_conf, model_exploration, fake_self): +def test_step_2_param_grid(main, training_conf): """Test matching step 2 training to see if the custom param grid builder is working""" training_conf["training"]["model_parameters"] = [ @@ -129,8 +130,7 @@ def test_step_2_param_grid(spark, main, training_conf, model_exploration, fake_s {"type": "probit", "threshold": [0.5, 0.7]}, ] - link_step = LinkStepTrainTestModels(model_exploration) - param_grid = link_step._custom_param_grid_builder(training_conf) + param_grid = _custom_param_grid_builder("training", training_conf) expected = [ {"maxDepth": 3, "numTrees": 50, "type": "random_forest"}, From 605369b93bd201970ed9b97f09a8953b3c456efa Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:29:28 -0600 Subject: [PATCH 02/21] [#167] Simplify the interface to _custom_param_grid_builder() We can just pass the list of model_parameters from the config file to this function. --- .../model_exploration/link_step_train_test_models.py | 6 +++--- hlink/tests/model_exploration_test.py | 12 ++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 347a9ad..7c03404 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -303,7 +303,7 @@ def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = conf[training_conf]["model_parameters"] if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = _custom_param_grid_builder(training_conf, conf) + model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
@@ -665,10 +665,10 @@ def _create_desc_df() -> pd.DataFrame: def _custom_param_grid_builder( - training_conf: str, conf: dict[str, Any] + model_parameters: list[dict[str, Any]] ) -> list[dict[str, Any]]: print("Building param grid for models") - given_parameters = conf[training_conf]["model_parameters"] + given_parameters = model_parameters new_params = [] for run in given_parameters: params = run.copy() diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 42e1364..03b53d5 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -122,15 +122,13 @@ def test_all( main.do_drop_all("") -def test_step_2_param_grid(main, training_conf): - """Test matching step 2 training to see if the custom param grid builder is working""" - - training_conf["training"]["model_parameters"] = [ +def test_custom_param_grid_builder(): + """Test matching step 2's custom param grid builder""" + model_parameters = [ {"type": "random_forest", "maxDepth": [3, 4, 5], "numTrees": [50, 100]}, {"type": "probit", "threshold": [0.5, 0.7]}, ] - - param_grid = _custom_param_grid_builder("training", training_conf) + param_grid = _custom_param_grid_builder(model_parameters) expected = [ {"maxDepth": 3, "numTrees": 50, "type": "random_forest"}, @@ -145,8 +143,6 @@ def test_step_2_param_grid(main, training_conf): assert len(param_grid) == len(expected) assert all(m in expected for m in param_grid) - main.do_drop_all("") - # ------------------------------------- # Tests that probably should be moved From 2204152a2f4252b19a029e8de156cce98513e369 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 14:44:03 -0600 Subject: [PATCH 03/21] [#167] Pull _get_model_parameters() out of the LinkStep class This will make this piece of code easier to understand and test. --- .../link_step_train_test_models.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 7c03404..9ef97ee 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -67,7 +67,7 @@ def _run(self) -> None: splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed) - model_parameters = self._get_model_parameters(config) + model_parameters = _get_model_parameters(training_conf, config) logger.info( f"There are {len(model_parameters)} sets of model parameters to explore; " @@ -298,18 +298,6 @@ def _capture_results( ) return pd.concat([results_df, new_results], ignore_index=True) - def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]: - training_conf = str(self.task.training_conf) - - model_parameters = conf[training_conf]["model_parameters"] - if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) - elif model_parameters == []: - raise ValueError( - "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
- ) - return model_parameters - def _save_training_results( self, desc_df: pd.DataFrame, spark: pyspark.sql.SparkSession ) -> None: @@ -694,3 +682,16 @@ def _custom_param_grid_builder( new_params.extend(params_exploded) return new_params + + +def _get_model_parameters( + training_conf: str, conf: dict[str, Any] +) -> list[dict[str, Any]]: + model_parameters = conf[training_conf]["model_parameters"] + if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: + model_parameters = _custom_param_grid_builder(model_parameters) + elif model_parameters == []: + raise ValueError( + "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." + ) + return model_parameters From 7d483801edb104decc72ab979d9aca905535d4fb Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:05:35 -0600 Subject: [PATCH 04/21] [#167] Add a few tests for _get_model_parameters() --- hlink/tests/model_exploration_test.py | 57 +++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 03b53d5..e349500 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -10,6 +10,7 @@ from hlink.linking.model_exploration.link_step_train_test_models import ( LinkStepTrainTestModels, _custom_param_grid_builder, + _get_model_parameters, ) @@ -144,6 +145,62 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) +def test_get_model_parameters_no_param_grid_attribute(training_conf): + """ + When there's no training.param_grid attribute, the default is to use the "explicit" + strategy, testing each element of model_parameters in turn. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, + {"type": "probit", "threshold": 0.7}, + ] + assert "param_grid" not in training_conf["training"] + + model_parameters = _get_model_parameters("training", training_conf) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, + {"type": "probit", "threshold": 0.7}, + ] + + +def test_get_model_parameters_param_grid_false(training_conf): + """ + When training.param_grid is set to False, model exploration uses the "explicit" + strategy. The model_parameters are returned unchanged. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, + ] + training_conf["training"]["param_grid"] = False + + model_parameters = _get_model_parameters("training", training_conf) + + assert model_parameters == [ + {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, + ] + + +def test_get_model_parameters_param_grid_true(training_conf): + """ + When training.param_grid is set to True, model exploration uses the "grid" + strategy, exploding model_parameters. 
+ """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["training"]["param_grid"] = True + + model_parameters = _get_model_parameters("training", training_conf) + # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From bc0bf7d6d254c60f580ba3c192ac93a96b449660 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:23:47 -0600 Subject: [PATCH 05/21] [#167] Just pass the training section of the config to _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++------ hlink/tests/model_exploration_test.py | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 9ef97ee..47c0a8d 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -67,7 +67,7 @@ def _run(self) -> None: splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed) - model_parameters = _get_model_parameters(training_conf, config) + model_parameters = _get_model_parameters(config[training_conf]) logger.info( f"There are {len(model_parameters)} sets of model parameters to explore; " @@ -684,11 +684,9 @@ def _custom_param_grid_builder( return new_params -def _get_model_parameters( - training_conf: str, conf: dict[str, Any] -) -> list[dict[str, Any]]: - model_parameters = conf[training_conf]["model_parameters"] - if "param_grid" in conf[training_conf] and conf[training_conf]["param_grid"]: +def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: + model_parameters = training_config["model_parameters"] + if "param_grid" in training_config and training_config["param_grid"]: model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index e349500..facd03b 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -156,7 +156,7 @@ def test_get_model_parameters_no_param_grid_attribute(training_conf): ] assert "param_grid" not in training_conf["training"] - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) assert model_parameters == [ {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, @@ -174,7 +174,7 @@ def test_get_model_parameters_param_grid_false(training_conf): ] training_conf["training"]["param_grid"] = False - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) assert model_parameters == [ {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, @@ -196,7 +196,7 @@ def test_get_model_parameters_param_grid_true(training_conf): ] training_conf["training"]["param_grid"] = True - model_parameters = _get_model_parameters("training", training_conf) + model_parameters = _get_model_parameters(training_conf["training"]) # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings assert len(model_parameters) == 6 From 
8be8806839018db70296437ad9bed3a21050dced Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 15:44:47 -0600 Subject: [PATCH 06/21] [#167] Add a couple of tests for the new training.model_parameter_search setting One of these tests is failing because we haven't implemented this logic in the _get_model_parameters() function yet. --- hlink/tests/model_exploration_test.py | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index facd03b..9bf58c6 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -201,6 +201,51 @@ def test_get_model_parameters_param_grid_true(training_conf): assert len(model_parameters) == 6 +def test_get_model_parameters_search_strategy_explicit(training_conf): + """ + When training.model_parameter_search.strategy is set to "explicit", + model_parameters pass through unchanged. + """ + training_conf["training"]["model_parameters"] = [ + {"type": "random_forest", "maxDepth": 15, "numTrees": 100, "threshold": 0.5}, + {"type": "probit", "threshold": 0.8, "threshold_ratio": 1.3}, + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "explicit", + } + assert "param_grid" not in training_conf["training"] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 15, "numTrees": 100, "threshold": 0.5}, + {"type": "probit", "threshold": 0.8, "threshold_ratio": 1.3}, + ] + + +def test_get_model_parameters_search_strategy_grid(training_conf): + """ + When training.model_parameter_search.strategy is set to "grid", + model_parameters are exploded. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["model_parameter_search"] = { + "strategy": "grid", + } + assert "param_grid" not in training_conf + + model_parameters = _get_model_parameters(training_conf["training"]) + # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From a939ec2f064ac7607d937e7ffce783b935c08164 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 16:00:24 -0600 Subject: [PATCH 07/21] [#167] Look for training.model_parameter_search in _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++++++++ hlink/tests/model_exploration_test.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 47c0a8d..3e9853a 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -686,8 +686,18 @@ def _custom_param_grid_builder( def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: model_parameters = training_config["model_parameters"] + model_parameter_search = training_config.get("model_parameter_search") + if "param_grid" in training_config and training_config["param_grid"]: model_parameters = _custom_param_grid_builder(model_parameters) + elif model_parameter_search is not None: + strategy = model_parameter_search["strategy"] + if strategy == "explicit": + 
return model_parameters + elif strategy == "grid": + return _custom_param_grid_builder(model_parameters) + else: + raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 9bf58c6..9a86526 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -236,7 +236,7 @@ def test_get_model_parameters_search_strategy_grid(training_conf): "threshold": 0.5, }, ] - training_conf["model_parameter_search"] = { + training_conf["training"]["model_parameter_search"] = { "strategy": "grid", } assert "param_grid" not in training_conf From 801582e0f629d2c250798e630730f4704bbf49a1 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 26 Nov 2024 16:14:49 -0600 Subject: [PATCH 08/21] [#167] Make sure that model_parameter_search takes precedence over param_grid --- .../link_step_train_test_models.py | 6 +- hlink/tests/model_exploration_test.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 3e9853a..7ebe074 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -688,9 +688,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") - if "param_grid" in training_config and training_config["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) - elif model_parameter_search is not None: + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": return model_parameters @@ -698,6 +696,8 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") + elif "param_grid" in training_config and training_config["param_grid"]: + model_parameters = _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 9a86526..c560b19 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -145,16 +145,18 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) -def test_get_model_parameters_no_param_grid_attribute(training_conf): +def test_get_model_parameters_default_behavior(training_conf): """ - When there's no training.param_grid attribute, the default is to use the "explicit" - strategy, testing each element of model_parameters in turn. + When there's no training.param_grid attribute or + training.model_parameter_search attribute, the default is to use the + "explicit" strategy, testing each element of model_parameters in turn. 
""" training_conf["training"]["model_parameters"] = [ {"type": "random_forest", "maxDepth": 3, "numTrees": 50}, {"type": "probit", "threshold": 0.7}, ] assert "param_grid" not in training_conf["training"] + assert "model_parameter_search" not in training_conf["training"] model_parameters = _get_model_parameters(training_conf["training"]) @@ -246,6 +248,56 @@ def test_get_model_parameters_search_strategy_grid(training_conf): assert len(model_parameters) == 6 +def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( + training_conf, +): + """ + When both model_parameter_search and param_grid are set, model_parameter_search + takes precedence. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": 10, + "numTrees": 75, + "threshold": 0.7, + } + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "explicit", + } + # model_parameter_search takes precedence over this + training_conf["training"]["param_grid"] = True + + model_parameters = _get_model_parameters(training_conf["training"]) + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 10, "numTrees": 75, "threshold": 0.7} + ] + + +def test_get_model_parameters_search_strategy_grid_with_param_grid_false(training_conf): + """ + When both model_parameter_search and param_grid are set, model_parameter_search + takes precedence. + """ + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [5, 10, 15], + "numTrees": [50, 100], + "threshold": 0.5, + }, + ] + training_conf["training"]["model_parameter_search"] = { + "strategy": "grid", + } + # model_parameter_search takes precedence over this + training_conf["training"]["param_grid"] = False + + model_parameters = _get_model_parameters(training_conf["training"]) + assert len(model_parameters) == 6 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From a47688477cfa7c04417c5548b66993f8ff82ae1f Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 09:25:55 -0600 Subject: [PATCH 09/21] [#167] Print a deprecation warning for training.param_grid The new training.model_parameter_search is a more flexible version of param_grid. We still support param_grid, but eventually we will want to completely switch over to model_parameter_search instead. --- .../link_step_train_test_models.py | 17 +++++++++++ hlink/tests/model_exploration_test.py | 30 ++++++++++++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 7ebe074..d6dce8f 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -7,6 +7,8 @@ import logging import math import re +import sys +from textwrap import dedent from time import perf_counter from typing import Any import numpy as np @@ -688,6 +690,21 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") + if "param_grid" in training_config: + print( + dedent( + """\ + Deprecation Warning: training.param_grid is deprecated. 
+ + Please use training.model_parameter_search instead by replacing + + `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or + `param_grid = False` with `model_parameter_search = {strategy = "explicit"}` + + [deprecated_in_version=4.0.0]""" + ), + file=sys.stderr, + ) if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index c560b19..5a6957e 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -166,10 +166,12 @@ def test_get_model_parameters_default_behavior(training_conf): ] -def test_get_model_parameters_param_grid_false(training_conf): +def test_get_model_parameters_param_grid_false(training_conf, capsys): """ When training.param_grid is set to False, model exploration uses the "explicit" strategy. The model_parameters are returned unchanged. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, @@ -182,11 +184,16 @@ def test_get_model_parameters_param_grid_false(training_conf): {"type": "logistic_regression", "threshold": 0.3, "threshold_ratio": 1.4}, ] + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + -def test_get_model_parameters_param_grid_true(training_conf): +def test_get_model_parameters_param_grid_true(training_conf, capsys): """ When training.param_grid is set to True, model exploration uses the "grid" strategy, exploding model_parameters. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ { @@ -202,6 +209,9 @@ def test_get_model_parameters_param_grid_true(training_conf): # 3 settings for maxDepth * 2 settings for numTrees = 6 total settings assert len(model_parameters) == 6 + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + def test_get_model_parameters_search_strategy_explicit(training_conf): """ @@ -249,11 +259,13 @@ def test_get_model_parameters_search_strategy_grid(training_conf): def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( - training_conf, + training_conf, capsys ): """ When both model_parameter_search and param_grid are set, model_parameter_search takes precedence. + + This prints a deprecation warning because param_grid is deprecated. """ training_conf["training"]["model_parameters"] = [ { @@ -274,11 +286,18 @@ def test_get_model_parameters_search_strategy_explicit_with_param_grid_true( {"type": "random_forest", "maxDepth": 10, "numTrees": 75, "threshold": 0.7} ] + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + -def test_get_model_parameters_search_strategy_grid_with_param_grid_false(training_conf): +def test_get_model_parameters_search_strategy_grid_with_param_grid_false( + training_conf, capsys +): """ When both model_parameter_search and param_grid are set, model_parameter_search takes precedence. + + This prints a deprecation warning because param_grid is deprecated. 
""" training_conf["training"]["model_parameters"] = [ { @@ -297,6 +316,9 @@ def test_get_model_parameters_search_strategy_grid_with_param_grid_false(trainin model_parameters = _get_model_parameters(training_conf["training"]) assert len(model_parameters) == 6 + output = capsys.readouterr() + assert "Deprecation Warning: training.param_grid is deprecated" in output.err + # ------------------------------------- # Tests that probably should be moved From 8c724467738dd128124cabfd51bd377c2245ade1 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 10:38:20 -0600 Subject: [PATCH 10/21] [#167] Refactor _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index d6dce8f..99a929c 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -687,9 +687,6 @@ def _custom_param_grid_builder( def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: - model_parameters = training_config["model_parameters"] - model_parameter_search = training_config.get("model_parameter_search") - if "param_grid" in training_config: print( dedent( @@ -705,6 +702,11 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any ), file=sys.stderr, ) + + model_parameters = training_config["model_parameters"] + model_parameter_search = training_config.get("model_parameter_search") + use_param_grid = training_config.get("param_grid", False) + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": @@ -713,8 +715,8 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") - elif "param_grid" in training_config and training_config["param_grid"]: - model_parameters = _custom_param_grid_builder(model_parameters) + elif use_param_grid: + return _custom_param_grid_builder(model_parameters) elif model_parameters == []: raise ValueError( "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." 
From 896ad67756782fd59fad6903f4c70925a9ccff4a Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 10:57:42 -0600 Subject: [PATCH 11/21] [#167] Improve an error condition in _get_model_parameters() --- .../model_exploration/link_step_train_test_models.py | 10 ++++++---- hlink/tests/model_exploration_test.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 99a929c..fad2429 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -707,6 +707,11 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameter_search = training_config.get("model_parameter_search") use_param_grid = training_config.get("param_grid", False) + if model_parameters == []: + raise ValueError( + "model_parameters is empty, so there are no models to evaluate" + ) + if model_parameter_search is not None: strategy = model_parameter_search["strategy"] if strategy == "explicit": @@ -717,8 +722,5 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif use_param_grid: return _custom_param_grid_builder(model_parameters) - elif model_parameters == []: - raise ValueError( - "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'." - ) + return model_parameters diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 5a6957e..a64805c 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -145,6 +145,17 @@ def test_custom_param_grid_builder(): assert all(m in expected for m in param_grid) +def test_get_model_parameters_error_if_list_empty(training_conf): + """ + It's an error if the model_parameters list is empty, since in that case there + aren't any models to evaluate. + """ + training_conf["training"]["model_parameters"] = [] + + with pytest.raises(ValueError, match="model_parameters is empty"): + _get_model_parameters(training_conf["training"]) + + def test_get_model_parameters_default_behavior(training_conf): """ When there's no training.param_grid attribute or From 46da4cb1ee66312a2e52e347d00092426f0744fa Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 11:28:57 -0600 Subject: [PATCH 12/21] [#167] Start supporting a randomized strategy which can randomly sample from lists --- .../link_step_train_test_models.py | 26 ++++++++++++++++ hlink/tests/model_exploration_test.py | 31 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index fad2429..e6d7437 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -6,6 +6,7 @@ import itertools import logging import math +import random import re import sys from textwrap import dedent @@ -686,6 +687,21 @@ def _custom_param_grid_builder( return new_params +def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]: + """ + Choose a randomized setting of parameters from the given specification. 
+ """ + parameter_choices = dict() + + for key, value in model_parameters.items(): + if key == "type": + parameter_choices[key] = value + else: + parameter_choices[key] = random.choice(value) + + return parameter_choices + + def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any]]: if "param_grid" in training_config: print( @@ -718,6 +734,16 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return model_parameters elif strategy == "grid": return _custom_param_grid_builder(model_parameters) + elif strategy == "randomized": + num_samples = model_parameter_search["num_samples"] + + return_parameters = [] + for _ in range(num_samples): + parameter_spec = random.choice(model_parameters) + randomized = _choose_randomized_parameters(parameter_spec) + return_parameters.append(randomized) + + return return_parameters else: raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") elif use_param_grid: diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index a64805c..bb272be 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -331,6 +331,37 @@ def test_get_model_parameters_search_strategy_grid_with_param_grid_false( assert "Deprecation Warning: training.param_grid is deprecated" in output.err +def test_get_model_parameters_search_strategy_randomized_sample_from_lists( + training_conf, +): + """ + Strategy "randomized" accepts lists for parameter values, but it does not work + the same way as the "grid" strategy. It randomly samples values from the lists + num_samples times to create parameter combinations. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 37, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "maxDepth": [1, 5, 10, 20], + "maxBins": [10, 20, 40], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + # Note that if we used strategy grid, we would get a list of length 4 * 3 = 12 instead + assert len(model_parameters) == 37 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "decision_tree" + assert parameter_choice["maxDepth"] in {1, 5, 10, 20} + assert parameter_choice["maxBins"] in {10, 20, 40} + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 51b4144701c2ff79da1f4834dc5ed7a580e30763 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 11:55:45 -0600 Subject: [PATCH 13/21] [#167] Support some simple distributions for randomized parameter search - randint returns a random integer in an inclusive range - uniform returns a random float in an inclusive range --- .../link_step_train_test_models.py | 15 +++++++- hlink/tests/model_exploration_test.py | 35 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index e6d7437..fc9bbae 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -696,8 +696,21 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): if key == "type": parameter_choices[key] = value - else: + elif type(value) == list: parameter_choices[key] = 
random.choice(value) + elif type(value) == dict: + distribution = value["distribution"] + low = value["low"] + high = value["high"] + + if distribution == "randint": + parameter_choices[key] = random.randint(low, high) + elif distribution == "uniform": + parameter_choices[key] = random.uniform(low, high) + else: + raise ValueError("unknown distribution") + else: + raise ValueError("can't handle value type") return parameter_choices diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index bb272be..51f648f 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -362,6 +362,41 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_lists( assert parameter_choice["maxBins"] in {10, 20, 40} +def test_get_model_parameters_search_strategy_randomized_sample_from_distributions( + training_conf, +): + """ + The "randomized" strategy also accepts dictionary values for parameters. + These dictionaries define distributions from which the parameters should be + sampled. + + For example, {"distribution": "randint", "low": 1, "high": 20} means to + pick a random integer between 1 and 20, each integer with an equal chance. + And {"distribution": "uniform", "low": 0.0, "high": 100.0} means to pick a + random float between 0.0 and 100.0 with a uniform distribution. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 15, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "maxDepth": {"distribution": "randint", "low": 1, "high": 20}, + "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0}, + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert len(model_parameters) == 15 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "decision_tree" + assert 1 <= parameter_choice["maxDepth"] <= 20 + assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 907818e29ef2385e4e4e27c8b565e2913a93c733 Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 13:52:39 -0600 Subject: [PATCH 14/21] [#167] Use isinstance instead of directly checking types This makes this code more flexible and easier to understand. It also handles a weird case where the toml library returns a subclass of dict in some situations, and built-in Python dicts in other situations. --- .../model_exploration/link_step_train_test_models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index fc9bbae..c975258 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -3,6 +3,7 @@ # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +import collections.abc import itertools import logging import math @@ -696,9 +697,12 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): if key == "type": parameter_choices[key] = value - elif type(value) == list: + # If it's a Sequence (usually list), choose one of the values at random. 
+ elif isinstance(value, collections.abc.Sequence): parameter_choices[key] = random.choice(value) - elif type(value) == dict: + # If it's a Mapping (usually dict), it defines a distribution from which + # the parameter should be sampled. + elif isinstance(value, collections.abc.Mapping): distribution = value["distribution"] low = value["low"] high = value["high"] From 65cb5ffb3ec28eb47ff8b9165132b871919bc95a Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 14:22:26 -0600 Subject: [PATCH 15/21] [#167] Pull the edge case logic for "type" out of _choose_randomized_parameters() --- .../link_step_train_test_models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index c975258..1c182ce 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -695,10 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices = dict() for key, value in model_parameters.items(): - if key == "type": - parameter_choices[key] = value # If it's a Sequence (usually list), choose one of the values at random. - elif isinstance(value, collections.abc.Sequence): + if isinstance(value, collections.abc.Sequence): parameter_choices[key] = random.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. @@ -757,7 +755,14 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return_parameters = [] for _ in range(num_samples): parameter_spec = random.choice(model_parameters) - randomized = _choose_randomized_parameters(parameter_spec) + model_type = parameter_spec["type"] + sample_parameters = dict( + (key, value) + for (key, value) in parameter_spec.items() + if key != "type" + ) + randomized = _choose_randomized_parameters(sample_parameters) + randomized["type"] = model_type return_parameters.append(randomized) return return_parameters From 1692c87452d984e48f12b51c82b24739c2360ffc Mon Sep 17 00:00:00 2001 From: rileyh Date: Wed, 27 Nov 2024 14:51:07 -0600 Subject: [PATCH 16/21] [#167] Support "pinned" parameters with model_parameter_search strategy randomized This lets users set some parameters to a particular value, and only sample others. It's mostly a convenience because previously you could get the same behavior by passing the parameter as a one-element list, like `maxDepth = [7]`. This commit introduces the extra convenience of just specifying the parameter as a value, like `maxDepth = 7`. So now you can do something like this: ``` [[training.model_parameters]] type = "random_forest" maxDepth = 7 numTrees = [1, 10, 20] subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9} ``` maxDepth will always be 7, numTrees will be randomly sampled from the list 1, 10, 20, and subsamplingRate will be sampled uniformly from the range [0.1, 0.9]. 
--- .../link_step_train_test_models.py | 7 ++-- hlink/tests/model_exploration_test.py | 34 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 1c182ce..54a3115 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -695,8 +695,8 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices = dict() for key, value in model_parameters.items(): - # If it's a Sequence (usually list), choose one of the values at random. - if isinstance(value, collections.abc.Sequence): + # If it's a Sequence (usually list) but not a string, choose one of the values at random. + if isinstance(value, collections.abc.Sequence) and not isinstance(value, str): parameter_choices[key] = random.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. @@ -711,8 +711,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, parameter_choices[key] = random.uniform(low, high) else: raise ValueError("unknown distribution") + # All other types (including strings) are passed through unchanged. else: - raise ValueError("can't handle value type") + parameter_choices[key] = value return parameter_choices diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 51f648f..33ee240 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -397,6 +397,40 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 +def test_get_model_parameters_search_strategy_randomized_take_values(training_conf): + """ + If a value is neither a list nor a table, the "randomized" strategy just passes + it along as a value. This lets the user easily pin some parameters to a particular + value and randomize others. 
+ """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 25, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": 7, + "impurity": "entropy", + "minInfoGain": 0.5, + "numTrees": {"distribution": "randint", "low": 10, "high": 100}, + "subsamplingRate": [0.5, 1.0, 1.5], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert len(model_parameters) == 25 + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "random_forest" + assert parameter_choice["maxDepth"] == 7 + assert parameter_choice["impurity"] == "entropy" + assert parameter_choice["minInfoGain"] == 0.5 + assert 10 <= parameter_choice["numTrees"] <= 100 + assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5} + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 0becd3234e69d9ceba4182fc9b776e9f492379b8 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:02:57 -0600 Subject: [PATCH 17/21] [#167] Respect training.seed when the search strategy is ""randomized" --- .../link_step_train_test_models.py | 16 ++++-- hlink/tests/model_exploration_test.py | 56 +++++++++++++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 54a3115..988ed8b 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -688,7 +688,9 @@ def _custom_param_grid_builder( return new_params -def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]: +def _choose_randomized_parameters( + rng: random.Random, model_parameters: dict[str, Any] +) -> dict[str, Any]: """ Choose a randomized setting of parameters from the given specification. """ @@ -697,7 +699,7 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, for key, value in model_parameters.items(): # If it's a Sequence (usually list) but not a string, choose one of the values at random. if isinstance(value, collections.abc.Sequence) and not isinstance(value, str): - parameter_choices[key] = random.choice(value) + parameter_choices[key] = rng.choice(value) # If it's a Mapping (usually dict), it defines a distribution from which # the parameter should be sampled. elif isinstance(value, collections.abc.Mapping): @@ -706,9 +708,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, high = value["high"] if distribution == "randint": - parameter_choices[key] = random.randint(low, high) + parameter_choices[key] = rng.randint(low, high) elif distribution == "uniform": - parameter_choices[key] = random.uniform(low, high) + parameter_choices[key] = rng.uniform(low, high) else: raise ValueError("unknown distribution") # All other types (including strings) are passed through unchanged. 
@@ -737,6 +739,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any model_parameters = training_config["model_parameters"] model_parameter_search = training_config.get("model_parameter_search") + seed = training_config.get("seed") use_param_grid = training_config.get("param_grid", False) if model_parameters == []: @@ -752,17 +755,18 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return _custom_param_grid_builder(model_parameters) elif strategy == "randomized": num_samples = model_parameter_search["num_samples"] + rng = random.Random(seed) return_parameters = [] for _ in range(num_samples): - parameter_spec = random.choice(model_parameters) + parameter_spec = rng.choice(model_parameters) model_type = parameter_spec["type"] sample_parameters = dict( (key, value) for (key, value) in parameter_spec.items() if key != "type" ) - randomized = _choose_randomized_parameters(sample_parameters) + randomized = _choose_randomized_parameters(rng, sample_parameters) randomized["type"] = model_type return_parameters.append(randomized) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 33ee240..3af04da 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -2,6 +2,7 @@ # For copyright and licensing information, see the NOTICE and LICENSE files # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +from collections import Counter import pytest import pandas as pd @@ -431,6 +432,61 @@ def test_get_model_parameters_search_strategy_randomized_take_values(training_co assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5} +def test_get_model_parameters_search_strategy_randomized_multiple_models(training_conf): + """ + When there are multiple models for the "randomized" strategy, it randomly + samples the model before sampling the parameters for that model. Setting + the training.seed attribute lets us assert more precisely the counts for + each model type. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 100, + } + training_conf["training"]["seed"] = 101 + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "minInfoGain": {"distribution": "uniform", "low": 0.1, "high": 0.9}, + }, + {"type": "probit"}, + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + counter = Counter(parameter_choice["type"] for parameter_choice in model_parameters) + assert counter["random_forest"] == 47 + assert counter["probit"] == 53 + + +def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf): + """ + The "randomized" strategy uses training.seed to allow reproducible runs. 
+ """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 5, + } + training_conf["training"]["seed"] = 35830969 + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": {"distribution": "randint", "low": 1, "high": 10}, + "numTrees": [1, 10, 100, 1000], + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + assert model_parameters == [ + {"type": "random_forest", "maxDepth": 8, "numTrees": 100}, + {"type": "random_forest", "maxDepth": 2, "numTrees": 1}, + {"type": "random_forest", "maxDepth": 4, "numTrees": 100}, + {"type": "random_forest", "maxDepth": 9, "numTrees": 10}, + {"type": "random_forest", "maxDepth": 7, "numTrees": 100}, + ] + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 5d0ea0baaa7494172f0396ddb6c78f82c78429cf Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:21:20 -0600 Subject: [PATCH 18/21] [#167] Add a normal distribution to randomized parameter search --- .../model_exploration/link_step_train_test_models.py | 10 ++++++++-- hlink/tests/model_exploration_test.py | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 988ed8b..452bcc1 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -704,13 +704,19 @@ def _choose_randomized_parameters( # the parameter should be sampled. elif isinstance(value, collections.abc.Mapping): distribution = value["distribution"] - low = value["low"] - high = value["high"] if distribution == "randint": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.randint(low, high) elif distribution == "uniform": + low = value["low"] + high = value["high"] parameter_choices[key] = rng.uniform(low, high) + elif distribution == "normal": + mean = value["mean"] + stdev = value["standard_deviation"] + parameter_choices[key] = rng.normalvariate(mean, stdev) else: raise ValueError("unknown distribution") # All other types (including strings) are passed through unchanged. diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 3af04da..8f31aaa 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -385,6 +385,11 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio "type": "decision_tree", "maxDepth": {"distribution": "randint", "low": 1, "high": 20}, "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0}, + "minWeightFractionPerNode": { + "distribution": "normal", + "mean": 10.0, + "standard_deviation": 2.5, + }, } ] @@ -396,6 +401,10 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio assert parameter_choice["type"] == "decision_tree" assert 1 <= parameter_choice["maxDepth"] <= 20 assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0 + # Technically a normal distribution can return any value, even ones very + # far from its mean. So we can't assert on the value returned here. But + # there definitely should be a value of some sort in the dictionary. 
+ assert "minWeightFractionPerNode" in parameter_choice def test_get_model_parameters_search_strategy_randomized_take_values(training_conf): From 943fc0a56a5ae0985c2056164dba67d4ca706dfe Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 11:28:45 -0600 Subject: [PATCH 19/21] [#167] Improve the "unknown distribution" error message --- .../link_step_train_test_models.py | 4 +++- hlink/tests/model_exploration_test.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 452bcc1..e700285 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -718,7 +718,9 @@ def _choose_randomized_parameters( stdev = value["standard_deviation"] parameter_choices[key] = rng.normalvariate(mean, stdev) else: - raise ValueError("unknown distribution") + raise ValueError( + f"Unknown distribution '{distribution}'. Please choose one of 'randint', 'uniform', or 'normal'." + ) # All other types (including strings) are passed through unchanged. else: parameter_choices[key] = value diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 8f31aaa..1aeef9c 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -496,6 +496,30 @@ def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf ] +def test_get_model_parameters_search_strategy_randomized_unknown_distribution( + training_conf, +): + """ + Passing a distrbution other than "uniform", "randint", or "normal" is an error. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 10, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "decision_tree", + "minInfoGain": {"distribution": "laplace", "location": 0.0, "scale": 1.0}, + } + ] + + with pytest.raises( + ValueError, + match="Unknown distribution 'laplace'. Please choose one of 'randint', 'uniform', or 'normal'.", + ): + _get_model_parameters(training_conf["training"]) + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 0f99e1b0ad2adf8058081fa29a32b74b78cb37d9 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 12:48:13 -0600 Subject: [PATCH 20/21] [#167] Don't randomize threshold or threshold_ratio Only the hyper-parameters to the model should be affected by training.model_parameter_search.strategy. thresholds and threshold_ratios should be passed through unchanged on each model. --- .../link_step_train_test_models.py | 22 ++++++++++----- hlink/tests/model_exploration_test.py | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index e700285..909309a 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -766,17 +766,25 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any rng = random.Random(seed) return_parameters = [] + # These keys are special and should not be sampled or modified. All + # other keys are hyper-parameters to the model and should be sampled. 
+ frozen_keys = {"type", "threshold", "threshold_ratio"} for _ in range(num_samples): parameter_spec = rng.choice(model_parameters) - model_type = parameter_spec["type"] - sample_parameters = dict( - (key, value) + sample_parameters = { + key: value for (key, value) in parameter_spec.items() - if key != "type" - ) + if key not in frozen_keys + } + frozen_parameters = { + key: value + for (key, value) in parameter_spec.items() + if key in frozen_keys + } + randomized = _choose_randomized_parameters(rng, sample_parameters) - randomized["type"] = model_type - return_parameters.append(randomized) + result = {**frozen_parameters, **randomized} + return_parameters.append(result) return return_parameters else: diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index 1aeef9c..b58bfd1 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -520,6 +520,34 @@ def test_get_model_parameters_search_strategy_randomized_unknown_distribution( _get_model_parameters(training_conf["training"]) +def test_get_model_parameters_search_strategy_randomized_thresholds(training_conf): + """ + Even when the model parameters are selected with strategy "randomized", the + thresholds are still treated with a "grid" strategy. + _get_model_parameters() is not in charge of creating the threshold matrix, + so it passes the threshold and threshold_ratio through unchanged. + """ + training_conf["training"]["model_parameter_search"] = { + "strategy": "randomized", + "num_samples": 25, + } + training_conf["training"]["model_parameters"] = [ + { + "type": "random_forest", + "maxDepth": [1, 10, 100], + "threshold": [0.3, 0.5, 0.7, 0.8, 0.9], + "threshold_ratio": 1.2, + } + ] + + model_parameters = _get_model_parameters(training_conf["training"]) + + for parameter_choice in model_parameters: + assert parameter_choice["type"] == "random_forest" + assert parameter_choice["threshold"] == [0.3, 0.5, 0.7, 0.8, 0.9] + assert parameter_choice["threshold_ratio"] == 1.2 + + # ------------------------------------- # Tests that probably should be moved # ------------------------------------- From 7fed016326d2017d7b4f4ab3e4aea643ebac2626 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 2 Dec 2024 15:01:20 -0600 Subject: [PATCH 21/21] [#167] Add a test for the unknown strategy error condition --- .../link_step_train_test_models.py | 5 ++++- hlink/tests/model_exploration_test.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 909309a..cb3801e 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -788,7 +788,10 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any return return_parameters else: - raise ValueError(f"Unknown model_parameter_search strategy '{strategy}'") + raise ValueError( + f"Unknown model_parameter_search strategy '{strategy}'. " + "Please choose one of 'explicit', 'grid', or 'randomized'." 
+ ) elif use_param_grid: return _custom_param_grid_builder(model_parameters) diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py index b58bfd1..a438995 100644 --- a/hlink/tests/model_exploration_test.py +++ b/hlink/tests/model_exploration_test.py @@ -548,6 +548,20 @@ def test_get_model_parameters_search_strategy_randomized_thresholds(training_con assert parameter_choice["threshold_ratio"] == 1.2 +def test_get_model_parameters_unknown_search_strategy(training_conf): + training_conf["training"]["model_parameter_search"] = { + "strategy": "something", + } + training_conf["training"]["model_parameters"] = [{"type": "probit"}] + + with pytest.raises( + ValueError, + match="Unknown model_parameter_search strategy 'something'. " + "Please choose one of 'explicit', 'grid', or 'randomized'.", + ): + _parameters = _get_model_parameters(training_conf["training"]) + + # ------------------------------------- # Tests that probably should be moved # -------------------------------------
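
For reference, the training.model_parameter_search setting added by this series could be configured roughly as in the sketch below. This is an illustrative example only, assembled from the commit messages and tests above; the strategies, distributions, and parameter names shown are the ones exercised in the patches, and the specific values are made up for illustration.

```
[training]
# Sample 20 hyper-parameter combinations at random. The other accepted
# strategies are "grid" and "explicit".
model_parameter_search = {strategy = "randomized", num_samples = 20}
# Used by the "randomized" strategy so that sampling is reproducible.
seed = 2024

[[training.model_parameters]]
type = "random_forest"
# Dict values define a distribution to sample from: "randint", "uniform", or "normal".
maxDepth = {distribution = "randint", low = 1, high = 20}
subsamplingRate = {distribution = "uniform", low = 0.1, high = 0.9}
# List values are sampled by choosing one element at random.
numTrees = [25, 50, 100]
# threshold and threshold_ratio are not sampled; they pass through unchanged.
threshold = 0.8
threshold_ratio = 1.3
```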