Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Randomized Parameter Search #168

Merged
merged 24 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c5f5b13
[#167] Pull _custom_param_grid_builder() out of the LinkStepTrainTest…
riley-harper Nov 26, 2024
605369b
[#167] Simplify the interface to _custom_param_grid_builder()
riley-harper Nov 26, 2024
2204152
[#167] Pull _get_model_parameters() out of the LinkStep class
riley-harper Nov 26, 2024
7d48380
[#167] Add a few tests for _get_model_parameters()
riley-harper Nov 26, 2024
bc0bf7d
[#167] Just pass the training section of the config to _get_model_par…
riley-harper Nov 26, 2024
8be8806
[#167] Add a couple of tests for the new training.model_parameter_sea…
riley-harper Nov 26, 2024
a939ec2
[#167] Look for training.model_parameter_search in _get_model_paramet…
riley-harper Nov 26, 2024
801582e
[#167] Make sure that model_parameter_search takes precedence over pa…
riley-harper Nov 26, 2024
a476884
[#167] Print a deprecation warning for training.param_grid
riley-harper Nov 27, 2024
8c72446
[#167] Refactor _get_model_parameters()
riley-harper Nov 27, 2024
896ad67
[#167] Improve an error condition in _get_model_parameters()
riley-harper Nov 27, 2024
46da4cb
[#167] Start supporting a randomized strategy which can randomly samp…
riley-harper Nov 27, 2024
51b4144
[#167] Support some simple distributions for randomized parameter search
riley-harper Nov 27, 2024
907818e
[#167] Use isinstance instead of directly checking types
riley-harper Nov 27, 2024
65cb5ff
[#167] Pull the edge case logic for "type" out of _choose_randomized_…
riley-harper Nov 27, 2024
1692c87
[#167] Support "pinned" parameters with model_parameter_search strate…
riley-harper Nov 27, 2024
0becd32
[#167] Respect training.seed when the search strategy is "randomized"
riley-harper Dec 2, 2024
5d0ea0b
[#167] Add a normal distribution to randomized parameter search
riley-harper Dec 2, 2024
943fc0a
[#167] Improve the "unknown distribution" error message
riley-harper Dec 2, 2024
0f99e1b
[#167] Don't randomize threshold or threshold_ratio
riley-harper Dec 2, 2024
7fed016
[#167] Add a test for the unknown strategy error condition
riley-harper Dec 2, 2024
0f5deb6
Merge branch 'main' into randomized_parameter_search
riley-harper Dec 3, 2024
c6d3a81
Merge branch 'main' into randomized_parameter_search
riley-harper Dec 3, 2024
73e6adc
Merge branch 'v4-dev' into randomized_parameter_search
riley-harper Dec 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 148 additions & 43 deletions hlink/linking/model_exploration/link_step_train_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

import statistics
import collections.abc
import itertools
import logging
import math
import random
import re
import statistics
import sys
from textwrap import dedent
from time import perf_counter
from dataclasses import dataclass
from typing import Any
Expand Down Expand Up @@ -492,7 +496,7 @@ def _run(self) -> None:
)
# Explode params into all the combinations we want to test with the current model.
# This may use a grid search or a random search or exactly the parameters in the config.
model_parameters = self._get_model_parameters(config)
model_parameters = _get_model_parameters(training_settings)

outer_training_data = self._combine_folds(
outer_folds, ignore=test_data_index
Expand Down Expand Up @@ -632,35 +636,6 @@ def _get_splits(
)
return splits

def _custom_param_grid_builder(self, conf: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Expand the configured model parameters into every grid combination.

    The specifications are read from conf[<self.task.training_conf>]["model_parameters"].
    For each specification, every hyper-parameter's candidate values are
    crossed with every other's, producing one dictionary per combination.

    "type", "threshold", and "threshold_ratio" are not part of the grid: they
    are copied unchanged onto each combination. The two threshold keys are
    copied whenever they are present in the specification, even if their value
    is falsy (for example 0), and omitted only when the key is absent.

    A hyper-parameter whose value is a string or is not iterable is treated as
    a single pinned value instead of being iterated.

    Raises KeyError if a specification has no "type" key.
    """
    print("Building param grid for models")
    given_parameters = conf[f"{self.task.training_conf}"]["model_parameters"]
    # Settings that apply to the run as a whole rather than to the grid.
    passthrough_keys = ("threshold", "threshold_ratio")

    new_params = []
    for run in given_parameters:
        # Work on a copy so that the caller's configuration is left untouched.
        params = run.copy()
        model_type = params.pop("type")

        # dropping thresholds to prep for scikitlearn model exploration refactor
        # Check for presence, not truthiness, so an explicit 0 survives.
        passthrough = {
            key: params.pop(key) for key in passthrough_keys if key in params
        }

        keys = list(params)
        # Wrap strings and scalars so that itertools.product() does not split
        # a string into characters or fail on a non-iterable value.
        value_lists = [
            [value]
            if isinstance(value, str)
            or not isinstance(value, collections.abc.Iterable)
            else value
            for value in params.values()
        ]

        for combination in itertools.product(*value_lists):
            exploded = dict(zip(keys, combination))
            exploded["type"] = model_type
            exploded.update(passthrough)
            new_params.append(exploded)
    return new_params

def _capture_results(
self,
predictions: pyspark.sql.DataFrame,
Expand Down Expand Up @@ -721,18 +696,6 @@ def _capture_results(
)
return pd.concat([results_df, new_results], ignore_index=True)

def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Return the list of model parameter settings to evaluate.

    The training section of the config is looked up under the name given by
    self.task.training_conf. When that section has a truthy "param_grid", the
    result of self._custom_param_grid_builder(conf) is returned. Otherwise the
    section's "model_parameters" list is returned as-is.

    Raises ValueError if "param_grid" is not enabled and "model_parameters" is
    an empty list.
    """
    training_section = conf[str(self.task.training_conf)]
    explicit_parameters = training_section["model_parameters"]

    # A missing and a falsy "param_grid" are treated the same way.
    if training_section.get("param_grid"):
        return self._custom_param_grid_builder(conf)

    if explicit_parameters == []:
        raise ValueError(
            "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'."
        )

    return explicit_parameters

def _save_training_results(
self, desc_df: pd.DataFrame, spark: pyspark.sql.SparkSession
) -> None:
Expand Down Expand Up @@ -1119,3 +1082,145 @@ def _create_thresholded_metrics_df() -> pd.DataFrame:
"mcc_train_sd",
]
)


def _custom_param_grid_builder(
    model_parameters: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """
    Expand each model parameter specification into its full grid of settings.

    For every specification in model_parameters, the candidate values of each
    hyper-parameter are crossed with the candidate values of all of the
    others. One dictionary is produced per combination, and the results for
    all specifications are concatenated in order.

    "type", "threshold", and "threshold_ratio" are not hyper-parameters and
    are never crossed. They are copied unchanged onto every combination that
    comes from their specification. The threshold keys are copied whenever
    they are present, including when their value is falsy (such as 0), and
    are left out only when the specification does not contain them.

    A hyper-parameter whose value is a string or is not iterable is treated as
    one pinned value rather than being iterated.

    Raises KeyError if a specification does not have a "type" key.
    """
    print("Building param grid for models")
    # Run-level settings: copied to each combination, never part of the grid.
    run_level_keys = ("threshold", "threshold_ratio")

    new_params = []
    for run in model_parameters:
        # Copy first so that popping keys does not modify the caller's data.
        params = run.copy()
        model_type = params.pop("type")

        # dropping thresholds to prep for scikitlearn model exploration refactor
        # Presence is tested instead of truthiness so that an explicitly
        # configured 0 or 0.0 is not silently discarded.
        run_level = {key: params.pop(key) for key in run_level_keys if key in params}

        keys = list(params)
        # itertools.product() would iterate a string character by character
        # and raise TypeError on a scalar, so wrap both as single candidates.
        candidates = [
            [value]
            if isinstance(value, str)
            or not isinstance(value, collections.abc.Iterable)
            else value
            for value in params.values()
        ]

        for combination in itertools.product(*candidates):
            setting = dict(zip(keys, combination))
            setting["type"] = model_type
            setting.update(run_level)
            new_params.append(setting)
    return new_params


def _choose_randomized_parameters(
    rng: random.Random, model_parameters: dict[str, Any]
) -> dict[str, Any]:
    """
    Draw one concrete value for each parameter in the given specification.

    Each value in model_parameters is interpreted by its type:

    - A non-string Sequence (usually a list): one element is picked with
      rng.choice().
    - A Mapping (usually a dict): it describes a distribution, selected by its
      "distribution" key. "randint" and "uniform" read "low" and "high";
      "normal" reads "mean" and "standard_deviation".
    - Anything else, including strings: the value is used unchanged.

    Returns a new dictionary with the same keys as model_parameters.

    Raises ValueError if a Mapping names an unknown distribution.
    """
    chosen: dict[str, Any] = {}

    for name, spec in model_parameters.items():
        # Strings are Sequences too, but they are single values here.
        is_text = isinstance(spec, str)
        if isinstance(spec, collections.abc.Sequence) and not is_text:
            chosen[name] = rng.choice(spec)
            continue

        # Neither a list of options nor a distribution: pass it through.
        if not isinstance(spec, collections.abc.Mapping):
            chosen[name] = spec
            continue

        kind = spec["distribution"]
        if kind == "randint":
            lower, upper = spec["low"], spec["high"]
            sampled = rng.randint(lower, upper)
        elif kind == "uniform":
            lower, upper = spec["low"], spec["high"]
            sampled = rng.uniform(lower, upper)
        elif kind == "normal":
            center, spread = spec["mean"], spec["standard_deviation"]
            sampled = rng.normalvariate(center, spread)
        else:
            raise ValueError(
                f"Unknown distribution '{kind}'. Please choose one of 'randint', 'uniform', or 'normal'."
            )
        chosen[name] = sampled

    return chosen


def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Build the list of model parameter settings described by the training config.

    training_settings is the training section of the config. Its
    "model_parameters" list is interpreted according to
    "model_parameter_search", whose "strategy" may be:

    - "explicit": return "model_parameters" exactly as given.
    - "grid": return the result of _custom_param_grid_builder().
    - "randomized": build "num_samples" settings. For each one, a
      specification is picked at random from "model_parameters" and its
      hyper-parameters are sampled with _choose_randomized_parameters(). The
      keys "type", "threshold", and "threshold_ratio" are copied through
      without sampling. The random number generator is seeded with the
      "seed" setting, if there is one.

    When "model_parameter_search" is absent, the deprecated "param_grid" flag
    decides between the grid behavior (truthy) and the explicit behavior.
    If both are present, "model_parameter_search" wins. A deprecation warning
    is printed to stderr whenever "param_grid" appears in the settings.

    Raises ValueError if "model_parameters" is an empty list or if the
    strategy is not recognized.
    """
    if "param_grid" in training_settings:
        deprecation_message = dedent(
            """\
            Deprecation Warning: training.param_grid is deprecated.

            Please use training.model_parameter_search instead by replacing

            `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or
            `param_grid = False` with `model_parameter_search = {strategy = "explicit"}`

            [deprecated_in_version=4.0.0]"""
        )
        print(deprecation_message, file=sys.stderr)

    model_parameters = training_settings["model_parameters"]
    if model_parameters == []:
        raise ValueError(
            "model_parameters is empty, so there are no models to evaluate"
        )

    search_settings = training_settings.get("model_parameter_search")

    # Legacy path: only the deprecated param_grid flag is available.
    if search_settings is None:
        if training_settings.get("param_grid", False):
            return _custom_param_grid_builder(model_parameters)
        return model_parameters

    strategy = search_settings["strategy"]

    if strategy == "explicit":
        return model_parameters

    if strategy == "grid":
        return _custom_param_grid_builder(model_parameters)

    if strategy != "randomized":
        raise ValueError(
            f"Unknown model_parameter_search strategy '{strategy}'. "
            "Please choose one of 'explicit', 'grid', or 'randomized'."
        )

    sample_count = search_settings["num_samples"]
    rng = random.Random(training_settings.get("seed"))

    # These keys are special and should not be sampled or modified. All
    # other keys are hyper-parameters to the model and should be sampled.
    pinned_keys = {"type", "threshold", "threshold_ratio"}

    sampled_settings = []
    for _ in range(sample_count):
        chosen_spec = rng.choice(model_parameters)

        pinned: dict[str, Any] = {}
        to_sample: dict[str, Any] = {}
        for key, value in chosen_spec.items():
            destination = pinned if key in pinned_keys else to_sample
            destination[key] = value

        # Pinned entries go first; sampled hyper-parameters are layered on top.
        setting = dict(pinned)
        setting.update(_choose_randomized_parameters(rng, to_sample))
        sampled_settings.append(setting)

    return sampled_settings
Loading
Loading