From 5d0ea0baaa7494172f0396ddb6c78f82c78429cf Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Mon, 2 Dec 2024 11:21:20 -0600
Subject: [PATCH] [#167] Add a normal distribution to randomized parameter
 search

---
 .../model_exploration/link_step_train_test_models.py   | 10 ++++++++--
 hlink/tests/model_exploration_test.py                  |  9 +++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 988ed8b..452bcc1 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -704,13 +704,19 @@ def _choose_randomized_parameters(
         # the parameter should be sampled.
         elif isinstance(value, collections.abc.Mapping):
             distribution = value["distribution"]
-            low = value["low"]
-            high = value["high"]
 
             if distribution == "randint":
+                low = value["low"]
+                high = value["high"]
                 parameter_choices[key] = rng.randint(low, high)
             elif distribution == "uniform":
+                low = value["low"]
+                high = value["high"]
                 parameter_choices[key] = rng.uniform(low, high)
+            elif distribution == "normal":
+                mean = value["mean"]
+                stdev = value["standard_deviation"]
+                parameter_choices[key] = rng.normalvariate(mean, stdev)
             else:
                 raise ValueError("unknown distribution")
         # All other types (including strings) are passed through unchanged.
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 3af04da..8f31aaa 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -385,6 +385,11 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
             "type": "decision_tree",
             "maxDepth": {"distribution": "randint", "low": 1, "high": 20},
             "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 100.0},
+            "minWeightFractionPerNode": {
+                "distribution": "normal",
+                "mean": 10.0,
+                "standard_deviation": 2.5,
+            },
         }
     ]
 
@@ -396,6 +401,10 @@ def test_get_model_parameters_search_strategy_randomized_sample_from_distributio
         assert parameter_choice["type"] == "decision_tree"
         assert 1 <= parameter_choice["maxDepth"] <= 20
         assert 0.0 <= parameter_choice["minInfoGain"] <= 100.0
+        # Technically a normal distribution can return any value, even ones very
+        # far from its mean. So we can't assert on the value returned here. But
+        # there definitely should be a value of some sort in the dictionary.
+        assert "minWeightFractionPerNode" in parameter_choice
 
 
 def test_get_model_parameters_search_strategy_randomized_take_values(training_conf):