minor addition of utility function for Python client.

h2oai · Oct 14, 2024 · 60ecdae · 60ecdae
1 parent c10d94f
commit 60ecdae
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 8 deletions.
diff --git a/h2o-bindings/bin/custom/python/gen_hglm.py b/h2o-bindings/bin/custom/python/gen_hglm.py
@@ -5,6 +5,12 @@ def update_param(name, param):
     return None
 
 def class_extensions():
+    def level2_names(self):
+        """
+        Return the level 2 names, read from the "group_column_names" entry of the model output.
+        """
+        model_output = self._model_json["output"]
+        return model_output["group_column_names"]
+
     def coefs_random_names(self):
         """
         Get the random effect coefficient names including the intercept if applicable.

diff --git a/h2o-py/h2o/estimators/hglm.py b/h2o-py/h2o/estimators/hglm.py
@@ -628,6 +628,12 @@ def gen_syn_data(self, gen_syn_data):
         self._parms["gen_syn_data"] = gen_syn_data
 
 
+    def level2_names(self):
+        """
+        Return the level 2 names, read from the "group_column_names" entry of the model output.
+        """
+        model_output = self._model_json["output"]
+        return model_output["group_column_names"]
+
     def coefs_random_names(self):
         """
         Get the random effect coefficient names including the intercept if applicable.

diff --git a/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py b/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py
@@ -146,6 +146,34 @@ def find_model_iterations(glm_model):
     iteration_index = glm_model._model_json["output"]["model_summary"].col_header.index("number_of_iterations")
     return cell_values[lengths-1][iteration_index]
 
+def normalize_random_coefs(random_coefs, numerical_cols, training_frame, num_level2):
+    """
+    Given a random effect coefficients dict, this method will standardize/normalize the coefficients.  The
+    coefficients of one level 2 index are extracted at a time, passed to normalize_coefs, and the results are
+    gathered back into one list per coefficient name.
+
+    :param random_coefs: python dict with random column names and a list of random coefficients for each level 2 index
+    :param numerical_cols: numerical columns of the frame
+    :param training_frame: h2o frame used to build the model
+    :param num_level2: number of level 2 indices to process; every list in random_coefs is read at positions
+        0 to num_level2-1
+    :return: python dict with random columns names and a list of normalized/standardized random coefficients
+    """
+    normalized_coefs = dict()
+    # extract random coefficients for each level 2 value
+    for ind2 in range(num_level2):
+        # extract dict for one level 2 value
+        level2_coefs = extractCoeffDic(random_coefs, ind2)
+        normalized_one_coefs = normalize_coefs(level2_coefs, numerical_cols, training_frame)
+        # collect by the names normalize_coefs returns, appending one entry per level 2 index
+        for cname, cvalue in normalized_one_coefs.items():
+            normalized_coefs.setdefault(cname, []).append(cvalue)
+    return normalized_coefs
+
+
+def extractCoeffDic(random_coeffs, ind2):
+    """
+    Build a dict holding, for every coefficient name, the single coefficient found at position ind2.
+
+    :param random_coeffs: python dict mapping coefficient names to indexable collections of coefficients
+    :param ind2: position to read from each collection
+    :return: python dict mapping each coefficient name to its coefficient at position ind2
+    """
+    return {coef_name: random_coeffs[coef_name][ind2] for coef_name in random_coeffs}
+
+
+
 def normalize_coefs(coefs, numerical_cols, training_frame):
     """
     Given a coefficient as a dict, the method will normalize/standardize the given coefficients and return it in another
@@ -161,10 +189,11 @@ def normalize_coefs(coefs, numerical_cols, training_frame):
     normalized_coefs = coefs.copy()
     # only numerical coefficients are changed.
     for cname in numerical_cols:
-        cmean = training_frame[cname].mean()[0,0]
-        csigma = training_frame[cname].sd()[0]
-        normalized_coefs[cname] = coefs[cname] * csigma
-        intercept_adjust = intercept_adjust + normalized_coefs[cname]*cmean/csigma
+        if cname in all_coefs_names:
+            cmean = training_frame[cname].mean()[0,0]
+            csigma = training_frame[cname].sd()[0]
+            normalized_coefs[cname] = coefs[cname] * csigma
+            intercept_adjust = intercept_adjust + normalized_coefs[cname]*cmean/csigma
     if "intercept" in all_coefs_names:
         normalized_coefs["intercept"] = coefs["intercept"]+intercept_adjust
     else:
@@ -176,10 +205,11 @@ def denormalize_coefs(coefs_normalized, numerical_cols, training_frame):
     all_coefs_names = coefs_normalized.keys()
     denormalize_coefs = coefs_normalized.copy()
     for cname in numerical_cols:
-        cmean = training_frame[cname].mean()[0,0]
-        csigma = training_frame[cname].sd()[0]
-        denormalize_coefs[cname] = coefs_normalized[cname] / csigma
-        intercept_adjust = intercept_adjust - cmean * coefs_normalized[cname] / csigma
+        if cname in all_coefs_names:
+            cmean = training_frame[cname].mean()[0,0]
+            csigma = training_frame[cname].sd()[0]
+            denormalize_coefs[cname] = coefs_normalized[cname] / csigma
+            intercept_adjust = intercept_adjust - cmean * coefs_normalized[cname] / csigma
 
     if "intercept" in all_coefs_names:
         denormalize_coefs["intercept"] = denormalize_coefs["intercept"] + intercept_adjust

diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_scoring_history_summary.py
@@ -31,6 +31,9 @@ def test_scoring_history_model_summary():
     coef_norm = hglm_model.coef_norm()
     coef_names = hglm_model.coef_names()
     coef_random = hglm_model.coefs_random()
+    coef_random_names = hglm_model.coefs_random_names()
+    coef_random_norm = hglm_model.coefs_random_norm()
+    coef_random_names_norm = hglm_model.coefs_random_names_norm()
     t_mat = hglm_model.matrix_T()
     residual_var = hglm_model.residual_variance()
     mse = hglm_model.mse()
@@ -51,6 +54,7 @@ def test_scoring_history_model_summary():
     pyunit_utils.assertCoefDictEqual(coef_norm, coef_norm_manually, 1e-6)
     coef_manually = utils_for_glm_hglm_tests.denormalize_coefs(coef_norm, numerical_columns, train)
     pyunit_utils.assertCoefDictEqual(coef, coef_manually, 1e-6)
+    # check random effect coefficients and normalized random effect coefficients are converted correctly.
     print("Done")
 
 if __name__ == "__main__":