Move the test that checks init values are set correctly from Java to Python.…

… I was not able to find a good combination of initial betas/ubetas and t matrix to make it work.
h2oai · Oct 28, 2024 · b708d3c · b708d3c
1 parent c36c312
commit b708d3c
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 54 deletions.
diff --git a/h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java b/h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java
@@ -517,8 +517,7 @@ public double[] grabRow2Arrays(String[] enumPredNames, String[] numPredNames, bo
     }
 
     if (hasIntercept)
-      rowValues.add(1.0);
-
+      rowValues.add(1.0);    
     return rowValues.stream().mapToDouble(Double::doubleValue).toArray();
   }
 
@@ -551,58 +550,6 @@ public void testMatVecFormation() {
       Scope.exit();
     }
   }
-
-  /*
-   * We will try to set initial values for fixed coefficients, random coefficients, T matrix, and sigma values and make
-   * sure they are set correctly
-   */
-  @Test
-  public void testSetInitBetasTvar() {
-    Scope.enter();
-    try {
-      Frame trainingFrame = parseAndTrackTestFile("smalldata/hglm_test/gaussian_0GC_123R_all5Numeric_p2noise_p08T_woIntercept_standardize.gz");
-      DKV.put(trainingFrame);
-      double[] initBetas = new double[]{1.556019455245578, 0.004387668926465224, -0.012162334935702198, 
-              0.007090905470190553, 0.6642825159614225, -0.6565480160986089};
-      Frame ubetaFrame = new TestFrameBuilder()
-              .withColNames("C1", "C2", "C3")
-              .withVecTypes(T_NUM, T_NUM, T_NUM)
-              .withDataForCol(0, new double[]{-0.939297854112798, -1.3831760587493842, -0.550911700979278, 
-                      -0.7520414750585163, -0.7695703224787936, -1.285064572001838, -1.2685471084776616, -1.1851850464065945})
-              .withDataForCol(1, new double[]{0.6201893456700508, 0.23799145042381686, 0.249791875918212, 
-                      0.37804734995822303, 0.5427148638497548, 0.3811004571313795, 0.40211187674359433, 0.48968036339542964})
-              .withDataForCol(2, new double[]{0.8393873836611673, 0.8284318083138673, 0.8027667198768048, 
-                      0.8479753981593126, 0.8338039588079964, 0.8536416345174341, 0.8304699206057965, 0.8415980227739538})
-              .build();
-      Scope.track(ubetaFrame);
-      Frame tMat = new TestFrameBuilder()
-              .withColNames("C1","C2","C3")
-              .withVecTypes(T_NUM, T_NUM, T_NUM)
-              .withDataForCol(0, new double[]{1.1156288418645166, -0.4164395852593521, -0.850487353031379})
-              .withDataForCol(1, new double[]{-0.4164395852593521, 0.1863522738575454, 0.3453469357885868})
-              .withDataForCol(2, new double[]{-0.850487353031379, 0.3453469357885868, 0.6974072622840342})
-              .build();
-      Scope.track(tMat);
-      Scope.track(trainingFrame);
-      double sigmaEpsilon = 0.09847638;
-      HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters();
-      params._train = trainingFrame._key;
-      params._response_column = "response";
-      params._group_column = "C1";
-      params._random_intercept = false;
-      params._random_columns = new String[]{"C10", "C2", "C3"};
-      params._initial_fixed_effects = initBetas;
-      params._initial_t_matrix = tMat._key;
-      params._initial_random_effects = ubetaFrame._key;
-      params._tau_e_var_init = sigmaEpsilon;
-      params._max_iterations = 0;
-      HGLMModel model = new HGLM(params).trainModel().get();
-      Scope.track_generic(model);
-      checkCorrectInitValue(model, initBetas, ubetaFrame, tMat, sigmaEpsilon);
-    } finally {
-      Scope.exit();
-    }
-  }
 
   public void checkCorrectInitValue(HGLMModel model, double[] initBetas, Frame ubetaFrame, Frame tMat, double sigmaEpsilon) {
     // check fixed coefficient initialization

diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py
@@ -0,0 +1,71 @@
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.hglm import H2OHGLMEstimator as hglm
+from tests.pyunit_utils import utils_for_glm_hglm_tests
+
+# in this test, with random_intercept = False and max_iterations = 0, want to check that the model returns
+# the initial values it was given for:
+# 1. fixed effect coefficients (initial_fixed_effects)
+# 2. random effect coefficients (initial_random_effects)
+# 3. T matrix (initial_t_matrix)
+def test_scoring_history_model_summary():
+    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_all5Numeric_p2noise_p08T_woIntercept_standardize.gz"))
+    beta = [1.5606284972932365, -0.0002347762275008978, -0.007899880335654788, 0.0018421903682971376, 
+            0.6654323495890934, -0.6544609203736372] 
+    # ubeta = [[-0.939297854112798, -1.3831760587493842, -0.550911700979278, -0.7520414750585163, -0.7695703224787936,
+    #           -1.285064572001838, -1.2685471084776616, -1.1851850464065945], 
+    #          [0.6201893456700508, 0.23799145042381686, 0.249791875918212, 0.37804734995822303, 0.5427148638497548, 
+    #           0.3811004571313795, 0.40211187674359433, 0.48968036339542964],
+    #          [0.8393873836611673, 0.8284318083138673, 0.8027667198768048, 0.8479753981593126, 0.8338039588079964,
+    #           0.8536416345174341, 0.8304699206057965, 0.8415980227739538]]
+    ubeta = [[-0.9319187693195115, 0.6070501821727673, 0.8394540491750797],
+             [-1.3823145230494698, 0.21486874352840676, 0.8366860141888742],
+             [-0.552534049777237, 0.24577758770128783, 0.8172622402154629],
+             [-0.7632283839126288, 0.3662979940622124, 0.8382611342477616],
+             [-0.7660574987463035, 0.5278044590884986, 0.8421686869476276],
+             [-1.2704526364630178, 0.3882261064670864, 0.8626801006264753],
+             [-1.2615857701992563, 0.39167873788423885, 0.8448421359246485],
+             [-1.1863349889243804, 0.4802231651611951, 0.852783164270973]]
+    ubeta_init = h2o.H2OFrame(ubeta)
+    t_mat = [[1.1086713375915982, -0.40493787563311834, -0.8561132576680854],
+             [-0.40493787563311834, 0.17812207973788066, 0.33964543424526844],
+             [-0.8561132576680854, 0.33964543424526844, 0.709024192121366]]
+    t_mat_init = h2o.H2OFrame(t_mat)
+    y = "response"
+    x = h2o_data.names
+    x.remove("response")
+    x.remove("C1")
+    random_columns = ["C2", "C3", "C4"]
+    # hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
+    #                   max_iterations = 20, random_intercept = False)
+    hglm_model = hglm(random_columns = random_columns, group_column = "C1", seed = 12345, max_iterations = 0,
+                      random_intercept = False, initial_fixed_effects = beta, initial_random_effects = ubeta_init,
+                      initial_t_matrix = t_mat_init)
+    hglm_model.train(x=x, y=y, training_frame=h2o_data)
+    # check that the fixed effect coeffs, random effect coeffs and matrix T from the model equal the
+    # original initial values, since we set max_iterations = 0
+    beta_model = hglm_model.coef()
+    # compare initial beta
+    for index in range(4):
+        assert abs(beta[index]-beta_model[x[index]]) < 1e-6, \
+            "fixed coefficients for {0} from model: {1}, from initialization: {2} should be the same but is " \
+            "not.".format(x[index], beta_model[x[index]], beta[index])
+    ubeta_model = hglm_model.coefs_random()
+    level_2_names = hglm_model.level_2_names()
+    for index in range(len(level_2_names)):
+        pyunit_utils.equal_two_arrays(ubeta[index], ubeta_model[level_2_names[index]])
+    t_mat_model = hglm_model.matrix_T()
+    for index in range(len(t_mat_model)):
+        pyunit_utils.equal_two_arrays(t_mat[index], t_mat_model[index])
+
+
+
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_scoring_history_model_summary)
+else:
+    test_scoring_history_model_summary()