diff --git a/h2o-docs/src/product/data-science.rst b/h2o-docs/src/product/data-science.rst index bf6fef75163f..b7b16ff0403e 100644 --- a/h2o-docs/src/product/data-science.rst +++ b/h2o-docs/src/product/data-science.rst @@ -42,6 +42,7 @@ H2O-3 supports the following supervised algorithms: data-science/model_selection data-science/gam data-science/anova_glm + data-science/hglm data-science/gbm data-science/naive-bayes data-science/rulefit diff --git a/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst b/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst index 5505d6d81bb7..996e94ad2eb4 100644 --- a/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst +++ b/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst @@ -3,7 +3,7 @@ ``custom_metric_func`` ---------------------- -- Available in: GBM, GLM, DRF, Deeplearning, Stacked Ensembles, XGBoost +- Available in: GBM, GLM, HGLM, DRF, Deeplearning, Stacked Ensembles, XGBoost - Hyperparameter: no .. note:: diff --git a/h2o-docs/src/product/data-science/algo-params/family.rst b/h2o-docs/src/product/data-science/algo-params/family.rst index 29a5b28c60ba..91e6572f52a5 100644 --- a/h2o-docs/src/product/data-science/algo-params/family.rst +++ b/h2o-docs/src/product/data-science/algo-params/family.rst @@ -1,7 +1,7 @@ ``family`` ---------- -- Available in: GLM, GAM +- Available in: GLM, GAM, HGLM - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/hglm.rst b/h2o-docs/src/product/data-science/algo-params/hglm.rst deleted file mode 100644 index 24cbf686b56e..000000000000 --- a/h2o-docs/src/product/data-science/algo-params/hglm.rst +++ /dev/null @@ -1,81 +0,0 @@ -``HGLM`` --------- - -- Available in: GLM -- Hyperparameter: no - -Description -~~~~~~~~~~~ - -Generalized Linear Models (GLM) estimate regression models for outcomes following exponential distributions. 
Hierarchical GLM (HGLM) fits generalized linear models with random effects, where the random effect can come from a conjugate exponential-family distribution (for example, Gaussian). HGLM allows you to specify both fixed and random effects, which allows fitting correlated to random effects as well as random regression models. - -HGLM produces estimates for fixed effects, random effects, variance components and their standard errors. It also produces diagnostics, such as variances and leverages. - -The ``HGLM`` option allows you to build a hierarchical generalized linear model. This option is disabled by default. - -**Note**: This initial release of HGLM supports only Gaussian for ``family`` and ``rand_family``. - -Related Parameters -~~~~~~~~~~~~~~~~~~ - -- `random_columns `__ -- `rand_family `__ - -Example -~~~~~~~ - -.. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Import the semiconductor dataset - h2odata <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/semiconductor.csv") - - # Set the response, predictor, and random columns - yresp <- "y" - xlist <- c("x1", "x3", "x5", "x6") - z <- c(1) - - # Convert the "Device" column to a factor - h2odata$Device <- h2o.asfactor(h2odata$Device) - - # Train and view the model - m11H2O <- h2o.glm(x = xlist, - y = yresp, - family = "gaussian", - rand_family = c("gaussian"), - rand_link = c("identity"), - training_frame = h2odata, - HGLM = TRUE, - random_columns = z, - calc_like = TRUE) - print(m11H2O) - - .. 
code-tab:: python - - import h2o - from h2o.estimators.glm import H2OGeneralizedLinearEstimator - h2o.init() - - # Import the semiconductor dataset - h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/semiconductor.csv") - - # Set the response, predictor, and random columns - y = "y" - x = ["x1","x3","x5","x6"] - z = 0 - - # Convert the "Device" column to a factor - h2o_data["Device"] = h2o_data["Device"].asfactor() - - # Train and view the model - h2o_glm = H2OGeneralizedLinearEstimator(HGLM=True, - family="gaussian", - rand_family=["gaussian"], - random_columns=[z], - rand_link=["identity"], - calc_like=True) - h2o_glm.train(x=x, y=y, training_frame=h2o_data) - print(h2o_glm) diff --git a/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst b/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst index bb1f3865a9a9..118ed290d864 100644 --- a/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst +++ b/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst @@ -1,7 +1,7 @@ ``ignore_const_cols`` --------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst b/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst index 3bdd8ed63e64..7f2ac060544d 100644 --- a/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst +++ b/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst @@ -1,7 +1,7 @@ ``ignored_columns`` ------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, 
GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/max_iterations.rst b/h2o-docs/src/product/data-science/algo-params/max_iterations.rst index 1d15c6240ade..d791368f1798 100644 --- a/h2o-docs/src/product/data-science/algo-params/max_iterations.rst +++ b/h2o-docs/src/product/data-science/algo-params/max_iterations.rst @@ -1,7 +1,7 @@ ``max_iterations`` ------------------ -- Available in: GLM, GAM, PCA, GLRM, K-Means, CoxPH +- Available in: GLM, GAM, HGLM, PCA, GLRM, K-Means, CoxPH, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst b/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst index 40803a1e173e..0f7273c0a575 100644 --- a/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst +++ b/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst @@ -3,7 +3,7 @@ ``max_runtime_secs`` ----------------------- -- Available in: GBM, DRF, Deep Learning, GLM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Word2vec, Isolation Forest, Stacked Ensembles, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Word2vec, Isolation Forest, Stacked Ensembles, Uplift DRF, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst b/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst index 4785d6cad8a8..01cb03a8482a 100644 --- a/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst +++ 
b/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst @@ -1,7 +1,7 @@ ``missing_values_handling`` --------------------------- -- Available in: Deep Learning, GLM, GAM +- Available in: Deep Learning, GLM, GAM, HGLM, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/model_id.rst b/h2o-docs/src/product/data-science/algo-params/model_id.rst index 222a4188c5b9..f1074afe934a 100644 --- a/h2o-docs/src/product/data-science/algo-params/model_id.rst +++ b/h2o-docs/src/product/data-science/algo-params/model_id.rst @@ -1,7 +1,7 @@ ``model_id`` ------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/offset_column.rst b/h2o-docs/src/product/data-science/algo-params/offset_column.rst index 999c763a6694..a7b7bce8c91e 100644 --- a/h2o-docs/src/product/data-science/algo-params/offset_column.rst +++ b/h2o-docs/src/product/data-science/algo-params/offset_column.rst @@ -1,7 +1,7 @@ ``offset_column`` ----------------- -- Available in: GBM, Deep Learning, GLM, GAM, CoxPH, XGBoost, Stacked Ensembles +- Available in: GBM, Deep Learning, GLM, GAM, HGLM, CoxPH, XGBoost, Stacked Ensembles, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/algo-params/plug_values.rst b/h2o-docs/src/product/data-science/algo-params/plug_values.rst index 9c60330da220..051df8dc2044 100644 --- a/h2o-docs/src/product/data-science/algo-params/plug_values.rst +++ 
b/h2o-docs/src/product/data-science/algo-params/plug_values.rst @@ -1,7 +1,7 @@ ``plug_values`` --------------- -- Available in: GLM, GAM +- Available in: GLM, GAM, HGLM, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/rand_family.rst b/h2o-docs/src/product/data-science/algo-params/rand_family.rst index d02187b3ffa3..93039511d074 100644 --- a/h2o-docs/src/product/data-science/algo-params/rand_family.rst +++ b/h2o-docs/src/product/data-science/algo-params/rand_family.rst @@ -1,7 +1,7 @@ ``rand_family`` --------------- -- Available in: GLM +- Available in: HGLM - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/random_columns.rst b/h2o-docs/src/product/data-science/algo-params/random_columns.rst index 7e7c53406ab5..ee2e621e114c 100644 --- a/h2o-docs/src/product/data-science/algo-params/random_columns.rst +++ b/h2o-docs/src/product/data-science/algo-params/random_columns.rst @@ -1,8 +1,8 @@ ``random_columns`` ------------------ -- Available in: GLM -- Hyperparameter: no +- Available in: HGLM +- Hyperparameter: yes Description ~~~~~~~~~~~ diff --git a/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst b/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst index 9a759131e6aa..5d2426a4a7ac 100644 --- a/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst +++ b/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst @@ -3,7 +3,7 @@ ``score_each_iteration`` ------------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/algo-params/seed.rst 
b/h2o-docs/src/product/data-science/algo-params/seed.rst index 334db7479a1e..1e5c20ed6afc 100644 --- a/h2o-docs/src/product/data-science/algo-params/seed.rst +++ b/h2o-docs/src/product/data-science/algo-params/seed.rst @@ -1,7 +1,7 @@ ``seed`` -------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Stacked Ensembles, Isolation Forest, Target Encoding, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Stacked Ensembles, Isolation Forest, Target Encoding, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/standardize.rst b/h2o-docs/src/product/data-science/algo-params/standardize.rst index 946fbd60c8f7..27bf489429ab 100644 --- a/h2o-docs/src/product/data-science/algo-params/standardize.rst +++ b/h2o-docs/src/product/data-science/algo-params/standardize.rst @@ -1,7 +1,7 @@ ``standardize`` --------------- -- Available in: Deep Learning, GLM, GAM, K-Means +- Available in: Deep Learning, GLM, GAM, HGLM, K-Means, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/training_frame.rst b/h2o-docs/src/product/data-science/algo-params/training_frame.rst index 76d9dd4ad91b..aa75fc63e10f 100644 --- a/h2o-docs/src/product/data-science/algo-params/training_frame.rst +++ b/h2o-docs/src/product/data-science/algo-params/training_frame.rst @@ -1,7 +1,7 @@ ``training_frame`` ------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, AutoML, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, AutoML, 
XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/validation_frame.rst b/h2o-docs/src/product/data-science/algo-params/validation_frame.rst index 4725a02883be..66b9ce8a4a72 100644 --- a/h2o-docs/src/product/data-science/algo-params/validation_frame.rst +++ b/h2o-docs/src/product/data-science/algo-params/validation_frame.rst @@ -1,7 +1,7 @@ ``validation_frame`` -------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/weights_column.rst b/h2o-docs/src/product/data-science/algo-params/weights_column.rst index 3ecbda1ca3b6..f1a64286915b 100644 --- a/h2o-docs/src/product/data-science/algo-params/weights_column.rst +++ b/h2o-docs/src/product/data-science/algo-params/weights_column.rst @@ -1,7 +1,7 @@ ``weights_column`` ------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, AutoML, XGBoost, CoxPH, Stacked Ensembles, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, AutoML, XGBoost, CoxPH, Stacked Ensembles, AdaBoost, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/x.rst b/h2o-docs/src/product/data-science/algo-params/x.rst index 98e27e8a5bc9..06020ab907a0 100644 --- a/h2o-docs/src/product/data-science/algo-params/x.rst +++ b/h2o-docs/src/product/data-science/algo-params/x.rst @@ -1,7 +1,7 @@ ``x`` ----- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, AdaBoost +- Available in: 
GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/y.rst b/h2o-docs/src/product/data-science/algo-params/y.rst index 82c7276707d1..46c1f07c9061 100644 --- a/h2o-docs/src/product/data-science/algo-params/y.rst +++ b/h2o-docs/src/product/data-science/algo-params/y.rst @@ -1,6 +1,6 @@ ``y`` ----- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, Naïve-Bayes, Stacked Ensembles, AutoML, XGBoost, Aggregator, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, Naïve-Bayes, Stacked Ensembles, AutoML, XGBoost, Aggregator, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/glm.rst b/h2o-docs/src/product/data-science/glm.rst index 2f71f9188a38..bc1d764ba848 100644 --- a/h2o-docs/src/product/data-science/glm.rst +++ b/h2o-docs/src/product/data-science/glm.rst @@ -71,18 +71,6 @@ Algorithm-specific parameters - `upload_custom_metric `__: Upload a custom metric into a running H2O cluster. -HGLM parameters -''''''''''''''' - -- `HGLM `__: If enabled, then an HGLM model will be built. If disabled (default), then a GLM model will be built. - -- **rand_link**: The link function for random component in HGLM specified as an array. Available options include ``identity`` and ``family_default``. - -- `random_columns `__: An array of random column indices to be used for ``HGLM``. - -- **startval**: The initial starting values for fixed and randomized coefficients in ``HGLM`` specified as a double array. - - Shared GLM family parameters '''''''''''''''''''''''''''' @@ -101,7 +89,12 @@ Shared GLM family parameters :scale: 5% :align: middle -**GLM Family**: |GAM| `Generalized Additive Models `__ (GAM) |MS| `ModelSelection `__ |ANOVA| `ANOVA GLM `__ +.. 
|HGLM| image:: ../images/HGLM.png + :alt: HGLM + :scale: 5% + :align: middle + +**GLM Family**: |GAM| `Generalized Additive Models `__ (GAM) |MS| `ModelSelection `__ |ANOVA| `ANOVA GLM `__ |HGLM| `Hierarchical Generalized Linear Model `__ (HGLM) - `alpha `__: |GAM| |MS| |ANOVA| Specify the regularization distribution between L1 and L2. A value of ``1`` produces LASSO regression; a value of ``0`` produces Ridge regression. The default value of ``alpha`` is ``0`` when ``SOLVER = 'L-BFGS'``; otherwise it is ``0.5`` to specify a mixing between LASSO and Ridge regression. @@ -115,7 +108,7 @@ Shared GLM family parameters - `compute_p_values `__: |GAM| |MS| |ANOVA| Request computation of p-values. P-values can be computed with or without regularization. Setting ``remove_collinear_columns`` is recommended. H2O will return an error if p-values are requested and there are collinear columns and ``remove_collinear_columns`` flag is not enabled. Note that this option is not available for ``family="multinomial"`` or ``family="ordinal"``; ``IRLSM`` solver requried. This option defaults to ``False`` (disabled). -- `family `__: |GAM| |MS| |ANOVA| Specify the model type. +- `family `__: |GAM| |MS| |ANOVA| |HGLM| Specify the model type. - If the family is ``gaussian``, the response must be numeric (**Real** or **Int**). - If the family is ``binomial``, the response must be categorical 2 levels/classes or binary (**Enum** or **Int**). @@ -175,7 +168,7 @@ Shared GLM family parameters - `objective_epsilon `__: |GAM| If the objective value is less than this threshold, then the model is converged. If ``lambda_search=True``, then this value defaults to ``.0001``. If ``lambda_search=False`` and ``lambda`` is equal to zero, then this value defaults to ``.000001``. For any other value of ``lambda``, the default value of ``objective_epsilon`` is set to ``.0001``. The default value is ``-1``. 
-- `plug_values `__: |GAM| |MS| |ANOVA| (Applicable only if ``missing_values_handling="PlugValues"``) Specify a single row frame containing values that will be used to impute missing values of the training/validation frame. +- `plug_values `__: |GAM| |MS| |ANOVA| |HGLM| (Applicable only if ``missing_values_handling="PlugValues"``) Specify a single row frame containing values that will be used to impute missing values of the training/validation frame. - `prior `__: |GAM| |MS| |ANOVA| Specify prior probability for :math:`p(y==1)`. Use this parameter for logistic regression if the data has been sampled and the mean of response does not reflect reality. This value defaults to ``-1`` and must be a value in the range (0,1). @@ -183,7 +176,7 @@ Shared GLM family parameters - `remove_collinear_columns `__: |GAM| |MS| Specify whether to automatically remove collinear columns during model-building. When enabled, collinear columns will be dropped from the model and will have 0 coefficient in the returned model. This option defaults to ``False`` (disabled). -- **score_iteration_interval**: |MS| Perform scoring for every ``score_iteration_interval`` iteration. This option defaults to ``-1``. +- **score_iteration_interval**: |MS| |HGLM| Perform scoring for every ``score_iteration_interval`` iteration. This option defaults to ``-1``. - `solver `__: |GAM| |MS| |ANOVA| Specify the solver to use. One of: @@ -954,247 +947,6 @@ While not converged: i. Theta :math:`\gets` Maximum Likelihood estimate using Newton’s method with learning rate estimated using Golden section search -Hierarchical GLM -~~~~~~~~~~~~~~~~ - -Introduced in 3.28.0.1, Hierarchical GLM (HGLM) fits generalized linear models with random effects, where the random effect can come from a conjugate exponential-family distribution (for example, Gaussian). HGLM allows you to specify both fixed and random effects, which allows fitting correlated to random effects as well as random regression models. 
HGLM can be used for linear mixed models and for generalized linear mixed models with random effects for a variety of links and a variety of distributions for both the outcomes and the random effects. - -**Note**: The initial release of HGLM supports only the Gaussian family and random family. - -Gaussian Family and Random Family in HGLM -''''''''''''''''''''''''''''''''''''''''' - -To build an HGLM, we need the hierarchical log-likelihood (h-likelihood) function. The h-likelihood function can be expressed as (equation 1): - -.. math:: - - h(\beta, \theta, u) = \log(f (y|u)) + \log (f(u)) - -for fixed effects :math:`\beta`, variance components :math:`\theta`, and random effects :math:`u`. - -A standard linar mixed model can be expressed as (equation 2): - -.. math:: - - y = X\beta + Zu + e - -where - - - :math:`e \text ~ N(0, I_n, \delta_e^2), u \text ~ N(0, I_k, \delta_u^2)` - - :math:`e, u` are independent, and :math:`u` represents the random effects - - :math:`n` is the number of i.i.d observations of :math:`y` with mean :math:`0` - - :math:`q` is the number of values :math:`Z` can take - -Then rewriting equation 2 as :math:`e = X\beta + Zu - y` and derive the h-likelihood as: - -.. figure:: ../images/h-likelihood.png - :align: center - -where :math:`C_1 = - \frac{n}{2} \log(2\pi), C_2 = - \frac{q}{2} \log(2\pi)` - -In principal, the HGLM model building involves the following main steps: - -1. Set the initial values to :math:`\delta_u^2, \delta_e^2, u, \beta` -2. Estimate the fixed (:math:`\beta`) and random effects (:math:`u`) by solving for :math:`\frac{\partial h}{\partial \beta} = 0, \frac{\partial h}{\partial u} = 0` -3. Estimate variance components using the adjusted profile likelihood: - - .. math:: - - h_p = \big(h + \frac{1}{2} log \big| 2 \pi D^{-1}\big| \big)_{\beta=\hat \beta, u=\hat u} - - and solving for - - .. 
math:: - - \frac{\partial h_p}{\partial \theta} = 0 - - Note that :math:`D` is the matrix of the second derivatives of :math:`h` around :math:`\beta = \hat \beta, u = \hat u, \theta = (\delta_u^2, \delta_e^2)`. - -H2O Implementation -'''''''''''''''''' - -In reality, Lee and Nelder (see References) showed that linear mixed models can be fitted using a hierarchy of GLM by using an augmented linear model. The linear mixed model will be written as: - -.. math:: - - y = X\beta + Zu + e \\ - v = ZZ^T\sigma_u^2 + R\sigma_e^2 - -where :math:`R` is a diagonal matrix with elements given by the estimated dispersion model. The dispersion model refers to the variance part of the fixed effect model with error :math:`e`. There are cases where the dispersion model is modeled itself as :math:`exp(x_d, \beta_d)`. However, in our current version, the variance is just a constant :math:`\sigma_e^2`, and hence :math:`R` is just a scalar value. It is initialized to be the identity matrix. The model can be written as an augmented weighted linear model: - -.. math:: - - y_a = T_a \delta + e_a - -where - -.. figure:: ../images/hglm_augmentation.png - :align: center - -Note that :math:`q` is the number of columns in :math:`Z, 0_q` is a vector of :math:`q` zeroes, :math:`I_q` is the :math:`qxq` identity matrix. The variance-covariance matrix of the augmented residual matrix is - -.. figure:: ../images/hglm_variance_covariance.png - :align: center - -Fixed and Random Coefficients Estimation -'''''''''''''''''''''''''''''''''''''''' - -The estimates for :math:`\delta` from weighted least squares are given by solving - -.. math:: - - T_a^T W^{-1} T_a \delta=T_a^T W^{-1} y_a - -where - -.. math:: - - W= V(e_a ) - -The two variance components are estimated iteratively by applying a gamma GLM to the residuals :math:`e_i^2,u_i^2`. Because we are not using a dispersion model, there is only an intercept terms in the linear predictors. 
The leverages :math:`h_i` for these models are calculated from the diagonal elements of the hat matrix: - -.. math:: - - H_a=T_a (T_a^T W^{-1} T_a )^{-1} T_a^T W^{-1} - -Estimation of Fixed Effect Dispersion Parameter/Variance -'''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -A gamma GLM is used to fit the dispersion part of the model with response -:math:`y_{d,i}=(e_i^2)⁄(1-h_i )` where :math:`E(y_d )=u_d` and :math:`u_d≡\phi` (i.e., :math:`\delta_e^2` for a Gaussian response). The GLM model for the dispersion parameter is then specified by the link function :math:`g_d (.)` and the linear predictor :math:`X_d \beta_d` with prior weights for :math:`(1-h_i )⁄2` for :math:`g_d (u_d )=X_d \beta_d`. Because we are not using a dispersion model, :math:`X_d \beta_d` will only contain the intercept term. - -Estimation of Random Effect Dispersion Parameter/Variance -''''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -Similarly, a gamma GLM is fitted to the dispersion term :math:`alpha` (i.e., :math:`\delta_e^2` for a GLM) for the random effect :math:`v`, with :math:`y_\alpha,j = u_j^2⁄(1-h_{n+j}), j=1,2,…,q` and :math:`g_\alpha (u_\alpha )=\lambda`, where the prior weights are :math:`(1-h_{n+j} )⁄2`, and the estimated dispersion term for the random effect is given by :math:`\hat \alpha = g_α^{-1}(\hat \lambda)`. - -Fitting Algorithm Overview -'''''''''''''''''''''''''' - -The following fitting algorithm from "Generalized linear models with random effects" (Y. Lee, J. A. Nelder and Y. Pawitan; see References) is used to build our HGLM. Let :math:`n` be the number of observations and :math:`k` be the number of levels in the random effect. The algorithm that was implemented here at H2O will perform the following: - -1. Initialize starting values either from user by setting parameter startval or by the system if startval is left unspecified. -2. Construct an augmented model with response :math:`y_{aug}= {y \choose {E(u)}}`. -3. 
Use a GLM to estimate :math:`\delta={\beta \choose u}` given the dispersion :math:`\phi` and :math:`\lambda`. Save the deviance components and leverages from the fitted model. -4. Use a gamma GLM to estimate the dispersion parameter for :math:`\phi` (i.e. :math:`\delta_e^2` for a Gaussian response). -5. Use a similar GLM as in step 4 to estimate :math:`\lambda` from the last :math:`k` deviance components and leverages obtained from the GLM in step 3. -6. Iterate between steps 3-5 until convergence. Note that the convergence measure here is either a timeout event or the following condition has been met: :math:`\frac {\Sigma_i{(\text{eta}. i - \text{eta}.o)^2}} {\Sigma_i(\text{eta}.i)^2 \text{<} 1e - 6}`. - -A timeout event can be defined as the following: - -1. Maximum number of iterations have been reached -2. Model building run time exceeds what is specified in ``max_runtime_secs`` -3. A user has clicked on stop model button or similar from Flow. - -For families and random families other than Gaussian, link functions are used to translate from the linear space to the model the mean output. - -Linear Mixed Model with Correlated Random Effect -'''''''''''''''''''''''''''''''''''''''''''''''' - -Let :math:`A` be a matrix with known elements that describe the correlation among the random effects. The model is now given by: - -.. figure:: ../images/hglm_linear_mixed_model1.png - :align: center - -where :math:`N` is normal distribution and :math:`MVN` is multi-variable normal. This can be easily translated to: - -.. figure:: ../images/hglm_linear_mixed_model2.png - :align: center - -where :math:`Z^* = ZL` and :math:`L` is the Cholesky factorization of :math:`A`. Hence, if you have correlated random effects, you can first perform the transformation to your data before using our HGLM implementation here. 
- -HGLM Model Metrics -'''''''''''''''''' - -H2O provides the following model metrics at the end of each HGLM experiment: - -- fixef: fixed effects coefficients -- ranef: random effects coefficients -- randc: vector of random column indices -- varfix: dispersion parameter of the mean model -- varranef: dispersion parameter of the random effects -- converge: true if algorithm has converge, otherwise false -- sefe: standard errors of fixed effects -- sere: standard errors of random effects -- dfrefe: deviance degrees of freedom for the mean part of model -- sumvc1: estimates and standard errors of linear predictor in the dispersion model -- summvc2: estimates and standard errors of the linear predictor for the dispersion parameter of the random effects -- likelihood: if ``calc_like`` is true, the following four values are returned: - - - hlik: log-h-likelihood; - - pvh: adjusted profile log-likelihood profiled over the random effects; - - pbvh: adjusted profile log-likelihood profiled over fixed and random effects; - - caic: conditional AIC. - -- bad: row index of the most influential observation. - -Mapping of Fitting Algorithm to the H2O-3 Implementation -'''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -This mapping is done in four steps: - -1. Initialize starting values by the system. -2. Estimate :math:`\delta =` :math:`\beta \choose u`. -3. Estimate :math:`\delta_e^2(\text {tau})`. -4. Estimate :math:`\delta_u^2(\text {phi})`. - -**Step 1**: Initialize starting values by the system. - -Following the implementation from R, when a user fails to specify starting values for psi, :math:`\beta`, :math:`\mu`, :math:`\delta_e^2`, :math:`\delta_u^2`, we will do it for the users as follows: - - 1. A GLM model is built with just the fixed columns and response. - 2. Next init_sig_e(:math:`\delta_e^2`)/tau is set to 0.6*residual_deviance()/residual_degrees_of_freedom(). - 3. init_sig_u(:math:`\delta_u^2`) is set to 0.66*init_sig_e. - 4. 
For numerical stability, we restrict the magnitude to init_sig_e and init_sig_u to >= 0.1. - 5. Set phi = vector of length number of random columns of value init_sig_u/(number of random columns). - 6. Set :math:`\beta` to the GLM model coefficients, :math:`\mu` to be a zero vector. - 7. Set psi to be a zero vector. - -**Step 2**: Estimate :math:`\delta =` :math:`\beta \choose u`. - -Given the current values of :math:`\delta_e^2, \delta_u^2`, we will solve for :math:`\delta =` :math:`\beta \choose u`. Instead of solving :math:`\delta` from :math:`T_a^T W^{-1} T_a \delta=T_a^T W^{-1} y_a`, a different set of formulae are used. A loop is used to solve for the coefficients: - - 1. The following variables are generated: - - - :math:`v.i= g_r^{-1} (u_i)` where :math:`u_i` are the random coefficients of the random effects/columns and :math:`g_r^{-1}` can be considered as the inverse link function. - - :math:`tau` is a vector of length number of data containing init.sig.e; - - :math:`eta.i=X_i \beta+offset` and store the previous :math:`eta.i` as :math:`eta.o`. - - :math:`mu.i=g^{-1} (eta.i)`. - - dmu_deta is derivative of :math:`g^{-1} (eta.i)` with respect to :math:`eta.i`, which is 1 for identity link. - - :math:`z_i=eta.i-offset+(y_i-mu.i)/\text {dmu_deta}` - - :math:`zmi= \text{psi}` - - :math:`augZ =` :math:`zi \choose zmi`. - - du_dv is the derivative of :math:`g_r^{-1} (u_i)` with respect to :math:`v.i.` Again, for identity link, this is 1. - - The weight :math:`W =` :math:`wdata \choose wpsi` where :math:`wdata = \frac {d \text{mu_deta}^2}{\text {prior_weight*family}\$\text{variance}(mu.i)*tau}` and :math:`wpsi = \frac {d \text{u_dv}^2}{\text {prior_weight*family}\$\text{variance(psi)*phi}}` - - 2. Finally the following formula is used to solve for the parameters: :math:`augXZ \cdot \delta=augZW` where :math:`augXZ=T_a \cdot W` and :math:`augZW=augZ \cdot W`: - - - Use QR decomposition to augXZ and obtain: :math:`QR \delta = augZW`. 
- - Use backward solve to obtain the coefficients :math:`\delta` from :math:`R \delta = Q^T augZW`.
-  - Calculate :math:`hv=\text{rowsum}(Q)` of length n + number of expanded random columns and store it in returnFrame.
-  - Calculate :math:`dev =` :math:`prior weight*(y_i-mu.i)^2 \choose (psi -u_i )^2` of length n + number of expanded random columns and store it in returnFrame.
-  - Calculate :math:`resid= \frac {(y-mu.i)} {\sqrt \frac {sum(dev)(1-hv)}{n-p}}` of length n and store it in returnFrame.
-  - Go back to step 1 unless :math:`\Sigma_i(eta.i-eta.o)^2 / \Sigma_i(eta.i)^2<1e-6` or a timeout event has occurred.
-
-**Step 3**: Estimate :math:`\delta_e^2(\text {tau})`
-
-With the newly estimated fixed and random coefficients, we will estimate the dispersion parameter for the fixed effects/columns by building a gamma GLM:
-
- 1. Generate a training frame with a constant predictor column of 1 to force the GLM model to generate only the intercept term:
-
-  - Response column as :math:`dev/(1-hv)`.
-  - Weight column as :math:`(1-hv)/2`.
-  - Predictor column of ones.
-  - The length of the training frame is the number of data rows.
-
- 2. Build a gamma GLM with ``family=gamma`` and ``link=log``.
- 3. Set :math:`tau = \text {exp (intercept value)}`.
- 4. Assign the estimation standard error and sigma from the GLM standard error calculation for coefficients.
-
-**Step 4**: Estimate :math:`\delta_u^2(\text {phi})`.
-
-Again, a gamma GLM model is used here. In addition, the error estimates are generated for each random column. Exactly the same steps are used here as in Step 3. The only difference is that we are looking at the :math:`dev, hv` corresponding to the expanded random columns/effects.
-
 .. 
_regularization:

 Regularization
diff --git a/h2o-docs/src/product/data-science/hglm.rst b/h2o-docs/src/product/data-science/hglm.rst
new file mode 100644
index 000000000000..1a98e63ad3ca
--- /dev/null
+++ b/h2o-docs/src/product/data-science/hglm.rst
@@ -0,0 +1,392 @@
+Hierarchical Generalized Linear Model (HGLM)
+============================================
+
+Introduction
+------------
+
+Hierarchical linear models (HLM) are used when measurements are taken within clusters of data and there are cluster-level effects that can affect the GLM coefficient values.
+
+For instance, suppose we measure the performance of students from multiple schools along with other predictors like family annual income, student health, and school type (public, private, religious, etc.). We would suspect that students from the same school perform more similarly to one another than to students from different schools. Therefore, we can denote the coefficient for predictor :math:`m \text{ as } \beta_{mj}`, where :math:`j` denotes the school index in our example. :math:`\beta_{0j}` denotes the intercept associated with school :math:`j`.
+
+A level-1 HLM can be expressed as:
+
+.. math::
+
+   y_{ij} = \beta_{0j} + \sum_{m=1}^{p-1} x_{mij} \beta_{mj} + \varepsilon_{ij} \quad \text{ equation 1}
+
+The level-2 model can be expressed as:
+
+.. math::
+
+   \beta_{0j} = \beta_{00} + u_{0j}, \beta_{mj} = \beta_{m0} + u_{mj} \quad \text{ equation 2}
+
+where:
+
+- :math:`j(=[1,2,...,J])` denotes the cluster (level-2 variable) the measurement is taken from (e.g. the school index);
+- :math:`i(=1,2,...,n_j)` denotes the data index taken from within cluster :math:`j`;
+- :math:`\beta_{00}` is the fixed intercept;
+- :math:`\beta_{0j}` is the random intercept;
+- :math:`\beta_{m0}` is the fixed coefficient for predictor :math:`m`;
+- The dimension of the fixed effect coefficients is :math:`p`, which includes the intercept;
+- :math:`u_{mj}` is the random coefficient for predictor :math:`m`. 
For predictors without a random coefficient, :math:`u_{mj} = 0`;
+- The dimension of the random effect coefficients is :math:`q`, which can include the intercept. Note that :math:`q \leq p`;
+- :math:`\varepsilon_{ij} \sim N(0, \delta_e^2)`;
+- :math:`u_{mj} \sim N(0, \delta_u^2)`;
+- :math:`\varepsilon_{ij}, u_{mj}` are independent;
+- :math:`u_{mj}, u_{m^{'}j}` are independent if :math:`m \neq m^{'}`.
+
+We need to solve for the following parameters: :math:`\beta_{00}, \beta_{0j}, \beta_{m0}, u_{mj}, \delta_e^2, \delta_u^2`.
+
+Defining an HGLM model
+----------------------
+Parameters are optional unless specified as *required*.
+
+Algorithm-specific parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- **em_epsilon**: (Only available for the EM method) Converge if the change in beta/ubeta/tmat/tauEVar (measured with the L-infinity norm) is less than the EM epsilon (defaults to ``0.001``).
+
+- **gen_syn_data**: If enabled, synthetic HGLM data is generated with the fixed coefficients specified in ``initial_fixed_effects`` and the random coefficients taken from ``initial_random_effects`` (or randomly generated if none are given). In particular, it generates the following output: :math:`Y_j = A_{fj} \theta_f + A_{rj} \theta_{rj} + r_j`. The Gaussian noise is generated with the variance specified in ``tau_e_var_init``. If the random coefficients are randomly generated, they are drawn from a Gaussian distribution with the variance specified in ``tau_u_var_init``.
+
+- **group_column**: Specify the level-2 variable name, which is categorical and used to generate the groups in HGLM (defaults to ``None``).
+
+- **initial_fixed_effects**: An array that contains the initial values of the fixed effects coefficients (defaults to ``None``).
+
+- **initial_random_effects**: An H2OFrame ID that contains the initial values of the random effects coefficients. The row names should be the random coefficient names (defaults to ``None``).
+
+  .. 
note::
+
+    If you aren't sure what the random coefficient names are, build the HGLM model with ``max_iterations=0`` and check the model output field ``random_coefficient_names``. The number of rows of this frame should equal the number of level-2 units (see the model output field ``group_column_names``; the number of rows should equal the length of ``group_column_names``).
+
+- **initial_t_matrix**: An H2OFrame ID that contains the initial values of the T matrix. It should be a symmetric positive definite matrix (defaults to ``None``).
+
+- **method**: The method used to estimate the fixed and random coefficients as well as the various variances (defaults to ``"em"``).
+
+- `random_columns `__: An array of random column names from which random effects coefficients will be generated in the model building process.
+
+- `rand_family `__: Specify the distribution of the random effects. Currently only ``rand_family="gaussian"`` is supported.
+
+- **random_intercept**: If enabled, a random intercept is generated as part of the random effects coefficients (defaults to ``True``).
+
+- **tau_e_var_init**: Initial variance estimate of the random noise (residual noise). If set, the value must be > 0.0. If not set, it will be randomly set during the model building process (defaults to ``0.0``).
+
+- **tau_u_var_init**: Initial variance estimate of the random effects. If set, the value must be > 0.0. If not set, it will be randomly set during the model building process (defaults to ``0.0``).
+
+Common parameters
+~~~~~~~~~~~~~~~~~
+
+- `custom_metric_func `__: Specify a custom evaluation function.
+
+- `ignore_const_cols `__: Enable this option to ignore constant training columns, since no information can be gained from them. This option defaults to ``True`` (enabled).
+
+- `ignored_columns `__: (Python and Flow only) Specify the column or columns to be excluded from the model. 
In Flow, click the checkbox next to a column name to add it to the list of columns excluded from the model. To add all columns, click the **All** button. To remove a column from the list of ignored columns, click the X next to the column name. To remove all columns from the list of ignored columns, click the **None** button. To search for a specific column, type the column name in the **Search** field above the column list. To only show columns with a specific percentage of missing values, specify the percentage in the **Only show columns with more than 0% missing values** field. To change the selections for the hidden columns, use the **Select Visible** or **Deselect Visible** buttons.
+
+- `max_iterations `__: Specify the number of training iterations. This option defaults to ``-1``.
+
+- `max_runtime_secs `__: Maximum allowed runtime in seconds for model training. Use ``0`` (default) to disable.
+
+- `missing_values_handling `__: Specify how to handle missing values. One of: ``Skip``, ``MeanImputation`` (default), or ``PlugValues``.
+
+- `model_id `__: Specify a custom name for the model to use as a reference. By default, H2O automatically generates a destination key.
+
+- `offset_column `__: Specify a column to use as the offset; the value cannot be the same as the value for the ``weights_column``.
+
+  .. note::
+
+    Offsets are per-row "bias values" that are used during model training. For Gaussian distributions, they can be seen as simple corrections to the response (``y``) column. Instead of learning to predict the response (y-row), the model learns to predict the (row) offset of the response column. For other distributions, the offset corrections are applied in the linearized space before applying the inverse link function to get the actual response values.
+
+- `score_each_iteration `__: Enable this option to score during each iteration of the model training. This option defaults to ``False`` (disabled). 
+ +- `seed `__: Specify the random number generator (RNG) seed for algorithm components dependent on randomization. The seed is consistent for each H2O instance so that you can create models with the same starting conditions in alternative configurations. This option defaults to ``-1`` (time-based random number). + +- `training_frame `__: *Required* Specify the dataset used to build the model. **NOTE**: In Flow, if you click the **Build a model** button from the ``Parse`` cell, the training frame is entered automatically. + +- `validation_frame `__: Specify the dataset used to evaluate the accuracy of the model. + +- `weights_column `__: Specify a column to use for the observation weights, which are used for bias correction. The specified ``weights_column`` must be included in the specified ``training_frame``. + + *Python only*: To use a weights column when passing an H2OFrame to ``x`` instead of a list of column names, the specified ``training_frame`` must contain the specified ``weights_column``. + + .. note:: + + Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more due to the larger loss function pre-factor. + +- `x `__: Specify a vector containing the names or indices of the predictor variables to use when building the model. If ``x`` is missing, then all columns except ``y`` are used. + +- `y `__: *Required* Specify the column to use as the dependent variable. + + - For a regression model, this column must be numeric (**Real** or **Int**). + - For a classification model, this column must be categorical (**Enum** or **String**). If the family is ``Binomial``, the dataset cannot contain more than two levels. 
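The parameters above configure the EM-based estimation that the next section derives in detail. As a rough, self-contained illustration of that idea, the sketch below runs EM for a random-intercept linear mixed model on synthetic data with plain NumPy. This is a toy re-implementation for intuition only, not H2O-3 code; all data sizes, variable names, and starting values are made up.

```python
import numpy as np

rng = np.random.default_rng(42)

# Synthetic two-level data (hypothetical sizes): J groups, n_j rows each,
# p fixed effects with the intercept placed last, one random intercept.
J, n_j, p = 50, 30, 3
beta_true = np.array([1.0, -2.0, 0.5])
T_true, sig2_true = 0.8, 0.25

X = [np.column_stack([rng.normal(size=(n_j, p - 1)), np.ones(n_j)]) for _ in range(J)]
u_true = rng.normal(0.0, np.sqrt(T_true), J)                # random intercepts
y = [X[j] @ beta_true + u_true[j] + rng.normal(0.0, np.sqrt(sig2_true), n_j)
     for j in range(J)]

# EM iteration: the E-step gives each group's conditional intercept mean and
# variance (cf. equations 14-15 with A_rj a column of ones); the M-step applies
# the CDSS-based updates (cf. equations 8-10).
beta, T, sig2 = np.zeros(p), 1.0, 1.0
for _ in range(500):
    C = n_j + sig2 / T                                      # scalar C_j here
    u_star = np.array([(y[j] - X[j] @ beta).sum() / C for j in range(J)])
    XtX = sum(X[j].T @ X[j] for j in range(J))
    Xty = sum(X[j].T @ (y[j] - u_star[j]) for j in range(J))
    beta_new = np.linalg.solve(XtX, Xty)                    # OLS on adjusted y
    T = np.mean(u_star**2 + sig2 / C)                       # average of E[u_j^2]
    resid = [y[j] - X[j] @ beta_new - u_star[j] for j in range(J)]
    sig2 = (sum(r @ r for r in resid) + sig2 * J * n_j / C) / (J * n_j)
    if np.max(np.abs(beta_new - beta)) < 1e-10:
        beta = beta_new
        break
    beta = beta_new

print(np.round(beta, 2))  # fixed-effect estimates; should land near beta_true
```

With more groups, the variance estimates T and sig2 also settle near their generating values; the same E/M structure extends to vector-valued random effects by replacing the scalar C_j with the q-by-q matrix of equation 15.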
+
+
+Estimation of parameters using machine learning estimation via EM
+-----------------------------------------------------------------
+
+The Expectation-Maximization (EM) algorithm addresses the problem of maximizing the likelihood by treating it as a missing-data problem.
+
+Model setup
+~~~~~~~~~~~
+
+Consider a combined model for each unit :math:`j`:
+
+.. math::
+
+   Y_j = A_{fj} \theta_f + A_{rj} \theta_{rj} + r_j, \theta_{rj} \sim N(0,T_j), r_j \sim N(0, \sigma^2I) \quad \text{ equation 6}
+
+where:
+
+- :math:`A_{fj} = \begin{bmatrix} x^T_{j1} \\ x^T_{j2} \\ x^T_{j3} \\ \vdots \\ x^T_{jn_j} \\\end{bmatrix}` is a known :math:`n_j \text{ by } p` matrix of level-1 predictors and :math:`x_{ji} = \begin{bmatrix} x^1_{ji} \\ x^2_{ji} \\ \vdots \\ x^{p-1}_{ji} \\ 1 \\\end{bmatrix}`;
+
+  .. note::
+
+    In general, you can place the intercept at the beginning or the end of each row of data, but we chose to put it at the end for our implementation.
+
+- :math:`\theta_f \text{ is a } p` by 1 vector of fixed coefficients;
+- :math:`A_{rj}` is usually denoted by :math:`Z_{rj} \text{ where } Z_{rj} = \begin{bmatrix} z^T_{j1} \\ z^T_{j2} \\ z^T_{j3} \\ \vdots \\ z^T_{jn_j} \\\end{bmatrix}`;
+
+  .. note::
+
+    We included a term for the random intercept here. However, there are cases where we do not have a random intercept, and the last element of 1 will not be there for :math:`z_{ji}`.
+
+- :math:`\theta_{rj}` represents the random coefficient and is a :math:`q` by 1 vector;
+- :math:`r_j \text{ is an } n_j` by 1 vector of level-1 residual noise assumed multivariate normal in distribution with 0 mean vector and covariance matrix :math:`\sigma^2 I_{n_j \times n_j}`, where :math:`I_{n_j \times n_j}` is the :math:`n_j \text{ by } n_j` identity matrix;
+- :math:`j` denotes the level-2 units where :math:`j = 1,2, \cdots , J`;
+- :math:`T_j` is a symmetric positive definite matrix of size :math:`q \text{ by } q`. 
We assume that :math:`T_j` is the same for all :math:`j = 1,2, \cdots , J`, and it is kept symmetric positive definite throughout the whole model building process.
+
+M-step
+~~~~~~
+
+EM conceives of :math:`Y_j` as the observed data with :math:`\theta_{rj}` as the missing data. Therefore, the complete data are :math:`(Y_j, \theta_{rj}), j=1, \cdots, J \text{ while } \theta_f, \sigma^2, \text{ and } T_j` are the parameters that need to be estimated. If the complete data were observed, finding the ML estimates would be simple. To estimate :math:`\theta_f`, subtract :math:`A_{rj} \theta_{rj}` from both sides of *equation 6*, yielding:
+
+.. math::
+
+   Y_j - A_{rj} \theta_{rj} = A_{fj} \theta_f + r_j \quad \text{ equation 7}
+
+Next, multiply *equation 7* by :math:`A^T_{fj}` and sum across the level-2 unit :math:`j`. Note that :math:`\sum^J_{j=1} A^T_{fj} r_j \approx 0`. Re-arrange the terms and you will get *equation 8*, which is also the ordinary least squares (OLS) estimate:
+
+.. math::
+
+   \hat{\theta_f} = \Big( \sum^J_{j=1} A^T_{fj} A_{fj} \Big)^{-1} \sum^J_{j=1} A^T_{fj} (Y_j - A_{rj} \theta_{rj}) \quad \text{ equation 8}
+
+Next, the ML estimators for :math:`T_j` and :math:`\sigma^2` are straightforward:
+
+.. math::
+
+   \hat{T_j} = J^{-1} \sum^J_{j=1} \theta_{rj} \theta^T_{rj} \quad \text{ equation 9}
+
+.. math::
+
+   \hat{\sigma^2} = N^{-1} \sum^J_{j=1} \hat{r^T_j} \hat{r_j} = N^{-1} \sum^J_{j=1} \big( Y_j - A_{fj} \hat{\theta_f} - A_{rj} \theta_{rj} \big)^T \big( Y_j - A_{fj} \hat{\theta_{f}} - A_{rj} \theta_{rj} \big) \quad \text{ equation 10}
+
+where :math:`N = \sum^J_{j=1} n_j`.
+
+.. note::
+
+   This reasoning defines certain complete-data sufficient statistics (CDSS), that is, statistics that would be sufficient to estimate :math:`\theta_f, T, \text{ and } \sigma^2` if the complete data were observed. These are:
+
+   .. 
math::
+
+      \sum^J_{j=1} A^T_{fj} A_{rj} \theta_{rj}, \sum^J_{j=1} \theta_{rj} \theta^T_{rj}, \sum^J_{j=1} Y^T_j A_{rj} \theta_{rj}, \sum^J_{j=1} \theta^T_{rj} A^T_{rj} A_{rj} \theta_{rj} \quad \text{ equation 11}.
+
+E-step
+~~~~~~
+
+While the CDSS are not observed, they can be estimated by their conditional expectations given the data :math:`Y` and the parameter estimates from the previous iteration. `Dempster et al. [4] <#references>`__ showed that substituting the expected CDSS into the M-step formulas produces new parameter estimates with a higher likelihood than the current ones.
+
+To find :math:`E(CDSS | Y, \theta_f, T, \sigma^2)` requires deriving the conditional distribution of the missing data :math:`\theta_r`, given :math:`Y, \theta_f, T, \sigma^2`. From *equation 6*, the joint distribution of the complete data is:
+
+.. math::
+
+   \begin{pmatrix} Y_j \\ \theta_{rj} \\\end{pmatrix} \sim N \Bigg[ \begin{pmatrix} A_{fj} \theta_{f} \\ 0 \\\end{pmatrix} , \begin{pmatrix} A_{rj}T_jA^T_{rj} + \sigma^2 I & A_{rj}T_j \\ T_j A^T_{rj} & T_j \\\end{pmatrix} \Bigg] \quad \text{ equation 12}
+
+From *equation 12*, we can obtain the conditional distribution of the missing data given the observed data as follows:
+
+.. math::
+
+   \theta_{rj} | Y, \theta_f, T_j, \sigma^2 \sim N (\theta^*_{rj}, \sigma^2 C_j^{-1}) \quad \text{ equation 13}
+
+with
+
+.. math::
+
+   \theta^*_{rj} = C^{-1}_j A^T_{rj} (Y_j - A_{fj} \theta_f) \quad \text{ equation 14}
+
+   C_j = A^T_{rj} A_{rj} + \sigma^2 T^{-1}_j \quad \text{ equation 15}
+
+The complete EM algorithm
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The complete EM algorithm is as follows:
+
+1. Initialization: randomly assign some small values to :math:`\theta_f, \sigma^2, T_j`;
+2. Estimation: estimate the CDSS:
+
+   .. 
math::
+
+      E \big( \sum^J_{j=1} A^T_{fj} A_{rj} \theta_{rj} | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} A^T_{fj} A_{rj} \theta^*_{rj} \\ E \big( \sum^J_{j=1} \theta_{rj} \theta^T_{rj} | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} \theta^*_{rj} \theta^{*T}_{rj} + \sigma^2 \sum^J_{j=1} C^{-1}_j \quad \text{ equation 17} \\ E \big( \sum^J_{j=1} r^T_j r_j | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} r^{*T}_j r^*_j + \sigma^2 \sum^J_{j=1} tr(C^{-1}_j A^T_{rj} A_{rj})
+
+   where: :math:`r^*_j = Y_j - A_{fj} \theta_f - A_{rj} \theta^*_{rj}, \theta^*_{rj} = C^{-1}_j A^T_{rj} (Y_j - A_{fj} \theta_f), C_j = A^T_{rj} A_{rj} + \sigma^2 T^{-1} \text{ and } \theta_f, \sigma^2, T` are based on the previous iteration or on the initialization;
+
+3. Substitution: substitute the estimated CDSS from *equation 17* into the M-step formulas (*equations 8, 9,* and *10*);
+4. Processing: feed the new estimates of :math:`\theta_f, \sigma^2, T_j` into step 2;
+5. Cycling: continue steps 2, 3, and 4 until the following stopping condition is satisfied:
+
+   - The largest change in the value of any of the parameters is sufficiently small.
+
+Log-likelihood for HGLM
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The model for level-2 unit :math:`j` can be written as:
+
+.. 
math::
+
+   Y_j = A_{fj} \theta_f + d_j = X_j \theta_f + d_j, \quad d_j \sim N(0,V_j)
+
+where:
+
+- :math:`Y_j \text{ is an } n_j` by 1 outcome vector;
+- :math:`A_{fj} / X_j = \begin{bmatrix} x^T_{j1} \\ x^T_{j2} \\ x^T_{j3} \\ \vdots \\ x^T_{jn_{j}} \\\end{bmatrix}` is a known :math:`n_j \text{ by } p` matrix of level-1 predictors and :math:`x_{ji} = \begin{bmatrix} x^1_{ji} \\ x^2_{ji} \\ \vdots \\ x^{p-1}_{ji} \\ 1 \\\end{bmatrix}`;
+- :math:`\theta_f \text{ is a } p` by 1 vector of fixed effects;
+- :math:`d_j = A_{rj} \theta_{rj} + r_j = Z_j \theta_{rj} + r_j , A_{rj} / Z_j \text{ is } n_j \text{ by } q`;
+- :math:`\theta_{rj} \sim N(0,T), \theta_{rj} \text{ is } q` by 1, :math:`T \text{ is } q \text{ by } q`;
+- :math:`r_j \sim N(0, \sigma^2 I_{n_j}), I_{n_j} \text{ is } n_j \text{ by } n_j`;
+- :math:`V_j = A_{rj} TA^T_{rj} + \sigma^2 I_{n_j} = Z_j TZ^T_j + \sigma^2 I_{n_j}, \text{ is } n_j \text{ by } n_j`.
+
+For each level-2 unit :math:`j`, the likelihood can be written as:
+
+.. math::
+
+   L(Y_j; \theta_f, \sigma^2, T_j) = (2 \pi)^{-n_{j} /2} |V_j |^{-1/2} \exp \{ -\frac{1}{2} d^T_j V^{-1}_j d_j\}
+
+The log-likelihood is:
+
+.. math::
+
+   ll(Y_j; \theta_f, \sigma^2 , T_j) = -\frac{1}{2} \Big( n_j \log{(2 \pi)} + \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \Big)
+
+Since we assume that the random effects are i.i.d., the total log-likelihood is just the sum of the log-likelihoods for each level-2 unit. Let :math:`T=T_j`:
+
+.. math::
+
+   ll(Y; \theta_f, \sigma^2, T) \\
+
+   = \sum^J_{j=1} \Big\{ - \frac{1}{2} \big( n_j \log{(2 \pi)} + \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \big) \Big\} =
+
+   -\frac{1}{2} n \log{(2 \pi)} -\frac{1}{2} \Big\{ \sum^J_{j=1} \big( \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \big) \Big\}
+
+:math:`|V_j|` can be calculated as:
+
+.. 
math::
+
+   |V_j| = \Big|Z_j TZ^T_j + \sigma^2 I_{n_j} \Big| = \Big|T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big| |T| \Big| \sigma^2 I_{n_j} \Big| = \sigma^{2n_j} \Big| T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big| |T|
+
+where: :math:`V^{-1}_j = \frac{1}{\sigma^2} I_{n_j} - \frac{1}{\sigma^4} Z_j \Big( T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big)^{-1} Z^T_j`
+
+:math:`(Y_j - X_j \theta_f)^T V_j^{-1} (Y_j - X_j \theta_f)` can be calculated as:
+
+.. math::
+
+   (Y_j - X_j \theta_f)^T V_j^{-1} (Y_j - X_j \theta_f) = \frac{1}{\sigma^2} (Y_j - X_j \theta_f)^T (Y_j - X_j \theta_f) - \frac{1}{\sigma^4} (Y_j - X_j \theta_f)^T Z_j (T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j)^{-1} Z^T_j (Y_j - X_j \theta_f)
+
+The final log-likelihood is:
+
+.. math::
+
+   ll(Y; \theta_f, \sigma^2, T) = - \frac{1}{2} n \log{(2 \pi)} - \frac{1}{2} \Big\{ \sum^J_{j=1} \big( \log{(|V_j|)} + \frac{1}{\sigma^2} (Y_j - X_j \theta_f)^T (Y_j - X_j \theta_f) \\ - \frac{1}{\sigma^4} (Y_j - X_j \theta_f)^T Z_j \big(T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \big)^{-1} Z^T_j (Y_j - X_j \theta_f) \big) \Big\}
+
+Examples
+--------
+
+The following are simple HGLM examples in Python and R.
+
+.. tabs::
+   .. 
code-tab:: python + + # Initialize H2O-3 and import the HGLM estimator: + import h2o + h2o.init() + from h2o.estimators import H2OHGLMEstimator as hglm + + # Import the Gaussian wintercept dataset: + h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_wIntercept_standardize.gz") + + # Split the data into training and validation sets: + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + + # Define the predictors and response: + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + + # Set the random columns: + random_columns = ["C10","C20","C30"] + + # Build and train the model: + hglm_model = hglm(random_columns=random_columns, + group_column = "C1", + score_each_iteration=True, + seed=12345, + em_epsilon = 0.000005) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + + # Grab various metrics (model metrics, scoring history coefficients, etc.): + modelMetrics = hglm_model.training_model_metrics() + scoring_history = hglm_model.scoring_history(as_data_frame=False) + scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False) + model_summary = hglm_model.summary() + coef = hglm_model.coef() + coef_norm = hglm_model.coef_norm() + coef_names = hglm_model.coef_names() + coef_random = hglm_model.coefs_random() + coef_random_names = hglm_model.coefs_random_names() + coef_random_norm = hglm_model.coefs_random_norm() + coef_random_names_norm = hglm_model.coefs_random_names_norm() + t_mat = hglm_model.matrix_T() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + icc = hglm_model.icc() + + .. 
code-tab:: r R
+
+      # Initialize H2O-3:
+      library(h2o)
+      h2o.init()
+
+      # Import the Gaussian wintercept dataset:
+      h2odata <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_p5oise_p08T_wIntercept_standardize.gz")
+
+      # Set the predictors and response:
+      yresp <- "response"
+      predictor <- c("C2", "C3", "C4", "C5")
+
+      # Set the random and group columns:
+      random_columns <- c("C2", "C3", "C4", "C5")
+      group_column <- "C1"
+
+      # Build and train the model:
+      hglm_model <- h2o.hglm(x = predictor,
+                             y = yresp,
+                             training_frame = h2odata,
+                             group_column = group_column,
+                             random_columns = random_columns,
+                             seed = 12345,
+                             max_iterations = 10,
+                             em_epsilon = 0.0000001,
+                             random_intercept = TRUE)
+
+      # Retrieve the coefficients:
+      coeff <- h2o.coef(hglm_model)
+
+References
+----------
+
+[1] David Ruppert, M. P. Wand, and R. J. Carroll, Semiparametric Regression, Chapter 4, Cambridge University Press, 2003.
+
+[2] Stephen W. Raudenbush, Anthony S. Bryk, Hierarchical Linear Models: Applications and Data Analysis Methods, Second Edition, Sage Publications, 2002.
+
+[3] Rao, C. R. (1973). Linear Statistical Inference and Its Applications. New York: Wiley.
+
+[4] Dempster, A. P., Laird, N. M., & Rubin, D. B. (1977). Maximum likelihood from incomplete data via the EM algorithm. Journal of the Royal Statistical Society, Series B, 39, 1-38.
+
+[5] Matrix determinant lemma: https://en.wikipedia.org/wiki/Matrix_determinant_lemma.
+
+[6] Woodbury matrix identity: https://en.wikipedia.org/wiki/Woodbury_matrix_identity.
\ No newline at end of file diff --git a/h2o-docs/src/product/images/HGLM.png b/h2o-docs/src/product/images/HGLM.png new file mode 100644 index 000000000000..a21e438b9172 Binary files /dev/null and b/h2o-docs/src/product/images/HGLM.png differ diff --git a/h2o-docs/src/product/parameters.rst b/h2o-docs/src/product/parameters.rst index e785975a42fc..edd607a41b6d 100644 --- a/h2o-docs/src/product/parameters.rst +++ b/h2o-docs/src/product/parameters.rst @@ -53,7 +53,6 @@ This Appendix provides detailed descriptions of parameters that can be specified data-science/algo-params/fold_column data-science/algo-params/gainslift_bins data-science/algo-params/gradient_epsilon - data-science/algo-params/hglm data-science/algo-params/histogram_type data-science/algo-params/huber_alpha data-science/algo-params/ignore_const_cols