Expose weights column
valenad1 committed Sep 14, 2023
1 parent beeeaf6 commit 7e1e286
Showing 3 changed files with 46 additions and 1 deletion.
3 changes: 2 additions & 1 deletion h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java
@@ -17,8 +17,9 @@ public static final class AdaBoostParametersV3 extends ModelParametersSchemaV3<A
"ignored_columns",
"ignore_const_cols",
"categorical_encoding",
"weights_column",

// Extended Isolation Forest specific
// AdaBoost specific
"n_estimators",
"weak_learner",
"learning_rate",
31 changes: 31 additions & 0 deletions h2o-py/h2o/estimators/adaboost.py
@@ -27,6 +27,7 @@ def __init__(self,
ignored_columns=None, # type: Optional[List[str]]
ignore_const_cols=True, # type: bool
categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
weights_column=None, # type: Optional[str]
n_estimators=50, # type: int
weak_learner="auto", # type: Literal["auto", "drf", "glm"]
learning_rate=0.5, # type: float
@@ -49,6 +50,15 @@ def __init__(self,
Defaults to ``"auto"``.
:type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]
:param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
not increase the size of the data frame. This is typically the number of times a row is repeated, but
non-integer values are supported as well. During training, rows with higher weights matter more, due to
the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
Defaults to ``None``.
:type weights_column: str, optional
:param n_estimators: Number of AdaBoost weak learners.
Defaults to ``50``.
:type n_estimators: int
@@ -69,6 +79,7 @@ def __init__(self,
self.ignored_columns = ignored_columns
self.ignore_const_cols = ignore_const_cols
self.categorical_encoding = categorical_encoding
self.weights_column = weights_column
self.n_estimators = n_estimators
self.weak_learner = weak_learner
self.learning_rate = learning_rate
@@ -130,6 +141,26 @@ def categorical_encoding(self, categorical_encoding):
assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
self._parms["categorical_encoding"] = categorical_encoding

@property
def weights_column(self):
"""
Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
accurate prediction, remove all rows with weight == 0.
Type: ``str``.
"""
return self._parms.get("weights_column")

@weights_column.setter
def weights_column(self, weights_column):
assert_is_type(weights_column, None, str)
self._parms["weights_column"] = weights_column

@property
def n_estimators(self):
"""
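The Python changes add weights_column both as a constructor argument and as a property backed by self._parms. A minimal usage sketch of the newly exposed parameter, assuming a running local H2O cluster; the dataset URL and the column names CAPSULE and AGE refer to H2O's public prostate demo data and are illustrative, not part of this commit:

import h2o
from h2o.estimators import H2OAdaBoostEstimator

h2o.init()

# Public H2O demo dataset; any binomial training frame works the same way.
prostate = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

# A comparison yields a numeric 0/1 column, so rows with AGE <= 60 get
# weight 0 and are effectively excluded from training.
prostate["weight"] = prostate["AGE"] > 60

model = H2OAdaBoostEstimator(n_estimators=50,
                             weak_learner="drf",
                             learning_rate=0.5,
                             weights_column="weight")
model.train(y="CAPSULE", training_frame=prostate)

As with other H2O estimators, the column named in weights_column is read from the training frame and is not used as a predictor.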
13 changes: 13 additions & 0 deletions h2o-r/h2o-package/R/adaboost.R
@@ -17,6 +17,13 @@
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param categorical_encoding Encoding scheme for categorical features. Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit",
#'        "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO.
#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
#' the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
#' weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
#' data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
#' well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
#' you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
#' an accurate prediction, remove all rows with weight == 0.
#' @param n_estimators Number of AdaBoost weak learners. Defaults to 50.
#' @param weak_learner Weak learner. Must be one of: "AUTO", "DRF", "GLM". Defaults to AUTO.
#' @param learning_rate Learning rate. Defaults to 0.5.
@@ -48,6 +55,7 @@ h2o.adaBoost <- function(x,
model_id = NULL,
ignore_const_cols = TRUE,
categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
weights_column = NULL,
n_estimators = 50,
weak_learner = c("AUTO", "DRF", "GLM"),
learning_rate = 0.5,
@@ -79,6 +87,8 @@
parms$ignore_const_cols <- ignore_const_cols
if (!missing(categorical_encoding))
parms$categorical_encoding <- categorical_encoding
if (!missing(weights_column))
parms$weights_column <- weights_column
if (!missing(n_estimators))
parms$n_estimators <- n_estimators
if (!missing(weak_learner))
@@ -97,6 +107,7 @@ h2o.adaBoost <- function(x,
training_frame,
ignore_const_cols = TRUE,
categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
weights_column = NULL,
n_estimators = 50,
weak_learner = c("AUTO", "DRF", "GLM"),
learning_rate = 0.5,
@@ -133,6 +144,8 @@
parms$ignore_const_cols <- ignore_const_cols
if (!missing(categorical_encoding))
parms$categorical_encoding <- categorical_encoding
if (!missing(weights_column))
parms$weights_column <- weights_column
if (!missing(n_estimators))
parms$n_estimators <- n_estimators
if (!missing(weak_learner))
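All three bindings document the same caveat: rows given weight == 0 still appear in the returned prediction frame, but their predictions are not meaningful. A short sketch of the documented workaround, continuing from the Python example above; the filtering is plain h2o-py frame slicing, not part of this commit:

# Score only the rows that actually carried weight during training;
# per the docstring, predictions at weight == 0 rows are incorrect.
scored = prostate[prostate["weight"] > 0, :]
predictions = model.predict(scored)
print(predictions.head())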
