diff --git a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java index be7e255f35d3..b31bbcc39834 100644 --- a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java +++ b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java @@ -38,7 +38,8 @@ public void registerEndPoints(RestApiContext context) { new hex.tree.uplift.UpliftDRF (true), new hex.modelselection.ModelSelection (true), new hex.isotonic .IsotonicRegression(true), - new hex.tree.dt .DT (true) + new hex.tree.dt .DT (true), + new hex.adaboost. AdaBoost (true) }; // "Word2Vec", "Example", "Grep" diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java new file mode 100644 index 000000000000..9229af5dc9ab --- /dev/null +++ b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java @@ -0,0 +1,30 @@ +package hex.schemas; + +import hex.adaboost.AdaBoostModel; +import water.api.schemas3.ModelOutputSchemaV3; +import water.api.schemas3.ModelSchemaV3; + +public class AdaBoostModelV3 extends ModelSchemaV3 { + + public static final class AdaBoostModelOutputV3 extends ModelOutputSchemaV3 { + // nothing + } + + public AdaBoostV3.AdaBoostParametersV3 createParametersSchema() { return new AdaBoostV3.AdaBoostParametersV3(); } + public AdaBoostModelOutputV3 createOutputSchema() { return new AdaBoostModelOutputV3(); } + + //========================== + // Custom adapters go here + + // Version&Schema-specific filling into the impl + @Override public AdaBoostModel createImpl() { + AdaBoostV3.AdaBoostParametersV3 p = this.parameters; + AdaBoostModel.AdaBoostParameters parms = p.createImpl(); + return new AdaBoostModel( model_id.key(), parms, new AdaBoostModel.AdaBoostOutput(null) ); + } +} diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java new file mode 100644 index 000000000000..1a1edb52189c --- /dev/null +++ 
b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java @@ -0,0 +1,41 @@ +package hex.schemas; + +import hex.adaboost.AdaBoost; +import hex.adaboost.AdaBoostModel; +import water.api.API; +import water.api.schemas3.ModelParametersSchemaV3; + +public class AdaBoostV3 extends ModelBuilderSchema< + AdaBoost, + AdaBoostV3, + AdaBoostV3.AdaBoostParametersV3> { + + public static final class AdaBoostParametersV3 extends ModelParametersSchemaV3 { + static public String[] fields = new String[]{ + "model_id", + "training_frame", + "ignored_columns", + "ignore_const_cols", + "categorical_encoding", + "weights_column", + + // AdaBoost specific + "nlearners", + "weak_learner", + "learn_rate", + "seed", + }; + + @API(help = "Number of AdaBoost weak learners.", gridable = true) + public int nlearners; + + @API(help = "Choose a weak learner type. Defaults to AUTO, which means DRF.", gridable = true, values = {"AUTO", "DRF", "GLM", "GBM"}) + public AdaBoostModel.Algorithm weak_learner; + + @API(help="Learning rate (from 0.0 to 1.0)", gridable = true) + public double learn_rate; + + @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true) + public long seed; + } +} diff --git a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema index b23d07eec489..18c3ea3dc972 100644 --- a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema +++ b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema @@ -114,3 +114,7 @@ hex.schemas.UpliftDRFModelV3 hex.schemas.UpliftDRFModelV3$UpliftDRFModelOutputV3 hex.schemas.UpliftDRFV3 hex.schemas.UpliftDRFV3$UpliftDRFParametersV3 +hex.schemas.AdaBoostModelV3 +hex.schemas.AdaBoostModelV3$AdaBoostModelOutputV3 +hex.schemas.AdaBoostV3 +hex.schemas.AdaBoostV3$AdaBoostParametersV3 diff --git a/h2o-bindings/bin/custom/R/gen_adaboost.py b/h2o-bindings/bin/custom/R/gen_adaboost.py new file mode 100644 index 000000000000..7dbf86f2a3fa --- 
/dev/null +++ b/h2o-bindings/bin/custom/R/gen_adaboost.py @@ -0,0 +1,41 @@ +extensions = dict( + skip_default_set_params_for=['training_frame', 'ignored_columns', 'response_column', + 'max_confusion_matrix_size', 'distribution', 'offset_column'], + set_required_params=""" +parms$training_frame <- training_frame +args <- .verify_dataxy(training_frame, x, y) +parms$ignored_columns <- args$x_ignore +parms$response_column <- args$y +""", +) + + +doc = dict( + preamble=""" +Build an AdaBoost model + +Builds an AdaBoost model on an H2OFrame. +""", + returns=""" +Creates a \linkS4class{H2OModel} object of the right type. +""", + seealso=""" +\code{\link{predict.H2OModel}} for prediction +""", + examples=""" +library(h2o) +h2o.init() + +# Import the prostate dataset +f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" +data <- h2o.importFile(f) + +# Set predictors and response; set response as a factor +data["CAPSULE"] <- as.factor(data["CAPSULE"]) +predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") +response <- "CAPSULE" + +# Train the AdaBoost model +h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) +""" +) diff --git a/h2o-bindings/bin/custom/python/gen_adaboost.py b/h2o-bindings/bin/custom/python/gen_adaboost.py new file mode 100644 index 000000000000..646c8e820871 --- /dev/null +++ b/h2o-bindings/bin/custom/python/gen_adaboost.py @@ -0,0 +1,8 @@ +options = dict( +) + +doc = dict( + __class__=""" +Builds an AdaBoost model +""" +) diff --git a/h2o-bindings/bin/gen_R.py b/h2o-bindings/bin/gen_R.py index 14a895e584b1..b99875a2a1f4 100644 --- a/h2o-bindings/bin/gen_R.py +++ b/h2o-bindings/bin/gen_R.py @@ -272,6 +272,7 @@ def algo_to_modelname(algo): if algo == "gam": return "Generalized Additive Model" if algo == "modelselection": return "Model Selection" if algo == "infogram": return "Infogram" + if algo == "adaboost": return "AdaBoost Model" return algo @@ -347,6 +348,7 @@ def 
main(): if name == "stackedensemble": module = "stackedEnsemble" if name == "pca": module = "prcomp" if name == "modelselection": module = "modelSelection" + if name == "adaboost": module = "adaBoost" bi.vprint("Generating model: " + name) bi.write_to_file("%s.R" % file_name, gen_module(mb, name, module)) diff --git a/h2o-bindings/bin/gen_python.py b/h2o-bindings/bin/gen_python.py index 98fd3db325b8..6fa044f77d0e 100755 --- a/h2o-bindings/bin/gen_python.py +++ b/h2o-bindings/bin/gen_python.py @@ -351,6 +351,7 @@ def algo_to_classname(algo): if algo == "rulefit": return "H2ORuleFitEstimator" if algo == "modelselection": return "H2OModelSelectionEstimator" if algo == "isotonicregression": return "H2OIsotonicRegressionEstimator" + if algo == "adaboost": return "H2OAdaBoostEstimator" return "H2O" + algo.capitalize() + "Estimator" diff --git a/h2o-py/docs/modeling.rst b/h2o-py/docs/modeling.rst index 89b4f9cfa408..9ceecb83b361 100644 --- a/h2o-py/docs/modeling.rst +++ b/h2o-py/docs/modeling.rst @@ -8,6 +8,12 @@ Modeling In H2O Supervised ++++++++++ +:mod:`H2OAdaBoostEstimator` +--------------------------- +.. autoclass:: h2o.estimators.adaboost.H2OAdaBoostEstimator + :show-inheritance: + :members: + :mod:`H2OANOVAGLMEstimator` --------------------------- .. 
autoclass:: h2o.estimators.anovaglm.H2OANOVAGLMEstimator diff --git a/h2o-py/h2o/estimators/__init__.py b/h2o-py/h2o/estimators/__init__.py index d261ff829f13..766e1678b950 100644 --- a/h2o-py/h2o/estimators/__init__.py +++ b/h2o-py/h2o/estimators/__init__.py @@ -7,6 +7,7 @@ import inspect import sys +from .adaboost import H2OAdaBoostEstimator from .aggregator import H2OAggregatorEstimator from .anovaglm import H2OANOVAGLMEstimator from .coxph import H2OCoxProportionalHazardsEstimator @@ -60,12 +61,13 @@ def create_estimator(algo, **params): __all__ = ( "create_estimator", - "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", "H2ODecisionTreeEstimator", - "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", "H2OExtendedIsolationForestEstimator", - "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", "H2OGenericEstimator", - "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", "H2OIsolationForestEstimator", - "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", "H2ONaiveBayesEstimator", - "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", "H2ORandomForestEstimator", - "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", "H2OSingularValueDecompositionEstimator", - "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", "H2OWord2vecEstimator", "H2OXGBoostEstimator" + "H2OAdaBoostEstimator", "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", + "H2ODecisionTreeEstimator", "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", + "H2OExtendedIsolationForestEstimator", "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", + "H2OGenericEstimator", "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", + "H2OIsolationForestEstimator", "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", + 
"H2ONaiveBayesEstimator", "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", + "H2ORandomForestEstimator", "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", + "H2OSingularValueDecompositionEstimator", "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", + "H2OWord2vecEstimator", "H2OXGBoostEstimator" ) diff --git a/h2o-py/h2o/estimators/adaboost.py b/h2o-py/h2o/estimators/adaboost.py new file mode 100644 index 000000000000..09495202531c --- /dev/null +++ b/h2o-py/h2o/estimators/adaboost.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +# + +from h2o.estimators.estimator_base import H2OEstimator +from h2o.exceptions import H2OValueError +from h2o.frame import H2OFrame +from h2o.utils.typechecks import assert_is_type, Enum, numeric + + +class H2OAdaBoostEstimator(H2OEstimator): + """ + AdaBoost + + Builds an AdaBoost model + """ + + algo = "adaboost" + supervised_learning = True + + def __init__(self, + model_id=None, # type: Optional[Union[None, str, H2OEstimator]] + training_frame=None, # type: Optional[Union[None, str, H2OFrame]] + ignored_columns=None, # type: Optional[List[str]] + ignore_const_cols=True, # type: bool + categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] + weights_column=None, # type: Optional[str] + nlearners=50, # type: int + weak_learner="auto", # type: Literal["auto", "drf", "glm", "gbm"] + learn_rate=0.5, # type: float + seed=-1, # type: int + ): + """ + :param model_id: Destination id for this model; auto-generated if not specified. + Defaults to ``None``. + :type model_id: Union[None, str, H2OEstimator], optional + :param training_frame: Id of the training data frame. + Defaults to ``None``. 
+ :type training_frame: Union[None, str, H2OFrame], optional + :param ignored_columns: Names of columns to ignore for training. + Defaults to ``None``. + :type ignored_columns: List[str], optional + :param ignore_const_cols: Ignore constant columns. + Defaults to ``True``. + :type ignore_const_cols: bool + :param categorical_encoding: Encoding scheme for categorical features + Defaults to ``"auto"``. + :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"] + :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent + to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating + that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do + not increase the size of the data frame. This is typically the number of times a row is repeated, but + non-integer values are supported as well. During training, rows with higher weights matter more, due to + the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at + that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. + Defaults to ``None``. + :type weights_column: str, optional + :param nlearners: Number of AdaBoost weak learners. + Defaults to ``50``. + :type nlearners: int + :param weak_learner: Choose a weak learner type. Defaults to AUTO, which means DRF. + Defaults to ``"auto"``. + :type weak_learner: Literal["auto", "drf", "glm", "gbm"] + :param learn_rate: Learning rate (from 0.0 to 1.0) + Defaults to ``0.5``. + :type learn_rate: float + :param seed: Seed for pseudo random number generator (if applicable) + Defaults to ``-1``. 
+ :type seed: int + """ + super(H2OAdaBoostEstimator, self).__init__() + self._parms = {} + self._id = self._parms['model_id'] = model_id + self.training_frame = training_frame + self.ignored_columns = ignored_columns + self.ignore_const_cols = ignore_const_cols + self.categorical_encoding = categorical_encoding + self.weights_column = weights_column + self.nlearners = nlearners + self.weak_learner = weak_learner + self.learn_rate = learn_rate + self.seed = seed + + @property + def training_frame(self): + """ + Id of the training data frame. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("training_frame") + + @training_frame.setter + def training_frame(self, training_frame): + self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') + + @property + def ignored_columns(self): + """ + Names of columns to ignore for training. + + Type: ``List[str]``. + """ + return self._parms.get("ignored_columns") + + @ignored_columns.setter + def ignored_columns(self, ignored_columns): + assert_is_type(ignored_columns, None, [str]) + self._parms["ignored_columns"] = ignored_columns + + @property + def ignore_const_cols(self): + """ + Ignore constant columns. + + Type: ``bool``, defaults to ``True``. + """ + return self._parms.get("ignore_const_cols") + + @ignore_const_cols.setter + def ignore_const_cols(self, ignore_const_cols): + assert_is_type(ignore_const_cols, None, bool) + self._parms["ignore_const_cols"] = ignore_const_cols + + @property + def categorical_encoding(self): + """ + Encoding scheme for categorical features + + Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. 
+ """ + return self._parms.get("categorical_encoding") + + @categorical_encoding.setter + def categorical_encoding(self, categorical_encoding): + assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) + self._parms["categorical_encoding"] = categorical_encoding + + @property + def weights_column(self): + """ + Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the + dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative + weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data + frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. + During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set + weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an + accurate prediction, remove all rows with weight == 0. + + Type: ``str``. + """ + return self._parms.get("weights_column") + + @weights_column.setter + def weights_column(self, weights_column): + assert_is_type(weights_column, None, str) + self._parms["weights_column"] = weights_column + + @property + def nlearners(self): + """ + Number of AdaBoost weak learners. + + Type: ``int``, defaults to ``50``. + """ + return self._parms.get("nlearners") + + @nlearners.setter + def nlearners(self, nlearners): + assert_is_type(nlearners, None, int) + self._parms["nlearners"] = nlearners + + @property + def weak_learner(self): + """ + Choose a weak learner type. Defaults to AUTO, which means DRF. + + Type: ``Literal["auto", "drf", "glm", "gbm"]``, defaults to ``"auto"``. 
+ """ + return self._parms.get("weak_learner") + + @weak_learner.setter + def weak_learner(self, weak_learner): + assert_is_type(weak_learner, None, Enum("auto", "drf", "glm", "gbm")) + self._parms["weak_learner"] = weak_learner + + @property + def learn_rate(self): + """ + Learning rate (from 0.0 to 1.0) + + Type: ``float``, defaults to ``0.5``. + """ + return self._parms.get("learn_rate") + + @learn_rate.setter + def learn_rate(self, learn_rate): + assert_is_type(learn_rate, None, numeric) + self._parms["learn_rate"] = learn_rate + + @property + def seed(self): + """ + Seed for pseudo random number generator (if applicable) + + Type: ``int``, defaults to ``-1``. + """ + return self._parms.get("seed") + + @seed.setter + def seed(self, seed): + assert_is_type(seed, None, int) + self._parms["seed"] = seed + + diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py new file mode 100644 index 000000000000..9de757289e05 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py @@ -0,0 +1,38 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaBoost_save_and_load(): + print("AdaBoost Save Load Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=7, seed=12) + adaboost_model.train(training_frame=train, y="CAPSULE") + predict = adaboost_model.predict(train) + + path = pyunit_utils.locate("results") + + assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) + model_path = h2o.save_model(adaboost_model, path=path, force=True) + + assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) + reloaded = 
h2o.load_model(model_path) + predict_reloaded = reloaded.predict(train) + + assert isinstance(reloaded, + H2OAdaBoostEstimator), \ + "Expected an H2OAdaBoostEstimator, but got {0}"\ + .format(reloaded) + + assert pyunit_utils.compare_frames_local(predict, predict_reloaded, returnResult=True) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaBoost_save_and_load) +else: + adaBoost_save_and_load() diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py new file mode 100644 index 000000000000..5e52a11f9859 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py @@ -0,0 +1,31 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaboost(): + print("AdaBoost Smoke Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=55, seed=0xBEEF, weak_learner="GLM", learn_rate=0.6) + adaboost_model.train(training_frame=train, y="CAPSULE") + predict = adaboost_model.predict(train) + + print("") + print(adaboost_model) + print("") + print(predict) + + assert 55 == adaboost_model._model_json["output"]["model_summary"]["number_of_weak_learners"][0], "Python API is not working!" + assert "GLM" == adaboost_model._model_json["output"]["model_summary"]["weak_learner"][0], "Python API is not working!" + assert 0.6 == adaboost_model._model_json["output"]["model_summary"]["learn_rate"][0], "Python API is not working!" 
+ + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaboost) +else: + adaboost() diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py index c584ea25d6d1..c38651b8b69c 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py @@ -199,6 +199,7 @@ def make_tests(classifier): 'H2OWord2vecEstimator', # needs a separate test (requires pre_trained model as parameter) 'H2OUpliftRandomForestEstimator', # generic part is not implemented yet 'H2ODecisionTreeEstimator', # generic part is not implemented yet + 'H2OAdaBoostEstimator', # generic part is not implemented yet or test needs to be adjusted just for classification ] estimators = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Estimator') and name not in ['H2OAutoMLEstimator'] + failing] diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py index 0debe9691020..839733595178 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py @@ -138,7 +138,8 @@ def make_tests(classifier): 'H2OCoxProportionalHazardsRegressor', # doesn't support regression? 
'H2OStackedEnsembleRegressor', # needs a separate test (requires models as parameters), 'H2OUpliftRandomForestRegressor', # does not support regression yet - 'H2ODecisionTreeRegressor' # does not support regression yet + 'H2ODecisionTreeRegressor', # does not support regression yet + 'H2OAdaBoostRegressor' # does not support regression yet ] regressors = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Regressor') and name not in ['H2OAutoMLRegressor']+failing] diff --git a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py index 1fec73fc1a1e..13c93693ae8e 100644 --- a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py +++ b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py @@ -23,7 +23,7 @@ algos = ['coxph', 'kmeans', 'deeplearning', 'drf', 'glm', 'gbm', 'pca', 'naivebayes', 'glrm', 'svd', 'isotonicregression', 'psvm', 'aggregator', 'word2vec', 'stackedensemble', 'xgboost', 'isolationforest', 'gam', 'generic', 'targetencoder', 'rulefit', 'extendedisolationforest', 'anovaglm', 'modelselection', - 'upliftdrf', 'infogram', 'dt'] + 'upliftdrf', 'infogram', 'dt', 'adaboost'] algo_additional_default_params = { 'grep' : { 'regex' : '.*' }, 'kmeans' : { 'k' : 2 }, diff --git a/h2o-r/H2O_Load.R b/h2o-r/H2O_Load.R index 96058f24465e..74af068e4558 100755 --- a/h2o-r/H2O_Load.R +++ b/h2o-r/H2O_Load.R @@ -17,7 +17,8 @@ function() { "edicts.R", "coxph.R", "coxphutils.R", "glm.R", "gam.R", "glrm.R", "pca.R", "kmeans.R", "gbm.R", "deeplearning.R", "naivebayes.R", "randomforest.R", "svd.R", "locate.R", "predict.R", "rulefit.R", "isolationforest.R", "psvm.R", "tf-idf.R", "permutation_varimp.R", "extendedisolationforest.R", - "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R") + "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R", + "adaboost.R") 
require(jsonlite); require(RCurl) invisible(lapply(to_src,function(x){source(paste(FULL.PATH, x, sep = ""))})) } diff --git a/h2o-r/h2o-package/R/adaboost.R b/h2o-r/h2o-package/R/adaboost.R new file mode 100644 index 000000000000..526467977831 --- /dev/null +++ b/h2o-r/h2o-package/R/adaboost.R @@ -0,0 +1,170 @@ +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +#' +# -------------------------- AdaBoost Model -------------------------- # +#' +#' Build an AdaBoost model +#' +#' Builds an AdaBoost model on an H2OFrame. +#' +#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model. +#' If x is missing, then all columns except y are used. +#' @param y The name or column index of the response variable in the data. +#' The response must be either a numeric or a categorical/factor variable. +#' If the response is numeric, then a regression model will be trained, otherwise it will train a classification model. +#' @param training_frame Id of the training data frame. +#' @param model_id Destination id for this model; auto-generated if not specified. +#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE. +#' @param categorical_encoding Encoding scheme for categorical features Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit", +#' "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO. +#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from +#' the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative +#' weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the +#' data frame. 
This is typically the number of times a row is repeated, but non-integer values are supported as +#' well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If +#' you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get +#' an accurate prediction, remove all rows with weight == 0. +#' @param nlearners Number of AdaBoost weak learners. Defaults to 50. +#' @param weak_learner Choose a weak learner type. Defaults to AUTO, which means DRF. Must be one of: "AUTO", "DRF", "GLM", "GBM". +#' Defaults to AUTO. +#' @param learn_rate Learning rate (from 0.0 to 1.0) Defaults to 0.5. +#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default). +#' Defaults to -1 (time-based random number). +#' @return Creates a \linkS4class{H2OModel} object of the right type. +#' @seealso \code{\link{predict.H2OModel}} for prediction +#' @examples +#' \dontrun{ +#' library(h2o) +#' h2o.init() +#' +#' # Import the prostate dataset +#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" +#' data <- h2o.importFile(f) +#' +#' # Set predictors and response; set response as a factor +#' data["CAPSULE"] <- as.factor(data["CAPSULE"]) +#' predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") +#' response <- "CAPSULE" +#' +#' # Train the AdaBoost model +#' h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) +#' } +#' @export +h2o.adaBoost <- function(x, + y, + training_frame, + model_id = NULL, + ignore_const_cols = TRUE, + categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"), + weights_column = NULL, + nlearners = 50, + weak_learner = c("AUTO", "DRF", "GLM", "GBM"), + learn_rate = 0.5, + seed = -1) +{ + # Validate required training_frame 
first and other frame args: should be a valid key or an H2OFrame object + training_frame <- .validate.H2OFrame(training_frame, required=TRUE) + + # Validate other required args + # If x is missing, then assume user wants to use all columns as features. + if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(model_id)) + parms$model_id <- model_id + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(categorical_encoding)) + parms$categorical_encoding <- categorical_encoding + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(nlearners)) + parms$nlearners <- nlearners + if (!missing(weak_learner)) + parms$weak_learner <- weak_learner + if (!missing(learn_rate)) + parms$learn_rate <- learn_rate + if (!missing(seed)) + parms$seed <- seed + + # Error check and build model + model <- .h2o.modelJob('adaboost', parms, h2oRestApiVersion=3, verbose=FALSE) + return(model) +} +.h2o.train_segments_adaboost <- function(x, + y, + training_frame, + ignore_const_cols = TRUE, + categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"), + weights_column = NULL, + nlearners = 50, + weak_learner = c("AUTO", "DRF", "GLM", "GBM"), + learn_rate = 0.5, + seed = -1, + segment_columns = NULL, + segment_models_id = NULL, + parallelism = 1) +{ + # formally define variables that were excluded from function parameters + model_id <- NULL + verbose <- NULL + destination_key <- NULL + # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object + 
training_frame <- .validate.H2OFrame(training_frame, required=TRUE) + + # Validate other required args + # If x is missing, then assume user wants to use all columns as features. + if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(categorical_encoding)) + parms$categorical_encoding <- categorical_encoding + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(nlearners)) + parms$nlearners <- nlearners + if (!missing(weak_learner)) + parms$weak_learner <- weak_learner + if (!missing(learn_rate)) + parms$learn_rate <- learn_rate + if (!missing(seed)) + parms$seed <- seed + + # Build segment-models specific parameters + segment_parms <- list() + if (!missing(segment_columns)) + segment_parms$segment_columns <- segment_columns + if (!missing(segment_models_id)) + segment_parms$segment_models_id <- segment_models_id + segment_parms$parallelism <- parallelism + + # Error check and build segment models + segment_models <- .h2o.segmentModelsJob('adaboost', segment_parms, parms, h2oRestApiVersion=3) + return(segment_models) +} diff --git a/h2o-r/h2o-package/pkgdown/_pkgdown.yml b/h2o-r/h2o-package/pkgdown/_pkgdown.yml index 6e170838b4db..149fd0526bc7 100644 --- a/h2o-r/h2o-package/pkgdown/_pkgdown.yml +++ b/h2o-r/h2o-package/pkgdown/_pkgdown.yml @@ -38,6 +38,7 @@ reference: - h2o - h2o.abs - h2o.acos + - h2o.adaBoost - h2o.aggregated_frame - h2o.aggregator - h2o.aic diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R index 9dd64ce0e762..61e5d8e919a4 100755 --- a/h2o-r/scripts/h2o-r-test-setup.R 
+++ b/h2o-r/scripts/h2o-r-test-setup.R @@ -187,7 +187,7 @@ function() { "coxph.R", "coxphutils.R", "gbm.R", "glm.R", "gam.R", "anovaglm.R", "glrm.R", "kmeans.R", "deeplearning.R", "randomforest.R", "generic.R", "naivebayes.R", "pca.R", "svd.R", "locate.R", "grid.R", "word2vec.R", "w2vutils.R", "stackedensemble.R", "rulefit.R", "modelselection.R", "predict.R", "xgboost.R", "isolationforest.R", "psvm.R", "segment.R", "tf-idf.R", "explain.R", "permutation_varimp.R", "extendedisolationforest.R", - "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R") + "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R", "adaboost.R") src_path <- paste(h2oRDir,"h2o-package","R",sep=.Platform$file.sep) invisible(lapply(to_src,function(x){source(paste(src_path, x, sep = .Platform$file.sep))})) diff --git a/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R new file mode 100644 index 000000000000..ca59093b6fbb --- /dev/null +++ b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R @@ -0,0 +1,20 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") + + + +test.adaBoost.smoke <- function() { + f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" + data <- h2o.importFile(f) + + # Set predictors and response; set response as a factor + data["CAPSULE"] <- as.factor(data["CAPSULE"]) + predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") + response <- "CAPSULE" + + # Train the AdaBoost model + h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) + expect_equal(is.null(h2o_adaboost), FALSE) +} + +doTest("adaBoost: Smoke Test", test.adaBoost.smoke)