Skip to content

Commit

Permalink
Implement possibility to have a custom weights column and ensure that…
Browse files Browse the repository at this point in the history
… the created weights column will always be the one we use
  • Loading branch information
valenad1 committed Sep 14, 2023
1 parent df188df commit 814c847
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 13 deletions.
35 changes: 24 additions & 11 deletions h2o-algos/src/main/java/hex/adaboost/AdaBoost.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import hex.glm.GLMModel;
import hex.tree.drf.DRF;
import hex.tree.drf.DRFModel;
import hex.tree.dt.DTModel;
import org.apache.log4j.Logger;
import water.*;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
Expand All @@ -27,6 +26,7 @@ public class AdaBoost extends ModelBuilder<AdaBoostModel, AdaBoostModel.AdaBoost
private static final Logger LOG = Logger.getLogger(AdaBoost.class);

private AdaBoostModel _model;
private String _weightsName = "weights";

// Called from an http request
public AdaBoost(AdaBoostModel.AdaBoostParameters parms) {
Expand Down Expand Up @@ -55,6 +55,10 @@ public void init(boolean expensive) {
if (_parms._weak_learner == AdaBoostModel.Algorithm.AUTO) {
_parms._weak_learner = AdaBoostModel.Algorithm.DRF;
}
if (_parms._weights_column != null) {
// _parms._weights_column cannot be used all the time since it breaks scoring
_weightsName = _parms._weights_column;
}
}
}

Expand Down Expand Up @@ -82,12 +86,19 @@ public void computeImpl() {
private void buildAdaboost() {
_model._output.alphas = new double[(int)_parms._n_estimators];
_model._output.models = new Key[(int)_parms._n_estimators];

Frame _trainWithWeights = new Frame(train());
Vec weights = _trainWithWeights.anyVec().makeCons(1,1,null,null)[0];
_trainWithWeights.add("weights", weights);
DKV.put(_trainWithWeights);
Scope.track(weights);

Frame _trainWithWeights;
if (_parms._weights_column == null) {
_trainWithWeights = new Frame(train());
Vec weights = _trainWithWeights.anyVec().makeCons(1,1,null,null)[0];
_weightsName = _trainWithWeights.uniquify(_weightsName); // be sure that we are not accidentally using some column in the train
_trainWithWeights.add(_weightsName, weights);
DKV.put(_trainWithWeights);
Scope.track(weights);
_weightsName = _trainWithWeights.lastVecName();
} else {
_trainWithWeights = _parms.train();
}

for (int n = 0; n < _parms._n_estimators; n++) {
ModelBuilder job = chooseWeakLearner(_trainWithWeights);
Expand All @@ -99,17 +110,19 @@ private void buildAdaboost() {
Frame score = model.score(_trainWithWeights);
Scope.track(score);

CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict"));
CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict"));
double e_m = countWe.We / countWe.W;
double alpha_m = _parms._learning_rate * Math.log((1 - e_m) / e_m);
_model._output.alphas[n] = alpha_m;

UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha_m);
updateWeightsTask.doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict"));
updateWeightsTask.doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict"));
_job.update(1);
_model.update(_job);
}
DKV.remove(_trainWithWeights._key);
if (_trainWithWeights != _parms.train()) {
DKV.remove(_trainWithWeights._key);
}
_model._output._model_summary = createModelSummaryTable();
}
}
Expand Down Expand Up @@ -153,7 +166,7 @@ private DRF getDRFWeakLearner(Frame frame) {
parms._response_column = _parms._response_column;
parms._mtries = 1;
parms._min_rows = 1;
parms._weights_column = "weights";
parms._weights_column = _weightsName;
parms._ntrees = 1;
parms._sample_rate = 1;
parms._max_depth = 1;
Expand Down
123 changes: 121 additions & 2 deletions h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@
import water.runner.CloudSize;
import water.runner.H2ORunner;
import water.util.FrameUtils;
import water.util.VecUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import static org.junit.Assert.*;

@CloudSize(1)
@RunWith(H2ORunner.class)
public class AdaBoostTest extends TestUtil {

public boolean print = true;
public boolean print = false;

@Rule
public EnvironmentVariables environmentVariables = new EnvironmentVariables();
Expand Down Expand Up @@ -355,4 +355,123 @@ public void testUpdateWeights() {
Scope.exit();
}
}

/**
 * Trains a reference AdaBoost model without a weights column, then appends a column named
 * "weights" to the same training frame, passes it via {@code _weights_column} and trains a
 * second model with otherwise identical parameters.
 *
 * Asserts that both models score the frame identically and that the supplied weights vec is
 * no longer constant after the second training.
 */
@Test
public void testBasicTrainAndScoreWithExternalWeightsColumn() {
try {
// Train reference model
Scope.enter();
Frame train = parseTestFile("smalldata/prostate/prostate.csv");
String response = "CAPSULE";
train.toCategoricalCol(response);
Scope.track(train);
AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
p._train = train._key;
p._seed = 0xDECAF;
p._n_estimators = 10;
p._response_column = response;

AdaBoost adaBoostReference = new AdaBoost(p);
AdaBoostModel adaBoostReferenceModel = adaBoostReference.trainModel().get();
Scope.track_generic(adaBoostReferenceModel);
assertNotNull(adaBoostReferenceModel);

// Add weights column to frame and train different model
// NOTE(review): makeCons(1,1,null,null) presumably yields a single constant-1 vec - confirm against Vec.makeCons
Vec weights = train.anyVec().makeCons(1,1,null,null)[0];
train.add("weights", weights);
// Publish the modified frame so the builder, which looks it up by key, sees the new column
DKV.put(train);
Scope.track(train);
p._weights_column = "weights";

AdaBoost adaBoostWithExternalWeights = new AdaBoost(p);
AdaBoostModel adaBoostModelWithExternalWeights = adaBoostWithExternalWeights.trainModel().get();
Scope.track_generic(adaBoostModelWithExternalWeights);
assertNotNull(adaBoostModelWithExternalWeights);

// Score both models on the same frame and check that the output is identical
Frame scoreReference = adaBoostReferenceModel.score(train);
Scope.track(scoreReference);
Frame scoreWithExternalWeights = adaBoostModelWithExternalWeights.score(train);
Scope.track(scoreWithExternalWeights);
assertFrameEquals(scoreReference, scoreWithExternalWeights, 0); // output should be identical
// The user-supplied weights vec is expected to have been updated by the training
assertFalse("Weights column should be change in the training", weights.isConst());
} finally {
Scope.exit();
}
}

/**
 * Trains a reference AdaBoost model without a weights column, then trains a second model that
 * uses the existing "RACE" column of the training frame as {@code _weights_column}.
 *
 * Asserts that the two models produce different "predict" columns and that the maximum of
 * the "RACE" column differs from the value recorded before the second training.
 */
@Test
public void testBasicTrainAndScoreWithCustomWeightsColumn() {
try {
// Train reference model
Scope.enter();
Frame train = parseTestFile("smalldata/prostate/prostate.csv");
String response = "CAPSULE";
train.toCategoricalCol(response);
Scope.track(train);
AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
p._train = train._key;
p._seed = 0xDECAF;
p._n_estimators = 10;
p._response_column = response;

AdaBoost adaBoostReference = new AdaBoost(p);
AdaBoostModel adaBoostReferenceModel = adaBoostReference.trainModel().get();
Scope.track_generic(adaBoostReferenceModel);
assertNotNull(adaBoostReferenceModel);

// Set custom weights column
p._weights_column = "RACE";
// Remember the column maximum before training so a later change can be detected
double maxReference = train.vec("RACE").max(); // for future assert
AdaBoost adaBoostWithExternalWeights = new AdaBoost(p);
AdaBoostModel adaBoostModelWithExternalWeights = adaBoostWithExternalWeights.trainModel().get();
Scope.track_generic(adaBoostModelWithExternalWeights);
assertNotNull(adaBoostModelWithExternalWeights);

// Score both models on the same frame and check that the predictions differ
Frame scoreReference = adaBoostReferenceModel.score(train);
Scope.track(scoreReference);
Frame scoreWithExternalWeights = adaBoostModelWithExternalWeights.score(train);
Scope.track(scoreWithExternalWeights);
// output should be different since the weights are intentionally not initialized
assertFalse(Arrays.equals(FrameUtils.asDoubles(scoreReference.vec("predict")), FrameUtils.asDoubles(scoreWithExternalWeights.vec("predict"))));
assertNotEquals("RACE column should be changed in the training", maxReference, train.vec("RACE").max());
} finally {
Scope.exit();
}
}

/**
 * Trains AdaBoost on a frame that already contains a column named "weights" while leaving
 * {@code _weights_column} unset.
 *
 * Asserts that training succeeds, that the model can score the frame, and that the
 * pre-existing "weights" vec is still constant afterwards.
 */
@Test
public void testBasicTrainAndScoreWithDuplicatedWeightsColumn() {
try {
Scope.enter();
Frame train = parseTestFile("smalldata/prostate/prostate.csv");
// Add weights column to frame
// NOTE(review): makeCons(1,1,null,null) presumably yields a single constant-1 vec - confirm against Vec.makeCons
Vec weights = train.anyVec().makeCons(1,1,null,null)[0];
train.add("weights", weights);
DKV.put(train);
String response = "CAPSULE";
train.toCategoricalCol(response);
Scope.track(train);

AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
p._train = train._key;
p._seed = 0xDECAF;
p._n_estimators = 10;
p._response_column = response;
// NOTE(review): presumably keeps the constant "weights" column in the training data so the name clash is exercised - confirm
p._ignore_const_cols = false;

AdaBoost adaBoost = new AdaBoost(p);
AdaBoostModel adaBoostModel = adaBoost.trainModel().get();
Scope.track_generic(adaBoostModel);
assertNotNull(adaBoostModel);

// Score the frame, then check that the pre-existing "weights" column was left untouched
Frame score = adaBoostModel.score(train);
Scope.track(score);
assertTrue("Weights column should not be changed in the training", weights.isConst());
} finally {
Scope.exit();
}
}
}

0 comments on commit 814c847

Please sign in to comment.