fixed remaining errors.
wendycwong committed Oct 7, 2024
1 parent d883ece commit 9ecc510
Showing 6 changed files with 77 additions and 52 deletions.
14 changes: 6 additions & 8 deletions h2o-algos/src/main/java/hex/hglm/HGLM.java
@@ -84,7 +84,6 @@ static class ScoringHistory {
public ArrayList<Integer> getScoringIters() { return _scoringIters;}
public ArrayList<Long> getScoringTimes() { return _scoringTimes;}
public ArrayList<Double> getLikelihoods() { return _logLikelihood;}
public ArrayList<Double> getLikelihoods2() { return _logLikelihood2;}
public ArrayList<Double> getRMSE() { return _rmse;}

public void addIterationScore(int iter, double loglikelihood, double loglikelihood2, double rmse) {
@@ -96,9 +95,9 @@ public void addIterationScore(int iter, double loglikelihood, double loglikelihood2, double rmse) {
}

public TwoDimTable to2dTable() {
String[] cnames = new String[]{"timestamp", "iterations", "loglikelihood", "loglikelihood2", "rmse"};
String[] ctypes = new String[]{"string", "int", "double", "double", "double"};
String[] cformats = new String[]{"%s", "%d", "%.5f", "%.5f", "%.5f"};
String[] cnames = new String[]{"timestamp", "iterations", "loglikelihood", "rmse"};
String[] ctypes = new String[]{"string", "int", "double", "double"};
String[] cformats = new String[]{"%s", "%d", "%.5f", "%.5f"};
int tableSize = _scoringIters.size();
TwoDimTable res = new TwoDimTable("Scoring History", "",
new String[tableSize], cnames, ctypes, cformats, "");
@@ -238,13 +237,12 @@ public void computeImpl() {
}

private TwoDimTable generateSummary(HGLMModel.HGLMModelOutput modelOutput) {
String[] names = new String[]{"iteration", "loglikelihood", "loglikelihood2"};
String[] types = new String[]{"int", "double", "double"};
String[] formats = new String[]{"%d", "%.5f", "%.5f"};
String[] names = new String[]{"iteration", "loglikelihood"};
String[] types = new String[]{"int", "double"};
String[] formats = new String[]{"%d", "%.5f"};
TwoDimTable summary = new TwoDimTable("HGLM Model", "summary", new String[]{""}, names, types, formats, "");
summary.set(0, 0, modelOutput._iterations);
summary.set(0, 1, modelOutput._log_likelihood);
summary.set(0, 2, modelOutput._log_likelihood2);
return summary;
}

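The scoring-history change above drops the loglikelihood2 column, so the table now tracks only timestamp, iteration, log-likelihood, and RMSE. Below is a minimal sketch of how such a four-column table can be filled, assuming the water.util.TwoDimTable constructor and set() calls already visible in the diff; the class and field names here are illustrative, not the actual HGLM internals.

import java.util.ArrayList;
import java.util.Date;
import water.util.TwoDimTable;

class ScoringHistorySketch {
  final ArrayList<Long> _scoringTimes = new ArrayList<>();
  final ArrayList<Integer> _scoringIters = new ArrayList<>();
  final ArrayList<Double> _logLikelihood = new ArrayList<>();
  final ArrayList<Double> _rmse = new ArrayList<>();

  TwoDimTable to2dTable() {
    // the loglikelihood2 column is gone; only the primary log-likelihood remains
    String[] cnames = new String[]{"timestamp", "iterations", "loglikelihood", "rmse"};
    String[] ctypes = new String[]{"string", "int", "double", "double"};
    String[] cformats = new String[]{"%s", "%d", "%.5f", "%.5f"};
    int tableSize = _scoringIters.size();
    TwoDimTable res = new TwoDimTable("Scoring History", "",
        new String[tableSize], cnames, ctypes, cformats, "");
    for (int row = 0; row < tableSize; row++) {
      int col = 0;
      res.set(row, col++, new Date(_scoringTimes.get(row)).toString());
      res.set(row, col++, _scoringIters.get(row));
      res.set(row, col++, _logLikelihood.get(row));
      res.set(row, col++, _rmse.get(row));
    }
    return res;
  }
}

The table size is taken from _scoringIters, so all four lists are expected to grow in lockstep each time addIterationScore is called.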
2 changes: 1 addition & 1 deletion h2o-algos/src/main/java/hex/hglm/HGLMUtils.java
@@ -270,7 +270,7 @@ public static boolean checkPositiveG(int numLevel2Units, double[][] tMat) {
public static void generateNonStandardizeZTZArjTArs(HGLMModel.HGLMParameters parms, HGLMModel model) {
if (parms._standardize) {
boolean orignalRandomIntercept = parms._random_intercept;
parms._random_intercept = true && !randomColAllEnum(parms.train(), parms._random_columns);
parms._random_intercept = parms._random_intercept || !randomColAllEnum(parms.train(), parms._random_columns);
List<String> colNames = Arrays.asList(parms.train().names());
boolean hasWeights = model._parms._weights_column != null && colNames.contains(model._parms._weights_column);
boolean hasOffsets = model._parms._offset_column != null && colNames.contains(model._parms._offset_column);
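The one-line change in generateNonStandardizeZTZArjTArs replaces an expression that always discarded the user's random-intercept choice (true && !randomColAllEnum(...) reduces to !randomColAllEnum(...)) with one that keeps a user-requested intercept and only forces it on when not every random column is an enum. A standalone sketch of the two rules, with illustrative names rather than the h2o-3 parameter object:

public class RandomInterceptRuleSketch {
  // old rule: "true && !allEnum" ignores whatever the user asked for
  static boolean oldRule(boolean userRandomIntercept, boolean allRandomColsEnum) {
    return !allRandomColsEnum;
  }

  // new rule: keep a user-requested intercept, otherwise fall back to the old behavior
  static boolean newRule(boolean userRandomIntercept, boolean allRandomColsEnum) {
    return userRandomIntercept || !allRandomColsEnum;
  }

  public static void main(String[] args) {
    boolean[] tf = {true, false};
    for (boolean user : tf)
      for (boolean allEnum : tf)
        System.out.printf("user=%b allEnum=%b -> old=%b new=%b%n",
            user, allEnum, oldRule(user, allEnum), newRule(user, allEnum));
    // the rules differ only for user=true, allEnum=true: old=false, new=true
  }
}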
7 changes: 3 additions & 4 deletions h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java
@@ -72,8 +72,7 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
if (forTraining) {
mm = new ModelMetricsRegressionHGLM(m, f, hglmM._output._nobs, this._domain, this.weightedSigma(),
this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta,
tmat, mse, hglmM._output._zttimesz_score, mse,
this._yMinusfixPredSquare / hglmM._output._nobs, hglmM._output._yminusxtimesz,
tmat, mse, mse, this._yMinusfixPredSquare / hglmM._output._nobs, hglmM._output._yminusxtimesz,
hglmM._output._arjtarj_score);
} else {
List<String> colNames = Arrays.asList(f.names());
@@ -89,8 +88,8 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
engineTask.doAll(dinfo._adaptedFrame);
mm = new ModelMetricsRegressionHGLM(m, f, engineTask._nobs, this._domain, this.weightedSigma(),
this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta,
tmat, mse, engineTask._zTTimesZ, this._sse/engineTask._nobs,
this._yMinusfixPredSquare/engineTask._nobs, hglmM._output._yminusxtimesz_valid, engineTask._ArjTArj);
tmat, mse, this._sse/engineTask._nobs, this._yMinusfixPredSquare/engineTask._nobs,
hglmM._output._yminusxtimesz_valid, engineTask._ArjTArj);
hglmM._output._nobs_valid = engineTask._nobs;
}

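In both branches above the metrics object now receives two error terms: the overall MSE and a fixed-effects-only counterpart (this._yMinusfixPredSquare divided by the number of observations), with the zTTimesZ argument removed. A small standalone sketch of that distinction, using made-up arrays rather than the MetricBuilderHGLM accumulators, and reading "fixed prediction" as X*beta without the random-effect contribution:

public class HGLMMseSketch {
  public static void main(String[] args) {
    // y: observed response; fixedPred: X*beta only; fullPred: X*beta + Z*ubeta
    double[] y         = {1.0, 2.0, 3.0, 4.0};
    double[] fixedPred = {0.8, 2.1, 2.5, 4.4};
    double[] fullPred  = {0.9, 2.0, 2.9, 4.1};
    long nobs = y.length;

    double sse = 0, yMinusFixPredSquare = 0;
    for (int i = 0; i < nobs; i++) {
      double rFull  = y[i] - fullPred[i];
      double rFixed = y[i] - fixedPred[i];
      sse += rFull * rFull;                   // drives the overall MSE
      yMinusFixPredSquare += rFixed * rFixed; // drives the fixed-effects-only MSE
    }
    System.out.println("mse       = " + sse / nobs);
    System.out.println("mse_fixed = " + yMinusFixPredSquare / nobs);
  }
}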
49 changes: 23 additions & 26 deletions h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java
@@ -19,13 +19,13 @@ public class ModelMetricsRegressionHGLM extends ModelMetricsSupervised {
public final int _iterations;
public final double[][] _tmat;
public final double _var_residual; // variance of residual error
public final double _log_likelihood2; // lLg from reference [1] of the doc
//public final double _log_likelihood2; // lLg from reference [1] of the doc
public final double _log_likelihood; // llg from reference [2] of the doc
public final double _mse_fixed; // mse with fixed effects only

public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, String[] domain, double sigma,
CustomMetric customMetric, int iter, double[] beta, double[][] ubeta,
double[][] tmat, double varResidual, double[][] zTTimesZ, double mse, double mse_fixed,
double[][] tmat, double varResidual, double mse, double mse_fixed,
double[][] yMinusXFixTimesZ, double[][][] ajTaj) {
super(model, frame, nobs, mse, domain, sigma, customMetric);
_beta = beta;
@@ -34,34 +34,34 @@ public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, String[]
_tmat = tmat;
_var_residual = varResidual;
_icc = calICC(tmat, varResidual);
_log_likelihood2 = calHGLMllg2(nobs, tmat, varResidual, zTTimesZ, mse_fixed*nobs, yMinusXFixTimesZ);
_log_likelihood = calHGLMllg(nobs, tmat, varResidual, ajTaj, mse_fixed*nobs, yMinusXFixTimesZ);
_mse_fixed = mse_fixed;
}

/**
*
* This method calculates the log-likelihood as described in section II.V of the doc.
* This method calculates the log-likelihood as described in section II.V of the doc. The body is wrapped in a
* try/catch because the Matrix toolbox can otherwise crash h2o-3.
*/
public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
double yMinsXFixSqure, double[][] yMinusXFixTimesZ) {
int numLevel2 = zjTTimesZj.length;
double[][] tmatInv = new Matrix(tmat).inverse().getArray();
double tmatDeterminant = new Matrix(tmat).det();
double oneOVar = 1.0/varResidual;
double oneOVarSq = oneOVar*oneOVar;
double llg = nobs*LOG_2PI + oneOVar*yMinsXFixSqure;
double[][] invTPlusZjTZ;
Matrix yMinusXjFixed;
Matrix yjMinusXjFixed;
for (int ind2 = 0; ind2 < numLevel2; ind2++) {
invTPlusZjTZ = calInvTPZjTZ(tmatInv, zjTTimesZj[ind2], oneOVar);
llg += Math.log(varResidual * new Matrix(invTPlusZjTZ).det()*tmatDeterminant);
yMinusXjFixed = new Matrix(new double[][]{yMinusXFixTimesZ[ind2]});
yjMinusXjFixed = yMinusXjFixed.times(new Matrix(invTPlusZjTZ).inverse().times(yMinusXjFixed.transpose()));
llg -= oneOVarSq * yjMinusXjFixed.getArray()[0][0];
}
return -0.5*llg;
int numLevel2 = zjTTimesZj.length;
double[][] tmatInv = new Matrix(tmat).inverse().getArray();
double tmatDeterminant = new Matrix(tmat).det();
double oneOVar = 1.0 / varResidual;
double oneOVarSq = oneOVar * oneOVar;
double llg = nobs * LOG_2PI + oneOVar * yMinsXFixSqure;
double[][] invTPlusZjTZ;
Matrix yMinusXjFixed;
Matrix yjMinusXjFixed;
for (int ind2 = 0; ind2 < numLevel2; ind2++) {
invTPlusZjTZ = calInvTPZjTZ(tmatInv, zjTTimesZj[ind2], oneOVar);
llg += Math.log(varResidual * new Matrix(invTPlusZjTZ).det() * tmatDeterminant);
yMinusXjFixed = new Matrix(new double[][]{yMinusXFixTimesZ[ind2]});
yjMinusXjFixed = yMinusXjFixed.times(new Matrix(invTPlusZjTZ).inverse().times(yMinusXjFixed.transpose()));
llg -= oneOVarSq * yjMinusXjFixed.getArray()[0][0];
}
return -0.5 * llg;
}

public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj, double oneOVar) {
@@ -70,7 +70,8 @@ public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj,

/***
*
* This method calculates the log-likelihood as described in section II.VI of the doc.
* This method calculates the log-likelihood as described in section II.VI of the doc. Please keep this method
* even though it currently has no callers.
*/
public static double calHGLMllg2(long nobs, double[][] tmat, double varResidual, double[][] zTTimesZ,
double yMinsXFixSqure, double[][] yMinusXFixTimesZ) {
@@ -127,10 +128,6 @@ public double llg() {
return _log_likelihood;
}

public double llg2() {
return _log_likelihood2;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
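For reference, the quantity accumulated by the calHGLMllg loop above can be written out as a single expression. This is a direct reading of the code, assuming LOG_2PI = ln(2π), T = tmat, σ² = varResidual, and r_j denoting the row yMinusXFixTimesZ[j], i.e. Z_jᵀ(y_j − X_jβ):

\ell \;=\; -\tfrac{1}{2}\Big[\, n\,\ln(2\pi) \;+\; \tfrac{1}{\sigma^2}\,\lVert y - X\beta\rVert^2 \;+\; \sum_{j}\Big(\ln\!\big(\sigma^2\,\lvert T\rvert\,\big\lvert T^{-1} + \tfrac{1}{\sigma^2} Z_j^\top Z_j\big\rvert\big) \;-\; \tfrac{1}{\sigma^4}\, r_j^\top\big(T^{-1} + \tfrac{1}{\sigma^2} Z_j^\top Z_j\big)^{-1} r_j\Big)\Big]

The sum runs over the numLevel2 groups, and the method returns the bracketed total scaled by -0.5, matching the final return -0.5 * llg.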
@@ -5,9 +5,9 @@
public class ModelMetricsRegressionHGLMGeneric extends ModelMetricsRegressionHGLM {
public ModelMetricsRegressionHGLMGeneric(Model model, Frame frame, long nobs, String[] domain, double sigma,
CustomMetric customMetric, int iter, double[] beta, double[][] ubeta,
double[][] tmat, double varResidual, double[][] zTTimesZ, double mse, double mse_fixed,
double[][] tmat, double varResidual, double mse, double mse_fixed,
double[][] yMinusXFixTimesZ, double[][][] arjtarj) {
super(model, frame, nobs, domain, sigma, customMetric, iter, beta, ubeta, tmat, varResidual, zTTimesZ, mse,
mse_fixed, yMinusXFixTimesZ, arjtarj);
super(model, frame, nobs, domain, sigma, customMetric, iter, beta, ubeta, tmat, varResidual, mse, mse_fixed,
yMinusXFixTimesZ, arjtarj);
}
}
@@ -20,15 +20,15 @@
def test_define_dataset():
family = 'gaussian' # can be any valid GLM families
nrow = 40000
nenum = 6
nreal = 1 # last one is the response
nenum = 3
nreal = 3 # last one is the response
enum_columns = pyunit_utils.random_dataset_enums_only(nrow, nenum, factorL=8, misFrac=0.0)
real_columns = pyunit_utils.random_dataset_real_only(nrow, nreal, realR = 2, misFrac=0.0)
dataset = enum_columns.cbind(real_columns)
dataset.set_name(dataset.ncol-1, "response")
cnames = dataset.names
group_column=cnames[0]
random_columns = [cnames[1], cnames[2], cnames[3]]
random_columns = [cnames[1], cnames[2], cnames[3], cnames[4]]
hglmDataSet = generate_dataset(family, dataset, group_column, random_columns)
print("Done!")
#h2o.download_csv(hglmDataSet, "/Users/wendycwong/temp/dataset.csv") # save dataset
@@ -51,17 +51,48 @@ def generate_dataset(family, trainData, group_column, random_columns):

# to generate data in hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_wIntercept_standardize.csv, 6 cat, 0 numeric
# 1 response, seed = 12345
startVal = [0.7906251, 1.8005780, -3.5665564, -0.8804172, -1.5809320, 1.5188019, -1.6089287, 1.7509011,
-0.5286826, -1.1203812, -2.3159930, 0.1674759, -0.9065857, -0.7587694, -0.8578529, 0.3007900,
1.5765745, 1.1725489, -0.6935900, -1.1467158, 1.3960304, -1.7078175, -2.8960526, 0.9847858,
-1.0951275, 0.1393349, -0.6782085, 3.3711444, -2.0059428, 1.3293327, -0.5083064, 2.7324153,
0.2036385, -1.6967069, 0.699569, -0.4288891]
# startVal = [0.7906251, 1.8005780, -3.5665564, -0.8804172, -1.5809320, 1.5188019, -1.6089287, 1.7509011,
# -0.5286826, -1.1203812, -2.3159930, 0.1674759, -0.9065857, -0.7587694, -0.8578529, 0.3007900,
# 1.5765745, 1.1725489, -0.6935900, -1.1467158, 1.3960304, -1.7078175, -2.8960526, 0.9847858,
# -1.0951275, 0.1393349, -0.6782085, 3.3711444, -2.0059428, 1.3293327, -0.5083064, 2.7324153,
# 0.2036385, -1.6967069, 0.699569, -0.4288891]
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_wIntercept.csv
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_woIntercept_standardize.csv
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_woIntercept.csv

# to generate data in hglm_test/gaussian_0GC_123R_6enum_5num_p05oise_p08T_wIntercept_standardize.csv, seed=12345,
# startVal = [3.93013069, 0.54472937, 1.00317237, 0.45930296, 2.41925257, -3.09530556, -3.56112954, 1.63825546,
# -0.09974517, 0.09546386, -0.67192248, -0.71572626, 0.78566524, -0.58579001, -1.91637762, 0.85650108,
# 0.91881537, 2.35773321, -0.20756380, 0.40147277, -1.10384921, 0.75619311, -0.57409532, 1.44300300,
# 2.01180669, -1.90782107,-0.41173998, -0.50159384, 1.22944372, -1.18281946, -2.96645841, 2.14272813,
# -0.32555483, -1.00719124, 0.74755600, 1.09900559, 2.30948122, 1.23596162, 0.66856774, -2.56878032,
# 0.05599762]
# hglm_test/gaussian_0GC_123R_6enum_5num_p05oise_p08T_wIntercept.csv
# hglm_test/gaussian_0GC_123R_6enum_5num_p05oise_p08T_woIntercept_standardize.gz
# hglm_test/gaussian_0GC_123R_6enum_5num_p05oise_p08T_woIntercept.gz
# hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_wIntercept_standardize
# hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_wIntercept
# hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_woIntercept_standardize
# hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_woIntercept
# hglm_test/gaussian_0GC_1267R_6enum_5num_p05oise_p08T_wIntercept_standardize
# hglm_test/gaussian_0GC_1267R_6enum_5num_p05oise_p08T_wIntercept
# hglm_test/gaussian_0GC_1267R_6enum_5num_p05oise_p08T_woIntercept_standardize
# hglm_test/gaussian_0GC_1267R_6enum_5num_p05oise_p08T_woIntercept

# hglm_test/gaussian_0GC_allenum_allRC_p05oise_p08T_wIntercept_standardize
#startVal = [1.10825995, -0.37625500, 0.01522888, -2.33646889, -1.39787749, 0.10817416, -0.48015694, 2.47842056,
# -3.45931533, 0.25396556, -2.52770259, 0.96282659, -2.40216594, -2.79117384, -2.21220306]

# hglm_test/gaussian_0GC_allnumeric_allRC_p05oise_p08T_woIntercept_standardize
#startVal = [-0.9414337, -2.0222721, -2.4312540]

# hglm_test/gaussian_0GC_allRC_2enum2numeric_p05oise_p08T_woIntercept_standardize
startVal = [-1.4313612, 0.6795744, 1.9795154, -3.1187255, 0.2058840, -1.6596187, 0.3460812, -0.7809777,
1.6617960, -0.5174034, 1.8273497, -2.4161541, 0.9474324, 2.3616221, 0.7710148, 0.2706556, 1.0541668]
# hglm_test/gaussian_0GC_allRC_2enum2numeric_p05oise_p08T_wIntercept_standardize
m = hglm(family=family, max_iterations=0, random_columns=random_columns, group_column=group_column,
tau_u_var_init = 0.08, tau_e_var_init = 0.05, random_intercept = False, gen_syn_data=True,
seed = 12345, standardize = False, initial_fixed_effects=startVal)
tau_u_var_init = 0.08, tau_e_var_init = 0.05, random_intercept = True, gen_syn_data=True,
seed = 12345, standardize = True, initial_fixed_effects=startVal)
m.train(training_frame=trainData, y = "response", x =myX)
f2 = m.predict(trainData)
finalDataset = trainData[names_without_response]
