From 4b77862ae4e41caa41bd6901a7c4024e8f869cc0 Mon Sep 17 00:00:00 2001
From: alldefector
Date: Mon, 13 Mar 2017 15:11:17 -0700
Subject: [PATCH] Fix regularization and estimation in learning steps

---
 inference/dimmwitted/src/factor_graph.cc       | 111 ++++++++++--------
 inference/dimmwitted/src/factor_graph.h        |   9 +-
 inference/dimmwitted/src/gibbs_sampler.cc      |   2 +-
 inference/dimmwitted/src/gibbs_sampler.h       |  26 ++--
 inference/dimmwitted/src/inference_result.cc   |   9 +-
 inference/dimmwitted/src/inference_result.h    |  27 ++++-
 inference/dimmwitted/test/factor_graph_test.cc |   7 +-
 7 files changed, 118 insertions(+), 73 deletions(-)

diff --git a/inference/dimmwitted/src/factor_graph.cc b/inference/dimmwitted/src/factor_graph.cc
index f036adc74..9de0f1b95 100644
--- a/inference/dimmwitted/src/factor_graph.cc
+++ b/inference/dimmwitted/src/factor_graph.cc
@@ -242,76 +242,87 @@ FactorGraph::FactorGraph(const FactorGraph &other)
 }
 
 // Inline but defined here; accessible only from current file.
-inline void FactorGraph::sgd_on_factor(size_t factor_id, double stepsize,
-                                       size_t vid, size_t evidence_value,
+inline void FactorGraph::sgd_on_factor(const Factor &factor, double stepsize,
+                                       double gradient,
                                        InferenceResult &infrs) {
-  const Factor &factor = factors[factor_id];
-  if (infrs.weights_isfixed[factor.weight_id]) {
-    return;
-  }
+  if (infrs.weights_isfixed[factor.weight_id]) return;
+  infrs.update_weight(factor.weight_id, stepsize, gradient);
+}
+
+void FactorGraph::sgd_on_variable(const Variable &variable,
+                                  InferenceResult &infrs, double stepsize,
+                                  bool is_noise_aware,
+                                  const std::vector<double> &probs) {
   // stochastic gradient ascent
   // decrement weight with stepsize * gradient of weight
   // gradient of weight = E[f] - E[f|D], where D is evidence variables,
   // f is the factor function, E[] is expectation. Expectation is
   // calculated using a sample of the variable.
-  double pot_evid = factor.potential(vifs.get(), infrs.assignments_evid.get(),
-                                     vid, evidence_value);
-  double pot_free = factor.potential(vifs.get(), infrs.assignments_free.get());
-  double gradient = pot_free - pot_evid;
-  infrs.update_weight(factor.weight_id, stepsize, gradient);
-}
 
-void FactorGraph::sgd_on_variable(const Variable &variable,
-                                  InferenceResult &infrs, double stepsize,
-                                  bool is_noise_aware) {
   if (variable.is_boolean()) {
     // boolean: for each factor {learn}
     // NOTE: boolean vars do not support truthiness / noise-aware learning
     const VariableToFactor &vv = values[variable.var_val_base];
     for (size_t j = 0; j < vv.factor_index_length; ++j) {
       size_t factor_id = factor_index[vv.factor_index_base + j];
-      sgd_on_factor(factor_id, stepsize, variable.id, variable.assignment_dense,
-                    infrs);
+      const Factor &factor = factors[factor_id];
+
+      double pot_evid =
+          factor.potential(vifs.get(), infrs.assignments_evid.get());
+      double pot_free =
+          factor.potential(vifs.get(), infrs.assignments_free.get());
+      double gradient = pot_free - pot_evid;
+
+      sgd_on_factor(factor, stepsize, gradient, infrs);
     }
   } else {
     // categorical: for each evidence value { for each factor {learn} }
-    size_t proposal = infrs.assignments_free[variable.id];
-    for (size_t val = 0; val < variable.internal_cardinality(); ++val) {
-      // skip non-evidence values
-      if (!is_noise_aware && val != variable.assignment_dense) continue;
-      const VariableToFactor &ev = values[variable.var_val_base + val];
-      if (is_noise_aware && is_linear_zero(ev.truthiness)) continue;
+    // The factors are generated using ddlog rules, and as a result,
+    // only one assignment to the vars involved would activate a factor.
+    // In the FG, we index the factors not by vars, but by <var, value> pairs.
+    // Here we find all factors for this var by iterating through its vals
+    // and that index.
 
-      double truthiness = is_noise_aware ? ev.truthiness : 1;
+    for (size_t i = 0; i < variable.internal_cardinality(); ++i) {
+      const VariableToFactor &vtf = values[variable.var_val_base + i];
 
-      // run SGD on all factors "activated" by this evidence value
-      for (size_t j = 0; j < ev.factor_index_length; ++j) {
-        size_t factor_id = factor_index[ev.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
+      // loop through each factor corresponding to this <var, value>
+      for (size_t j = 0; j < vtf.factor_index_length; ++j) {
+        size_t factor_id = factor_index[vtf.factor_index_base + j];
+        const Factor &factor = factors[factor_id];
 
-      // run SGD on all factors "activated" by proposal value
-      // NOTE: Current ddlog inference rule syntax implies that this list
-      // of factors would overlap the above list only if the factor
-      // connects tuple [var=val] and tuple [var=proposal], which sensible
-      // ddlog inference rules would never generate.
-      // Hence we assume that there is no overlap.
-      // This may change in the future as we introduce fancier factor
-      // types.
-
-      // skip if we have just processed the same list of factors
-      // NOTE: not skipping before the first loop because ... assignments_evid!
-      if (val == proposal) continue;
-
-      const VariableToFactor &pv = values[variable.var_val_base + proposal];
-      for (size_t j = 0; j < pv.factor_index_length; ++j) {
-        size_t factor_id = factor_index[pv.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
-    }  // end for
+        if (infrs.weights_isfixed[factor.weight_id]) {
+          continue;
+        }
+
+        // compute E[f] and E[f|D]
+        double pot_free_expected = 0, pot_evid_expected = 0;
+
+        if (!is_noise_aware) {
+          pot_evid_expected =
+              factor.potential(vifs.get(), infrs.assignments_evid.get());
+        }
+
+        for (size_t k = 0; k < variable.internal_cardinality(); ++k) {
+          double pot_free = factor.potential(
+              vifs.get(), infrs.assignments_free.get(), variable.id, k);
+          pot_free_expected += pot_free * probs[k];
+
+          if (is_noise_aware) {
+            double pot_evid = factor.potential(
+                vifs.get(), infrs.assignments_evid.get(), variable.id, k);
+            pot_evid_expected +=
+                pot_evid * values[variable.var_val_base + k].truthiness;
+          }
+        }
+
+        double gradient = pot_free_expected - pot_evid_expected;
+
+        sgd_on_factor(factor, stepsize, gradient, infrs);
+
+      }  // end for
+    }  // end for
   }
 }
 
diff --git a/inference/dimmwitted/src/factor_graph.h b/inference/dimmwitted/src/factor_graph.h
index 43dc55b7f..681a92968 100644
--- a/inference/dimmwitted/src/factor_graph.h
+++ b/inference/dimmwitted/src/factor_graph.h
@@ -109,11 +109,12 @@ class FactorGraph {
    * update corresponding weights (stochastic gradient descent).
    */
   void sgd_on_variable(const Variable& variable, InferenceResult& infrs,
-                       double stepsize, bool is_noise_aware);
+                       double stepsize, bool is_noise_aware,
+                       const std::vector<double>& probs);
 
-  // perform SGD step for weight learning on one factor
-  inline void sgd_on_factor(size_t factor_id, double stepsize, size_t vid,
-                            size_t evidence_value, InferenceResult& infrs);
+  // perform GD step for weight learning on one factor
+  inline void sgd_on_factor(const Factor& factor, double stepsize,
+                            double gradient, InferenceResult& infrs);
 
   /**
    * Returns log-linear weighted potential of all the factors for the given
diff --git a/inference/dimmwitted/src/gibbs_sampler.cc b/inference/dimmwitted/src/gibbs_sampler.cc
index ce3094572..e4e0569f9 100644
--- a/inference/dimmwitted/src/gibbs_sampler.cc
+++ b/inference/dimmwitted/src/gibbs_sampler.cc
@@ -40,7 +40,7 @@ void GibbsSampler::wait() {
 GibbsSamplerThread::GibbsSamplerThread(FactorGraph &fg, InferenceResult &infrs,
                                        size_t ith_shard, size_t n_shards,
                                        const CmdParser &opts)
-    : varlen_potential_buffer_(0),
+    : var_potential_buffer_(0),
       fg(fg),
       infrs(infrs),
       sample_evidence(opts.should_sample_evidence),
diff --git a/inference/dimmwitted/src/gibbs_sampler.h b/inference/dimmwitted/src/gibbs_sampler.h
index 393188294..bc07d94c9 100644
--- a/inference/dimmwitted/src/gibbs_sampler.h
+++ b/inference/dimmwitted/src/gibbs_sampler.h
@@ -68,7 +68,7 @@ class GibbsSamplerThread {
   unsigned short p_rand_seed[3];
 
   // potential for each proposal for categorical
-  std::vector<double> varlen_potential_buffer_;
+  std::vector<double> var_potential_buffer_;
 
   // references and cached flags
   FactorGraph &fg;
@@ -133,19 +133,20 @@ inline void GibbsSamplerThread::sample_sgd_single_variable(size_t vid,
 
   const Variable &variable = fg.variables[vid];
 
+  // pick a value for the evid Gibbs chain
+  infrs.assignments_evid[variable.id] = sample_evid(variable);
+
   // pick a value for the regular Gibbs chain
   size_t proposal = draw_sample(variable, infrs.assignments_free.get(),
                                 infrs.weight_values.get());
   infrs.assignments_free[variable.id] = proposal;
 
-  // pick a value for the (parallel) evid Gibbs chain
-  infrs.assignments_evid[variable.id] = sample_evid(variable);
-
   if (!learn_non_evidence &&
       ((!is_noise_aware && !variable.is_evid) ||
        (is_noise_aware && !variable.has_truthiness())))
     return;
 
-  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware);
+  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware,
+                     var_potential_buffer_);
 }
 
 inline void GibbsSamplerThread::sample_single_variable(size_t vid) {
@@ -215,7 +216,7 @@ inline size_t GibbsSamplerThread::draw_sample(const Variable &variable,
     }
 
     case DTYPE_CATEGORICAL: {
-      varlen_potential_buffer_.reserve(variable.cardinality);
+      var_potential_buffer_.resize(variable.cardinality);
       double sum = -100000.0;
       proposal = Variable::INVALID_VALUE;
 // calculate potential for each proposal given a way to iterate the domain
@@ -223,17 +224,20 @@
   do {                                                                      \
     for                                                                     \
       EACH_DOMAIN_VALUE {                                                   \
-        varlen_potential_buffer_[DOMAIN_INDEX] =                            \
+        var_potential_buffer_[DOMAIN_INDEX] =                               \
            fg.potential(variable, DOMAIN_VALUE, assignments, weight_values);\
-        sum = logadd(sum, varlen_potential_buffer_[DOMAIN_INDEX]);          \
+        sum = logadd(sum, var_potential_buffer_[DOMAIN_INDEX]);             \
      }                                                                      \
     double r = erand48(p_rand_seed);                                        \
+    bool unset = true;                                                      \
     for                                                                     \
       EACH_DOMAIN_VALUE {                                                   \
-        r -= exp(varlen_potential_buffer_[DOMAIN_INDEX] - sum);             \
-        if (r <= 0) {                                                       \
+        double prob = exp(var_potential_buffer_[DOMAIN_INDEX] - sum);       \
+        var_potential_buffer_[DOMAIN_INDEX] = prob;                         \
+        r -= prob;                                                          \
+        if (r <= 0 && unset) {                                              \
           proposal = DOMAIN_VALUE;                                          \
-          break;                                                            \
+          unset = false;                                                    \
         }                                                                   \
       }                                                                     \
   } while (0)
diff --git a/inference/dimmwitted/src/inference_result.cc b/inference/dimmwitted/src/inference_result.cc
index 603a60a83..c54abc00a 100644
--- a/inference/dimmwitted/src/inference_result.cc
+++ b/inference/dimmwitted/src/inference_result.cc
@@ -19,7 +19,8 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const CmdParser &opts)
       assignments_evid(new size_t[nvars]),
       weight_values(new double[nweights]),
       weight_grads(new float[nweights]),
-      weights_isfixed(new bool[nweights]) {}
+      weights_isfixed(new bool[nweights]),
+      weight_freqs(new size_t[nweights]) {}
 
 InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
                                  const CmdParser &opts)
@@ -29,6 +30,11 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
     weight_values[weight.id] = weight.weight;
     weight_grads[weight.id] = 0;
     weights_isfixed[weight.id] = weight.isfixed;
+    weight_freqs[weight.id] = 0;
+  }
+
+  for (size_t i = 0; i < fg.size.num_factors; ++i) {
+    weight_freqs[fg.factors[i].weight_id] += 1;
   }
 
   for (size_t t = 0; t < nvars; ++t) {
@@ -49,6 +55,7 @@ InferenceResult::InferenceResult(const InferenceResult &other)
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_values, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_grads, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weights_isfixed, nweights);
+  COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_freqs, nweights);
 
   ntallies = other.ntallies;
   COPY_ARRAY_UNIQUE_PTR_MEMBER(sample_tallies, ntallies);
diff --git a/inference/dimmwitted/src/inference_result.h b/inference/dimmwitted/src/inference_result.h
index e20ecec06..302156c1d 100644
--- a/inference/dimmwitted/src/inference_result.h
+++ b/inference/dimmwitted/src/inference_result.h
@@ -42,6 +42,10 @@ class InferenceResult {
   // array of whether weight is fixed
   std::unique_ptr<bool[]> weights_isfixed;
 
+  // number of factors for each weight
+  // used to amortize regularization term
+  std::unique_ptr<size_t[]> weight_freqs;
+
   InferenceResult(const FactorGraph &fg, const Weight weights[],
                   const CmdParser &opts);
 
@@ -66,20 +70,37 @@ class InferenceResult {
   inline void update_weight(size_t wid, double stepsize, double gradient) {
     weight_grads[wid] += gradient;
     double weight = weight_values[wid];
+    size_t total_factors = weight_freqs[wid];
+
+    // apply L1 or L2 regularization
     switch (opts.regularization) {
       case REG_L2: {
-        // bounded approx of 1 - opts.reg_param * stepsize
-        weight *= (1.0 / (1.0 + opts.reg_param * stepsize));
+        weight *= (1.0 - opts.reg_param * stepsize / total_factors);
         break;
       }
       case REG_L1: {
-        weight += opts.reg_param * (weight < 0);
+        if (weight > 0) {
+          weight =
+              std::max(0.0, weight - opts.reg_param * stepsize / total_factors);
+        } else if (weight < 0) {
+          weight =
+              std::min(0.0, weight + opts.reg_param * stepsize / total_factors);
+        }
         break;
       }
       default:
         std::abort();
     }
+
+    // apply gradient
     weight -= stepsize * gradient;
+
+    // clamp to [-10, 10]
+    if (weight > 10)
+      weight = 10;
+    else if (weight < -10)
+      weight = -10;
+
     weight_values[wid] = weight;
   }
 
diff --git a/inference/dimmwitted/test/factor_graph_test.cc b/inference/dimmwitted/test/factor_graph_test.cc
index 216ce2306..601ef077c 100644
--- a/inference/dimmwitted/test/factor_graph_test.cc
+++ b/inference/dimmwitted/test/factor_graph_test.cc
@@ -21,6 +21,7 @@ class FactorGraphTest : public testing::Test {
   std::unique_ptr<FactorGraph> cfg;
   std::unique_ptr<InferenceResult> infrs;
   std::unique_ptr<CmdParser> cmd_parser;
+  std::vector<double> dummy_probs;
 
   virtual void SetUp() {
     const char* argv[] = {
@@ -54,14 +55,14 @@ TEST_F(FactorGraphTest, sgd_on_variable) {
   infrs->assignments_free[cfg->variables[0].id] = 0;
-  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false, dummy_probs);
   std::cout << "The weight value is: " << infrs->weight_values[0] << std::endl;
   EXPECT_EQ(infrs->weight_values[0], 0.2);
 
-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
   EXPECT_EQ(infrs->weight_values[0], 0.2);
 
-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
  EXPECT_EQ(infrs->weight_values[0], 0.2);
 }
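
Note on the estimation change in sgd_on_variable: for categorical variables
the gradient is now an expectation rather than a point sample. E[f] weights
each value's factor potential by the free chain's normalized proposal
probabilities (which draw_sample now caches in var_potential_buffer_), and
E[f|D] weights potentials by per-value truthiness in the noise-aware case.
Below is a minimal standalone sketch of that estimator for one factor over
one categorical variable; it is illustrative toy code, not part of the patch,
and `potential` is a hypothetical stand-in for Factor::potential with the
variable forced to a given value.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // toy stand-in for Factor::potential(var = value)
    static double potential(std::size_t value) { return value == 1 ? 1.0 : 0.0; }

    int main() {
      // normalized Gibbs proposal distribution of the free chain
      std::vector<double> probs = {0.2, 0.7, 0.1};
      // noise-aware evidence distribution over values; hard evidence on
      // value v would put all mass on v
      std::vector<double> truthiness = {0.0, 0.9, 0.1};

      double pot_free_expected = 0, pot_evid_expected = 0;
      for (std::size_t k = 0; k < probs.size(); ++k) {
        pot_free_expected += potential(k) * probs[k];       // E[f]   = 0.7
        pot_evid_expected += potential(k) * truthiness[k];  // E[f|D] = 0.9
      }
      double gradient = pot_free_expected - pot_evid_expected;  // -0.2
      std::printf("gradient = %f\n", gradient);
      return 0;  // the sampler would then do: weight -= stepsize * gradient
    }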
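Note on the regularization change in update_weight: because update_weight is
invoked once per factor touching a weight rather than once per weight, the
L1/L2 step is divided by weight_freqs[wid], so a weight shared by n factors
receives one full reg_param * stepsize regularization step only after all n
of its factors have been visited. A minimal standalone sketch of that update
rule follows; the free-function form and the names `reg` and `n_factors` are
hypothetical stand-ins for opts.regularization and weight_freqs[wid].

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    enum Regularization { REG_L1, REG_L2 };

    double update_weight(double weight, double stepsize, double gradient,
                         double reg_param, std::size_t n_factors,
                         Regularization reg) {
      switch (reg) {
        case REG_L2:
          // first-order shrinkage toward 0, amortized over sharing factors
          weight *= (1.0 - reg_param * stepsize / n_factors);
          break;
        case REG_L1:
          // soft-thresholding: shrink toward 0 but never across it
          if (weight > 0)
            weight = std::max(0.0, weight - reg_param * stepsize / n_factors);
          else if (weight < 0)
            weight = std::min(0.0, weight + reg_param * stepsize / n_factors);
          break;
      }
      weight -= stepsize * gradient;                   // gradient step
      return std::min(10.0, std::max(-10.0, weight));  // clamp to [-10, 10]
    }

    int main() {
      // a weight shared by 4 factors is touched once per factor per sweep
      double w = 0.5;
      for (int i = 0; i < 4; ++i)
        w = update_weight(w, 0.1, -0.2, 0.01, 4, REG_L2);
      std::printf("w = %f\n", w);
      return 0;
    }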