From 4b77862ae4e41caa41bd6901a7c4024e8f869cc0 Mon Sep 17 00:00:00 2001
From: alldefector
Date: Mon, 13 Mar 2017 15:11:17 -0700
Subject: [PATCH] Fix regularization and estimation in learning steps

---
 inference/dimmwitted/src/factor_graph.cc       | 111 ++++++++++--------
 inference/dimmwitted/src/factor_graph.h        |   9 +-
 inference/dimmwitted/src/gibbs_sampler.cc      |   2 +-
 inference/dimmwitted/src/gibbs_sampler.h       |  26 ++--
 inference/dimmwitted/src/inference_result.cc   |   9 +-
 inference/dimmwitted/src/inference_result.h    |  27 ++++-
 inference/dimmwitted/test/factor_graph_test.cc |   7 +-
 7 files changed, 118 insertions(+), 73 deletions(-)

diff --git a/inference/dimmwitted/src/factor_graph.cc b/inference/dimmwitted/src/factor_graph.cc
index f036adc74..9de0f1b95 100644
--- a/inference/dimmwitted/src/factor_graph.cc
+++ b/inference/dimmwitted/src/factor_graph.cc
@@ -242,76 +242,87 @@ FactorGraph::FactorGraph(const FactorGraph &other)
 }
 
 // Inline but defined here; accessible only from current file.
-inline void FactorGraph::sgd_on_factor(size_t factor_id, double stepsize,
-                                       size_t vid, size_t evidence_value,
+inline void FactorGraph::sgd_on_factor(const Factor &factor, double stepsize,
+                                       double gradient,
                                        InferenceResult &infrs) {
-  const Factor &factor = factors[factor_id];
-  if (infrs.weights_isfixed[factor.weight_id]) {
-    return;
-  }
+  if (infrs.weights_isfixed[factor.weight_id]) return;
+  infrs.update_weight(factor.weight_id, stepsize, gradient);
+}
+
+void FactorGraph::sgd_on_variable(const Variable &variable,
+                                  InferenceResult &infrs, double stepsize,
+                                  bool is_noise_aware,
+                                  const std::vector<double> &probs) {
   // stochastic gradient ascent
   // decrement weight with stepsize * gradient of weight
   // gradient of weight = E[f] - E[f|D], where D is evidence variables,
   // f is the factor function, E[] is expectation. Expectation is
   // calculated using a sample of the variable.
-  double pot_evid = factor.potential(vifs.get(), infrs.assignments_evid.get(),
-                                     vid, evidence_value);
-  double pot_free = factor.potential(vifs.get(), infrs.assignments_free.get());
-  double gradient = pot_free - pot_evid;
-  infrs.update_weight(factor.weight_id, stepsize, gradient);
-}
 
-void FactorGraph::sgd_on_variable(const Variable &variable,
-                                  InferenceResult &infrs, double stepsize,
-                                  bool is_noise_aware) {
   if (variable.is_boolean()) {
     // boolean: for each factor {learn}
     // NOTE: boolean vars do not support truthiness / noise-aware learning
     const VariableToFactor &vv = values[variable.var_val_base];
     for (size_t j = 0; j < vv.factor_index_length; ++j) {
       size_t factor_id = factor_index[vv.factor_index_base + j];
-      sgd_on_factor(factor_id, stepsize, variable.id, variable.assignment_dense,
-                    infrs);
+      const Factor &factor = factors[factor_id];
+
+      double pot_evid =
+          factor.potential(vifs.get(), infrs.assignments_evid.get());
+      double pot_free =
+          factor.potential(vifs.get(), infrs.assignments_free.get());
+      double gradient = pot_free - pot_evid;
+
+      sgd_on_factor(factor, stepsize, gradient, infrs);
     }
   } else {
     // categorical: for each evidence value { for each factor {learn} }
-    size_t proposal = infrs.assignments_free[variable.id];
-    for (size_t val = 0; val < variable.internal_cardinality(); ++val) {
-      // skip non-evidence values
-      if (!is_noise_aware && val != variable.assignment_dense) continue;
-      const VariableToFactor &ev = values[variable.var_val_base + val];
-      if (is_noise_aware && is_linear_zero(ev.truthiness)) continue;
+    // The factors are generated using ddlog rules, and as a result,
+    // only one assignment to the vars involved would activate a factor.
+    // In the FG, we index the factors not by vars, but by <var, value> pairs.
+    // Here we find all factors for this var by iterating through its vals
+    // and that index.
 
-      double truthiness = is_noise_aware ? ev.truthiness : 1;
+    for (size_t i = 0; i < variable.internal_cardinality(); ++i) {
+      const VariableToFactor &vtf = values[variable.var_val_base + i];
 
-      // run SGD on all factors "activated" by this evidence value
-      for (size_t j = 0; j < ev.factor_index_length; ++j) {
-        size_t factor_id = factor_index[ev.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
+      // loop through each factor corresponding to this <var, value>
+      for (size_t j = 0; j < vtf.factor_index_length; ++j) {
+        size_t factor_id = factor_index[vtf.factor_index_base + j];
+        const Factor &factor = factors[factor_id];
 
-      // run SGD on all factors "activated" by proposal value
-      // NOTE: Current ddlog inference rule syntax implies that this list
-      // of factors would overlap the above list only if the factor
-      // connects tuple [var=val] and tuple [var=proposal], which sensible
-      // ddlog inference rules would never generate.
-      // Hence we assume that there is no overlap.
-      // This may change in the future as we introduce fancier factor
-      // types.
-
-      // skip if we have just processed the same list of factors
-      // NOTE: not skipping before the first loop because ... assignments_evid!
-      if (val == proposal) continue;
-
-      const VariableToFactor &pv = values[variable.var_val_base + proposal];
-      for (size_t j = 0; j < pv.factor_index_length; ++j) {
-        size_t factor_id = factor_index[pv.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
-    }  // end for
+        if (infrs.weights_isfixed[factor.weight_id]) {
+          continue;
+        }
+
+        // compute E[f] and E[f|D]
+        double pot_free_expected = 0, pot_evid_expected = 0;
+
+        if (!is_noise_aware) {
+          pot_evid_expected =
+              factor.potential(vifs.get(), infrs.assignments_evid.get());
+        }
+
+        for (size_t k = 0; k < variable.internal_cardinality(); ++k) {
+          double pot_free = factor.potential(
+              vifs.get(), infrs.assignments_free.get(), variable.id, k);
+          pot_free_expected += pot_free * probs[k];
+
+          if (is_noise_aware) {
+            double pot_evid = factor.potential(
+                vifs.get(), infrs.assignments_evid.get(), variable.id, k);
+            pot_evid_expected +=
+                pot_evid * values[variable.var_val_base + k].truthiness;
+          }
+        }
+
+        double gradient = pot_free_expected - pot_evid_expected;
+
+        sgd_on_factor(factor, stepsize, gradient, infrs);
+
+      }  // end for
+    }  // end for
   }
 }
 
diff --git a/inference/dimmwitted/src/factor_graph.h b/inference/dimmwitted/src/factor_graph.h
index 43dc55b7f..681a92968 100644
--- a/inference/dimmwitted/src/factor_graph.h
+++ b/inference/dimmwitted/src/factor_graph.h
@@ -109,11 +109,12 @@ class FactorGraph {
    * update corresponding weights (stochastic gradient descent).
    */
   void sgd_on_variable(const Variable& variable, InferenceResult& infrs,
-                       double stepsize, bool is_noise_aware);
+                       double stepsize, bool is_noise_aware,
+                       const std::vector<double>& probs);
 
-  // perform SGD step for weight learning on one factor
-  inline void sgd_on_factor(size_t factor_id, double stepsize, size_t vid,
-                            size_t evidence_value, InferenceResult& infrs);
+  // perform GD step for weight learning on one factor
+  inline void sgd_on_factor(const Factor& factor, double stepsize,
+                            double gradient, InferenceResult& infrs);
 
   /**
    * Returns log-linear weighted potential of all the factors for the given
diff --git a/inference/dimmwitted/src/gibbs_sampler.cc b/inference/dimmwitted/src/gibbs_sampler.cc
index ce3094572..e4e0569f9 100644
--- a/inference/dimmwitted/src/gibbs_sampler.cc
+++ b/inference/dimmwitted/src/gibbs_sampler.cc
@@ -40,7 +40,7 @@ void GibbsSampler::wait() {
 GibbsSamplerThread::GibbsSamplerThread(FactorGraph &fg, InferenceResult &infrs,
                                        size_t ith_shard, size_t n_shards,
                                        const CmdParser &opts)
-    : varlen_potential_buffer_(0),
+    : var_potential_buffer_(0),
       fg(fg),
       infrs(infrs),
       sample_evidence(opts.should_sample_evidence),
diff --git a/inference/dimmwitted/src/gibbs_sampler.h b/inference/dimmwitted/src/gibbs_sampler.h
index 393188294..bc07d94c9 100644
--- a/inference/dimmwitted/src/gibbs_sampler.h
+++ b/inference/dimmwitted/src/gibbs_sampler.h
@@ -68,7 +68,7 @@ class GibbsSamplerThread {
   unsigned short p_rand_seed[3];
 
   // potential for each proposal for categorical
-  std::vector<double> varlen_potential_buffer_;
+  std::vector<double> var_potential_buffer_;
 
   // references and cached flags
   FactorGraph &fg;
@@ -133,19 +133,20 @@ inline void GibbsSamplerThread::sample_sgd_single_variable(size_t vid,
 
   const Variable &variable = fg.variables[vid];
 
+  // pick a value for the evid Gibbs chain
+  infrs.assignments_evid[variable.id] = sample_evid(variable);
+
   // pick a value for the regular Gibbs chain
   size_t proposal = draw_sample(variable, infrs.assignments_free.get(),
                                 infrs.weight_values.get());
   infrs.assignments_free[variable.id] = proposal;
 
-  // pick a value for the (parallel) evid Gibbs chain
-  infrs.assignments_evid[variable.id] = sample_evid(variable);
-
   if (!learn_non_evidence &&
       ((!is_noise_aware && !variable.is_evid) ||
        (is_noise_aware && !variable.has_truthiness())))
     return;
 
-  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware);
+  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware,
+                     var_potential_buffer_);
 }
 
 inline void GibbsSamplerThread::sample_single_variable(size_t vid) {
@@ -215,7 +216,7 @@ inline size_t GibbsSamplerThread::draw_sample(const Variable &variable,
     }
 
     case DTYPE_CATEGORICAL: {
-      varlen_potential_buffer_.reserve(variable.cardinality);
+      var_potential_buffer_.resize(variable.cardinality);
       double sum = -100000.0;
       proposal = Variable::INVALID_VALUE;
 // calculate potential for each proposal given a way to iterate the domain
@@ -223,17 +224,20 @@
   do {                                                                      \
     for                                                                     \
       EACH_DOMAIN_VALUE {                                                   \
-        varlen_potential_buffer_[DOMAIN_INDEX] =                            \
+        var_potential_buffer_[DOMAIN_INDEX] =                               \
            fg.potential(variable, DOMAIN_VALUE, assignments, weight_values);\
-        sum = logadd(sum, varlen_potential_buffer_[DOMAIN_INDEX]);          \
+        sum = logadd(sum, var_potential_buffer_[DOMAIN_INDEX]);             \
      }                                                                      \
     double r = erand48(p_rand_seed);                                        \
+    bool unset = true;                                                      \
     for                                                                     \
       EACH_DOMAIN_VALUE {                                                   \
-        r -= exp(varlen_potential_buffer_[DOMAIN_INDEX] - sum);             \
-        if (r <= 0) {                                                       \
+        double prob = exp(var_potential_buffer_[DOMAIN_INDEX] - sum);       \
+        var_potential_buffer_[DOMAIN_INDEX] = prob;                         \
+        r -= prob;                                                          \
+        if (r <= 0 && unset) {                                              \
           proposal = DOMAIN_VALUE;                                          \
-          break;                                                            \
+          unset = false;                                                    \
         }                                                                   \
       }                                                                     \
   } while (0)
diff --git a/inference/dimmwitted/src/inference_result.cc b/inference/dimmwitted/src/inference_result.cc
index 603a60a83..c54abc00a 100644
--- a/inference/dimmwitted/src/inference_result.cc
+++ b/inference/dimmwitted/src/inference_result.cc
@@ -19,7 +19,8 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const CmdParser &opts)
       assignments_evid(new size_t[nvars]),
       weight_values(new double[nweights]),
       weight_grads(new float[nweights]),
-      weights_isfixed(new bool[nweights]) {}
+      weights_isfixed(new bool[nweights]),
+      weight_freqs(new size_t[nweights]) {}
 
 InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
                                  const CmdParser &opts)
@@ -29,6 +30,11 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
     weight_values[weight.id] = weight.weight;
     weight_grads[weight.id] = 0;
     weights_isfixed[weight.id] = weight.isfixed;
+    weight_freqs[weight.id] = 0;
+  }
+
+  for (size_t i = 0; i < fg.size.num_factors; ++i) {
+    weight_freqs[fg.factors[i].weight_id] += 1;
   }
 
   for (size_t t = 0; t < nvars; ++t) {
@@ -49,6 +55,7 @@ InferenceResult::InferenceResult(const InferenceResult &other)
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_values, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_grads, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weights_isfixed, nweights);
+  COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_freqs, nweights);
 
   ntallies = other.ntallies;
   COPY_ARRAY_UNIQUE_PTR_MEMBER(sample_tallies, ntallies);
diff --git a/inference/dimmwitted/src/inference_result.h b/inference/dimmwitted/src/inference_result.h
index e20ecec06..302156c1d 100644
--- a/inference/dimmwitted/src/inference_result.h
+++ b/inference/dimmwitted/src/inference_result.h
@@ -42,6 +42,10 @@ class InferenceResult {
   // array of whether weight is fixed
   std::unique_ptr<bool[]> weights_isfixed;
 
+  // number of factors for each weight
+  // used to amortize regularization term
+  std::unique_ptr<size_t[]> weight_freqs;
+
   InferenceResult(const FactorGraph &fg, const Weight weights[],
                   const CmdParser &opts);
 
@@ -66,20 +70,37 @@ class InferenceResult {
   inline void update_weight(size_t wid, double stepsize, double gradient) {
     weight_grads[wid] += gradient;
     double weight = weight_values[wid];
+    size_t total_factors = weight_freqs[wid];
+
+    // apply L1 or L2 regularization
     switch (opts.regularization) {
       case REG_L2: {
-        // bounded approx of 1 - opts.reg_param * stepsize
-        weight *= (1.0 / (1.0 + opts.reg_param * stepsize));
+        weight *= (1.0 - opts.reg_param * stepsize / total_factors);
         break;
       }
       case REG_L1: {
-        weight += opts.reg_param * (weight < 0);
+        if (weight > 0) {
+          weight =
+              std::max(0.0, weight - opts.reg_param * stepsize / total_factors);
+        } else if (weight < 0) {
+          weight =
+              std::min(0.0, weight + opts.reg_param * stepsize / total_factors);
+        }
         break;
       }
       default:
         std::abort();
     }
+
+    // apply gradient
     weight -= stepsize * gradient;
+
+    // clamp to [-10, 10]
+    if (weight > 10)
+      weight = 10;
+    else if (weight < -10)
+      weight = -10;
+
     weight_values[wid] = weight;
   }
 
diff --git a/inference/dimmwitted/test/factor_graph_test.cc b/inference/dimmwitted/test/factor_graph_test.cc
index 216ce2306..601ef077c 100644
--- a/inference/dimmwitted/test/factor_graph_test.cc
+++ b/inference/dimmwitted/test/factor_graph_test.cc
@@ -21,6 +21,7 @@ class FactorGraphTest : public testing::Test {
   std::unique_ptr<FactorGraph> cfg;
   std::unique_ptr<InferenceResult> infrs;
   std::unique_ptr<CmdParser> cmd_parser;
+  std::vector<double> dummy_probs;
 
   virtual void SetUp() {
     const char* argv[] = {
@@ -54,14 +55,14 @@ TEST_F(FactorGraphTest, sgd_on_variable) {
   infrs->assignments_free[cfg->variables[0].id] = 0;
-  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false, dummy_probs);
   std::cout << "The weight value is: " << infrs->weight_values[0] << std::endl;
   EXPECT_EQ(infrs->weight_values[0], 0.2);
 
-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
   EXPECT_EQ(infrs->weight_values[0], 0.2);
 
-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
  EXPECT_EQ(infrs->weight_values[0], 0.2);
 }
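
Note on the estimation change in sgd_on_variable: for categorical variables
the gradient is now an expectation rather than a point sample. E[f] weights
each value's factor potential by the free chain's normalized proposal
probabilities (which draw_sample now caches in var_potential_buffer_), and
E[f|D] weights potentials by per-value truthiness in the noise-aware case.
Below is a minimal standalone sketch of that estimator for one factor over
one categorical variable; it is illustrative toy code, not part of the patch,
and `potential` is a hypothetical stand-in for Factor::potential with the
variable forced to a given value.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // toy stand-in for Factor::potential(var = value)
    static double potential(std::size_t value) { return value == 1 ? 1.0 : 0.0; }

    int main() {
      // normalized Gibbs proposal distribution of the free chain
      std::vector<double> probs = {0.2, 0.7, 0.1};
      // noise-aware evidence distribution over values; hard evidence on
      // value v would put all mass on v
      std::vector<double> truthiness = {0.0, 0.9, 0.1};

      double pot_free_expected = 0, pot_evid_expected = 0;
      for (std::size_t k = 0; k < probs.size(); ++k) {
        pot_free_expected += potential(k) * probs[k];       // E[f]   = 0.7
        pot_evid_expected += potential(k) * truthiness[k];  // E[f|D] = 0.9
      }
      double gradient = pot_free_expected - pot_evid_expected;  // -0.2
      std::printf("gradient = %f\n", gradient);
      return 0;  // the sampler would then do: weight -= stepsize * gradient
    }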
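Note on the regularization change in update_weight: because update_weight is
invoked once per factor touching a weight rather than once per weight, the
L1/L2 step is divided by weight_freqs[wid], so a weight shared by n factors
receives one full reg_param * stepsize regularization step only after all n
of its factors have been visited. A minimal standalone sketch of that update
rule follows; the free-function form and the names `reg` and `n_factors` are
hypothetical stand-ins for opts.regularization and weight_freqs[wid].

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    enum Regularization { REG_L1, REG_L2 };

    double update_weight(double weight, double stepsize, double gradient,
                         double reg_param, std::size_t n_factors,
                         Regularization reg) {
      switch (reg) {
        case REG_L2:
          // first-order shrinkage toward 0, amortized over sharing factors
          weight *= (1.0 - reg_param * stepsize / n_factors);
          break;
        case REG_L1:
          // soft-thresholding: shrink toward 0 but never across it
          if (weight > 0)
            weight = std::max(0.0, weight - reg_param * stepsize / n_factors);
          else if (weight < 0)
            weight = std::min(0.0, weight + reg_param * stepsize / n_factors);
          break;
      }
      weight -= stepsize * gradient;                   // gradient step
      return std::min(10.0, std::max(-10.0, weight));  // clamp to [-10, 10]
    }

    int main() {
      // a weight shared by 4 factors is touched once per factor per sweep
      double w = 0.5;
      for (int i = 0; i < 4; ++i)
        w = update_weight(w, 0.1, -0.2, 0.01, 4, REG_L2);
      std::printf("w = %f\n", w);
      return 0;
    }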