Fix regularization and estimation in learning steps #633

Open · wants to merge 1 commit into base: master
111 changes: 61 additions & 50 deletions inference/dimmwitted/src/factor_graph.cc
@@ -242,76 +242,87 @@ FactorGraph::FactorGraph(const FactorGraph &other)
 }

-// Inline, but defined here; accessible only from the current file.
-inline void FactorGraph::sgd_on_factor(size_t factor_id, double stepsize,
-                                       size_t vid, size_t evidence_value,
+// Inline, but defined here; accessible only from the current file.
+inline void FactorGraph::sgd_on_factor(const Factor &factor, double stepsize,
+                                       double gradient,
                                        InferenceResult &infrs) {
-  const Factor &factor = factors[factor_id];
-  if (infrs.weights_isfixed[factor.weight_id]) {
-    return;
-  }
-  // stochastic gradient ascent
-  // decrement weight with stepsize * gradient of weight
-  // gradient of weight = E[f] - E[f|D], where D is evidence variables,
-  // f is the factor function, E[] is expectation. Expectation is
-  // calculated using a sample of the variable.
-  double pot_evid = factor.potential(vifs.get(), infrs.assignments_evid.get(),
-                                     vid, evidence_value);
-  double pot_free = factor.potential(vifs.get(), infrs.assignments_free.get());
-  double gradient = pot_free - pot_evid;
+  if (infrs.weights_isfixed[factor.weight_id]) return;
   infrs.update_weight(factor.weight_id, stepsize, gradient);
 }

 void FactorGraph::sgd_on_variable(const Variable &variable,
                                   InferenceResult &infrs, double stepsize,
-                                  bool is_noise_aware) {
+                                  bool is_noise_aware,
+                                  const std::vector<double> &probs) {
+  // stochastic gradient ascent
+  // decrement weight with stepsize * gradient of weight
+  // gradient of weight = E[f] - E[f|D], where D is evidence variables,
+  // f is the factor function, E[] is expectation. Expectation is
+  // calculated using a sample of the variable.
   if (variable.is_boolean()) {
     // boolean: for each factor {learn}
     // NOTE: boolean vars do not support truthiness / noise-aware learning
     const VariableToFactor &vv = values[variable.var_val_base];
     for (size_t j = 0; j < vv.factor_index_length; ++j) {
       size_t factor_id = factor_index[vv.factor_index_base + j];
-      sgd_on_factor(factor_id, stepsize, variable.id, variable.assignment_dense,
-                    infrs);
+      const Factor &factor = factors[factor_id];
+
+      double pot_evid =
+          factor.potential(vifs.get(), infrs.assignments_evid.get());
+      double pot_free =
+          factor.potential(vifs.get(), infrs.assignments_free.get());
+      double gradient = pot_free - pot_evid;
+
+      sgd_on_factor(factor, stepsize, gradient, infrs);
     }
   } else {
-    // categorical: for each evidence value { for each factor {learn} }
-    size_t proposal = infrs.assignments_free[variable.id];
-    for (size_t val = 0; val < variable.internal_cardinality(); ++val) {
-      // skip non-evidence values
-      if (!is_noise_aware && val != variable.assignment_dense) continue;
-
-      const VariableToFactor &ev = values[variable.var_val_base + val];
-      if (is_noise_aware && is_linear_zero(ev.truthiness)) continue;
-
-      double truthiness = is_noise_aware ? ev.truthiness : 1;
-
-      // run SGD on all factors "activated" by this evidence value
-      for (size_t j = 0; j < ev.factor_index_length; ++j) {
-        size_t factor_id = factor_index[ev.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
-
-      // run SGD on all factors "activated" by proposal value
-      // NOTE: Current ddlog inference rule syntax implies that this list
-      // of factors would overlap the above list only if the factor
-      // connects tuple [var=val] and tuple [var=proposal], which sensible
-      // ddlog inference rules would never generate.
-      // Hence we assume that there is no overlap.
-      // This may change in the future as we introduce fancier factor
-      // types.
-
-      // skip if we have just processed the same list of factors
-      // NOTE: not skipping before the first loop because ... assignments_evid!
-      if (val == proposal) continue;
-
-      const VariableToFactor &pv = values[variable.var_val_base + proposal];
-      for (size_t j = 0; j < pv.factor_index_length; ++j) {
-        size_t factor_id = factor_index[pv.factor_index_base + j];
-        sgd_on_factor(factor_id, stepsize * truthiness, variable.id, val,
-                      infrs);
-      }
-    }  // end for
+    // The factors are generated using ddlog rules, and as a result,
+    // only one assignment to the vars involved would activate a factor.
+    // In the FG, we index the factors not by vars, but by <var, value>.
+    // Here we find all factors for this var by iterating through its vals
+    // and that index.
+    for (size_t i = 0; i < variable.internal_cardinality(); ++i) {
+      const VariableToFactor &vtf = values[variable.var_val_base + i];
+
+      // loop through each factor corresponding to this <var, val>
+      for (size_t j = 0; j < vtf.factor_index_length; ++j) {
+        size_t factor_id = factor_index[vtf.factor_index_base + j];
+        const Factor &factor = factors[factor_id];
+
+        if (infrs.weights_isfixed[factor.weight_id]) {
+          continue;
+        }
+
+        // compute E[f] and E[f|D]
+        double pot_free_expected = 0, pot_evid_expected = 0;
+
+        if (!is_noise_aware) {
+          pot_evid_expected =
+              factor.potential(vifs.get(), infrs.assignments_evid.get());
+        }
+
+        for (size_t k = 0; k < variable.internal_cardinality(); ++k) {
+          double pot_free = factor.potential(
+              vifs.get(), infrs.assignments_free.get(), variable.id, k);
+          pot_free_expected += pot_free * probs[k];

+          if (is_noise_aware) {
+            double pot_evid = factor.potential(
+                vifs.get(), infrs.assignments_evid.get(), variable.id, k);
+            pot_evid_expected +=
+                pot_evid * values[variable.var_val_base + k].truthiness;
+          }
+        }
+
+        double gradient = pot_free_expected - pot_evid_expected;
+
+        sgd_on_factor(factor, stepsize, gradient, infrs);
+      }  // end for
+    }  // end for
   }
 }
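Note on the categorical branch above: the old code estimated the gradient E[f] - E[f|D] from single samples (one evidence value, one proposal value), while the new code marginalizes the factor potential over the variable's whole domain, weighting the free chain by the proposal distribution `probs` and the evidence chain by per-value truthiness. A self-contained sketch of that estimator, with a `pot(k)` callback standing in for `Factor::potential` pinned to value k (names here are illustrative, not from the patch):

    #include <cstddef>
    #include <functional>
    #include <vector>

    // grad = E_free[f] - E_evid[f], marginalized over the variable's domain.
    // probs[k]: P(var = k) under the free chain; evid[k]: evidence
    // distribution (truthiness for noise-aware learning, or a point mass
    // on the evidence value).
    double expected_gradient(const std::function<double(size_t)> &pot,
                             const std::vector<double> &probs,
                             const std::vector<double> &evid) {
      double pot_free_expected = 0, pot_evid_expected = 0;
      for (size_t k = 0; k < probs.size(); ++k) {
        pot_free_expected += pot(k) * probs[k];
        pot_evid_expected += pot(k) * evid[k];
      }
      return pot_free_expected - pot_evid_expected;
    }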
9 changes: 5 additions & 4 deletions inference/dimmwitted/src/factor_graph.h
@@ -109,11 +109,12 @@ class FactorGraph {
    * update corresponding weights (stochastic gradient descent).
    */
   void sgd_on_variable(const Variable& variable, InferenceResult& infrs,
-                       double stepsize, bool is_noise_aware);
+                       double stepsize, bool is_noise_aware,
+                       const std::vector<double>& probs);

-  // perform SGD step for weight learning on one factor
-  inline void sgd_on_factor(size_t factor_id, double stepsize, size_t vid,
-                            size_t evidence_value, InferenceResult& infrs);
+  // perform GD step for weight learning on one factor
+  inline void sgd_on_factor(const Factor& factor, double stepsize,
+                            double gradient, InferenceResult& infrs);

   /**
    * Returns the log-linear weighted potential of all the factors for the given
2 changes: 1 addition & 1 deletion inference/dimmwitted/src/gibbs_sampler.cc
@@ -40,7 +40,7 @@ void GibbsSampler::wait() {
 GibbsSamplerThread::GibbsSamplerThread(FactorGraph &fg, InferenceResult &infrs,
                                        size_t ith_shard, size_t n_shards,
                                        const CmdParser &opts)
-    : varlen_potential_buffer_(0),
+    : var_potential_buffer_(0),
       fg(fg),
       infrs(infrs),
       sample_evidence(opts.should_sample_evidence),
26 changes: 15 additions & 11 deletions inference/dimmwitted/src/gibbs_sampler.h
@@ -68,7 +68,7 @@ class GibbsSamplerThread {
   unsigned short p_rand_seed[3];

   // potential for each proposal for categoricals
-  std::vector<double> varlen_potential_buffer_;
+  std::vector<double> var_potential_buffer_;

   // references and cached flags
   FactorGraph &fg;

@@ -133,19 +133,20 @@ inline void GibbsSamplerThread::sample_sgd_single_variable(size_t vid,

   const Variable &variable = fg.variables[vid];

-  // pick a value for the evid Gibbs chain
-  infrs.assignments_evid[variable.id] = sample_evid(variable);
-
   // pick a value for the regular Gibbs chain
   size_t proposal = draw_sample(variable, infrs.assignments_free.get(),
                                 infrs.weight_values.get());
   infrs.assignments_free[variable.id] = proposal;

+  // pick a value for the (parallel) evid Gibbs chain
+  infrs.assignments_evid[variable.id] = sample_evid(variable);
+
   if (!learn_non_evidence && ((!is_noise_aware && !variable.is_evid) ||
                               (is_noise_aware && !variable.has_truthiness())))
     return;

-  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware);
+  fg.sgd_on_variable(variable, infrs, stepsize, is_noise_aware,
+                     var_potential_buffer_);
 }

@@ -215,25 +216,28 @@ inline size_t GibbsSamplerThread::draw_sample(const Variable &variable,
   }

   case DTYPE_CATEGORICAL: {
-    varlen_potential_buffer_.reserve(variable.cardinality);
+    var_potential_buffer_.reserve(variable.cardinality);
     double sum = -100000.0;
     proposal = Variable::INVALID_VALUE;
 // calculate potential for each proposal given a way to iterate the domain
 #define COMPUTE_PROPOSAL(EACH_DOMAIN_VALUE, DOMAIN_VALUE, DOMAIN_INDEX)    \
   do {                                                                     \
     for                                                                    \
       EACH_DOMAIN_VALUE {                                                  \
-        varlen_potential_buffer_[DOMAIN_INDEX] =                           \
+        var_potential_buffer_[DOMAIN_INDEX] =                              \
             fg.potential(variable, DOMAIN_VALUE, assignments,              \
                          weight_values);                                   \
-        sum = logadd(sum, varlen_potential_buffer_[DOMAIN_INDEX]);         \
+        sum = logadd(sum, var_potential_buffer_[DOMAIN_INDEX]);            \
       }                                                                    \
     double r = erand48(p_rand_seed);                                       \
+    bool unset = true;                                                     \
     for                                                                    \
       EACH_DOMAIN_VALUE {                                                  \
-        r -= exp(varlen_potential_buffer_[DOMAIN_INDEX] - sum);            \
-        if (r <= 0) {                                                      \
+        double prob = exp(var_potential_buffer_[DOMAIN_INDEX] - sum);      \
+        var_potential_buffer_[DOMAIN_INDEX] = prob;                        \
+        r -= prob;                                                         \
+        if (r <= 0 && unset) {                                             \
           proposal = DOMAIN_VALUE;                                         \
-          break;                                                           \
+          unset = false;                                                   \
        }                                                                   \
      }                                                                     \
   } while (0)
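Note on COMPUTE_PROPOSAL: the second loop used to `break` as soon as the cumulative probability crossed `r`, but it now has to visit every domain value so that `var_potential_buffer_` ends up holding the full normalized distribution that `sgd_on_variable` later consumes; the `unset` flag keeps the first crossing as the proposal. A standalone sketch of the same pattern (helper names are illustrative, not from the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Numerically stable log(exp(a) + exp(b)), as in the sampler's logadd().
    static double logadd(double a, double b) {
      double m = std::max(a, b);
      return m + std::log1p(std::exp(std::min(a, b) - m));
    }

    // Draw an index from softmax(log_pot) given r uniform in [0, 1), and
    // overwrite log_pot with the normalized probabilities as a side effect.
    size_t draw_and_normalize(std::vector<double> &log_pot, double r) {
      double sum = -100000.0;
      for (double lp : log_pot) sum = logadd(sum, lp);

      size_t proposal = log_pot.size() - 1;  // fallback against rounding error
      bool unset = true;
      for (size_t k = 0; k < log_pot.size(); ++k) {
        double prob = std::exp(log_pot[k] - sum);
        log_pot[k] = prob;  // keep the probability; hence no early break
        r -= prob;
        if (r <= 0 && unset) {
          proposal = k;
          unset = false;
        }
      }
      return proposal;
    }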
9 changes: 8 additions & 1 deletion inference/dimmwitted/src/inference_result.cc
@@ -19,7 +19,8 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const CmdParser &opts)
       assignments_evid(new size_t[nvars]),
       weight_values(new double[nweights]),
       weight_grads(new float[nweights]),
-      weights_isfixed(new bool[nweights]) {}
+      weights_isfixed(new bool[nweights]),
+      weight_freqs(new size_t[nweights]) {}

 InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
                                  const CmdParser &opts)

@@ -29,6 +30,11 @@ InferenceResult::InferenceResult(const FactorGraph &fg, const Weight weights[],
     weight_values[weight.id] = weight.weight;
     weight_grads[weight.id] = 0;
     weights_isfixed[weight.id] = weight.isfixed;
+    weight_freqs[weight.id] = 0;
+  }
+
+  for (size_t i = 0; i < fg.size.num_factors; ++i) {
+    weight_freqs[fg.factors[i].weight_id] += 1;
   }

   for (size_t t = 0; t < nvars; ++t) {

@@ -49,6 +55,7 @@ InferenceResult::InferenceResult(const InferenceResult &other)
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_values, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_grads, nweights);
   COPY_ARRAY_UNIQUE_PTR_MEMBER(weights_isfixed, nweights);
+  COPY_ARRAY_UNIQUE_PTR_MEMBER(weight_freqs, nweights);

   ntallies = other.ntallies;
   COPY_ARRAY_UNIQUE_PTR_MEMBER(sample_tallies, ntallies);
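The new pass is a one-time histogram: weight_freqs[w] counts how many factors share weight w. Equivalent standalone logic (illustrative names, not from the patch):

    #include <cstddef>
    #include <vector>

    // Count how many factors reference each weight.
    std::vector<size_t> count_weight_freqs(
        const std::vector<size_t> &factor_weight_ids, size_t nweights) {
      std::vector<size_t> freqs(nweights, 0);
      for (size_t wid : factor_weight_ids) freqs[wid] += 1;
      return freqs;
    }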
27 changes: 24 additions & 3 deletions inference/dimmwitted/src/inference_result.h
@@ -42,6 +42,10 @@ class InferenceResult {
   // array of whether weight is fixed
   std::unique_ptr<bool[]> weights_isfixed;

+  // number of factors for each weight;
+  // used to amortize the regularization term
+  std::unique_ptr<size_t[]> weight_freqs;
+
   InferenceResult(const FactorGraph &fg, const Weight weights[],
                   const CmdParser &opts);

@@ -66,20 +70,37 @@ class InferenceResult {
   inline void update_weight(size_t wid, double stepsize, double gradient) {
     weight_grads[wid] += gradient;
     double weight = weight_values[wid];
+    size_t total_factors = weight_freqs[wid];

+    // apply l1 or l2 regularization
     switch (opts.regularization) {
       case REG_L2: {
-        // bounded approx of 1 - opts.reg_param * stepsize
-        weight *= (1.0 / (1.0 + opts.reg_param * stepsize));
+        weight *= (1.0 - opts.reg_param * stepsize / total_factors);
         break;
       }
       case REG_L1: {
-        weight += opts.reg_param * (weight < 0);
+        if (weight > 0) {
+          weight =
+              std::max(0.0, weight - opts.reg_param * stepsize / total_factors);
+        } else if (weight < 0) {
+          weight =
+              std::min(0.0, weight + opts.reg_param * stepsize / total_factors);
+        }
         break;
       }
       default:
         std::abort();
     }

+    // apply gradient
     weight -= stepsize * gradient;

+    // clamp
+    if (weight > 10)
+      weight = 10;
+    else if (weight < -10)
+      weight = -10;
+
     weight_values[wid] = weight;
   }
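Why divide by weight_freqs: update_weight fires once per factor attached to a weight, so a weight shared by N factors would otherwise be decayed N times per sweep; amortizing by N keeps the total decay per sweep near reg_param * stepsize for every weight, as the new member's comment says. The L1 branch is now a soft-threshold (proximal) step that can shrink a weight to exactly zero but never push it past zero, unlike the old one-sided `weight += opts.reg_param * (weight < 0)`. A self-contained sketch of one update under those assumptions:

    #include <algorithm>
    #include <cstddef>

    // One amortized regularization + gradient step for a single weight.
    // lambda is the regularization strength divided across the factors
    // sharing this weight.
    double update_weight_sketch(double weight, double gradient, double stepsize,
                                double reg_param, size_t total_factors,
                                bool use_l2) {
      double lambda = reg_param * stepsize / total_factors;
      if (use_l2) {
        weight *= (1.0 - lambda);  // L2: multiplicative decay toward zero
      } else if (weight > 0) {
        weight = std::max(0.0, weight - lambda);  // L1: soft-threshold
      } else if (weight < 0) {
        weight = std::min(0.0, weight + lambda);
      }
      weight -= stepsize * gradient;                   // plain SGD step
      return std::max(-10.0, std::min(10.0, weight));  // clamp to [-10, 10]
    }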
7 changes: 4 additions & 3 deletions inference/dimmwitted/test/factor_graph_test.cc
@@ -21,6 +21,7 @@ class FactorGraphTest : public testing::Test {
   std::unique_ptr<FactorGraph> cfg;
   std::unique_ptr<InferenceResult> infrs;
   std::unique_ptr<CmdParser> cmd_parser;
+  std::vector<double> dummy_probs;

   virtual void SetUp() {
     const char* argv[] = {

@@ -54,14 +55,14 @@ class FactorGraphTest : public testing::Test {
 TEST_F(FactorGraphTest, sgd_on_variable) {
   infrs->assignments_free[cfg->variables[0].id] = 0;

-  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[0], *infrs, 0.1, false, dummy_probs);
   std::cout << "The weight value is: " << infrs->weight_values[0] << std::endl;
   EXPECT_EQ(infrs->weight_values[0], 0.2);

-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
   EXPECT_EQ(infrs->weight_values[0], 0.2);

-  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false);
+  cfg->sgd_on_variable(cfg->variables[10], *infrs, 0.1, false, dummy_probs);
   EXPECT_EQ(infrs->weight_values[0], 0.2);
 }