Adding dual averaging scale adapter implementation (#46)

* Adding dual averaging scale adapter implementation * Fix indentation * Rename variance_adapter to variance_shape_adapter
UCL · Oct 24, 2024 · 5d29b85 · 5d29b85
1 parent 517bde4
commit 5d29b85
Show file tree

Hide file tree

Showing 11 changed files with 322 additions and 73 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,13 +2,14 @@
 
 export(barker_proposal)
 export(chain_state)
+export(dual_averaging_scale_adapter)
 export(example_gaussian_stan_model)
 export(hamiltonian_proposal)
 export(langevin_proposal)
 export(random_walk_proposal)
 export(robust_shape_adapter)
 export(sample_chain)
-export(scale_adapter)
+export(simple_scale_adapter)
 export(target_distribution_from_stan_model)
 export(trace_function_from_stan_model)
-export(variance_adapter)
+export(variance_shape_adapter)
diff --git a/R/adaptation.R b/R/adaptation.R
@@ -1,4 +1,5 @@
-#' Create object to adapt proposal scale to coerce average acceptance rate.
+#' Create object to adapt proposal scale to coerce average acceptance rate using
+#' a Robbins and Monro (1951) scheme.
 #'
 #' @param initial_scale Initial value to use for scale parameter. If not set
 #'   explicitly a proposal and dimension dependent default will be used.
@@ -25,9 +26,9 @@
 #'   grad_log_density = function(x) -x
 #' )
 #' proposal <- barker_proposal(target_distribution)
-#' adapter <- scale_adapter(initial_scale = 1., target_accept_prob = 0.4)
+#' adapter <- simple_scale_adapter(initial_scale = 1., target_accept_prob = 0.4)
 #' adapter$initialize(proposal, chain_state(c(0, 0)))
-scale_adapter <- function(
+simple_scale_adapter <- function(
     initial_scale = NULL, target_accept_prob = NULL, kappa = 0.6) {
   log_scale <- NULL
   initialize <- function(proposal, initial_state) {
@@ -41,9 +42,9 @@ scale_adapter <- function(
     if (is.null(target_accept_prob)) {
       target_accept_prob <- proposal$default_target_accept_prob()
     }
-    gamma <- sample_index^(-kappa)
+    beta <- sample_index^(-kappa)
     accept_prob <- state_and_statistics$statistics$accept_prob
-    log_scale <<- log_scale + gamma * (accept_prob - target_accept_prob)
+    log_scale <<- log_scale + beta * (accept_prob - target_accept_prob)
     proposal$update(scale = exp(log_scale))
   }
   list(
@@ -54,12 +55,95 @@ scale_adapter <- function(
   )
 }
 
+#' Create object to adapt proposal scale to coerce average acceptance rate
+#' using dual averaging scheme of Nesterov (2009) and Hoffman and Gelman (2014).
+#'
+#' @inherit simple_scale_adapter params return
+#'
+#' @param gamma Regularization coefficient for (log) scale in dual averaging
+#'   algorithm. Controls amount of regularization of (log) scale towards `mu`.
+#'   Should be set to a non-negative value. Defaults to value recommended in
+#'   Hoffman and Gelman (2014).
+#' @param iteration_offset Offset to chain iteration used for the iteration
+#'   based weighting of the adaptation statistic error estimate. Should be set
+#'   to a non-negative value. A value greater than zero has the effect of
+#'   stabilizing early iterations. Defaults to value recommended in
+#'   Hoffman and Gelman (2014).
+#' @param mu Value to regularize (log) scale towards. If `NULL` (the default),
+#'   `mu` will be set to `log(10 * initial_scale)`, as recommended in Hoffman
+#'   and Gelman (2014).
+#'
+#' @export
+#'
+#' @examples
+#' target_distribution <- list(
+#'   log_density = function(x) -sum(x^2) / 2,
+#'   grad_log_density = function(x) -x
+#' )
+#' proposal <- barker_proposal(target_distribution)
+#' adapter <- dual_averaging_scale_adapter(
+#'   initial_scale = 1., target_accept_prob = 0.4
+#' )
+#' adapter$initialize(proposal, chain_state(c(0, 0)))
+dual_averaging_scale_adapter <- function(
+    initial_scale = NULL,
+    target_accept_prob = NULL,
+    kappa = 0.75,
+    gamma = 0.05,
+    iteration_offset = 10,
+    mu = NULL) {
+  log_scale <- NULL
+  smoothed_log_scale <- 0
+  accept_prob_error <- 0
+  initialize <- function(proposal, initial_state) {
+    if (is.null(initial_scale)) {
+      initial_scale <- proposal$default_initial_scale(initial_state$dimension())
+    }
+    if (is.null(mu)) {
+      mu <<- log(10 * initial_scale)
+    }
+    log_scale <<- log(initial_scale)
+    proposal$update(scale = initial_scale)
+  }
+  update <- function(proposal, sample_index, state_and_statistics) {
+    if (is.null(target_accept_prob)) {
+      target_accept_prob <- proposal$default_target_accept_prob()
+    }
+    accept_prob <- state_and_statistics$statistics$accept_prob
+    offset_sample_index <- sample_index + iteration_offset
+    accept_prob_error <<- (
+      (1 - 1 / offset_sample_index) * accept_prob_error + (
+        target_accept_prob - accept_prob
+      ) / offset_sample_index
+    )
+    log_scale <<- mu - sqrt(sample_index) * accept_prob_error / gamma
+    beta <- sample_index^(-kappa)
+    smoothed_log_scale <<- beta * log_scale + (1 - beta) * smoothed_log_scale
+    proposal$update(scale = exp(log_scale))
+  }
+  finalize <- function(proposal) {
+    proposal$update(scale = exp(smoothed_log_scale))
+  }
+  list(
+    initialize = initialize,
+    update = update,
+    finalize = finalize,
+    state = function() {
+      list(
+        log_scale = log_scale,
+        smoothed_log_scale = smoothed_log_scale,
+        accept_prob_error = accept_prob_error
+      )
+    }
+  )
+}
+
 #' Create object to adapt proposal with per dimension scales based on estimates
 #' of target distribution variances.
 #'
 #' @param kappa Decay rate exponent in `[0.5, 1]` for adaptation learning rate.
 #'
-#' @inherit scale_adapter return
+#' @inherit simple_scale_adapter return
 #'
 #' @export
 #' @examples
@@ -68,20 +152,22 @@ scale_adapter <- function(
 #'   grad_log_density = function(x) -x
 #' )
 #' proposal <- barker_proposal(target_distribution)
-#' adapter <- variance_adapter()
+#' adapter <- variance_shape_adapter()
 #' adapter$initialize(proposal, chain_state(c(0, 0)))
-variance_adapter <- function(kappa = 0.6) {
+variance_shape_adapter <- function(kappa = 0.6) {
   mean_estimate <- NULL
   variance_estimate <- NULL
   initialize <- function(proposal, initial_state) {
     mean_estimate <<- initial_state$position()
     variance_estimate <<- rep(1., initial_state$dimension())
   }
   update <- function(proposal, sample_index, state_and_statistics) {
-    gamma <- sample_index^(-kappa)
+    # Offset sample_index by 1 so that initial unity variance_estimate acts as
+    # regularizer
+    beta <- (sample_index + 1)^(-kappa)
     position <- state_and_statistics$state$position()
-    mean_estimate <<- mean_estimate + gamma * (position - mean_estimate)
-    variance_estimate <<- variance_estimate + gamma * (
+    mean_estimate <<- mean_estimate + beta * (position - mean_estimate)
+    variance_estimate <<- variance_estimate + beta * (
       (position - mean_estimate)^2 - variance_estimate
     )
     proposal$update(shape = sqrt(variance_estimate))
@@ -107,9 +193,9 @@ variance_adapter <- function(kappa = 0.6) {
 #'     coerced acceptance rate. _Statistics and Computing_, 22, 997-1008.
 #'     <https://doi.iorg/10.1007/s11222-011-9269-5>
 #'
-#' @inheritParams scale_adapter
+#' @inheritParams simple_scale_adapter
 #'
-#' @inherit scale_adapter return
+#' @inherit simple_scale_adapter return
 #'
 #' @export
 #'
@@ -139,7 +225,7 @@ robust_shape_adapter <- function(
     momentum <- state_and_statistics$proposed_state$momentum()
     accept_prob <- state_and_statistics$statistics$accept_prob
     shape <<- ramcmc::adapt_S(
-      shape, momentum, accept_prob, sample_index - 1, target_accept_prob, kappa
+      shape, momentum, accept_prob, sample_index, target_accept_prob, kappa
     )
     proposal$update(shape = shape)
   }

diff --git a/R/chains.R b/R/chains.R
@@ -181,7 +181,7 @@ chain_loop <- function(
       state, target_distribution, proposal
     )
     for (adapter in adapters) {
-      adapter$update(proposal, chain_iteration + 1, state_and_statistics)
+      adapter$update(proposal, chain_iteration, state_and_statistics)
     }
     state <- state_and_statistics$state
     if (record_traces_and_statistics) {

diff --git a/README.Rmd b/README.Rmd
@@ -59,7 +59,7 @@ results <- sample_chain(
   initial_state = rnorm(dimension),
   n_warm_up_iteration = 1000,
   n_main_iteration = 1000,
-  adapters = list(scale_adapter(), variance_adapter())
+  adapters = list(simple_scale_adapter(), variance_shape_adapter())
 )
 mean_accept_prob <- mean(results$statistics[, "accept_prob"])
 adapted_shape <- proposal$parameters()$shape

diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ results <- sample_chain(
   initial_state = rnorm(dimension),
   n_warm_up_iteration = 1000,
   n_main_iteration = 1000,
-  adapters = list(scale_adapter(), variance_adapter())
+  adapters = list(simple_scale_adapter(), variance_shape_adapter())
 )
 mean_accept_prob <- mean(results$statistics[, "accept_prob"])
 adapted_shape <- proposal$parameters()$shape
@@ -66,7 +66,7 @@ cat(
   sprintf("Adapter scale est.: %s", toString(adapted_shape)),
   sep = "\n"
 )
-#> Average acceptance probability is 0.40
+#> Average acceptance probability is 0.41
 #> True target scales: 1.50538046096953, 1.37774732725824, 0.277038897322645
-#> Adapter scale est.: 1.35010920408606, 1.5140138215658, 0.248974800274054
+#> Adapter scale est.: 1.2489768457131, 1.23111560302158, 0.215024121396933
 ```
diff --git a/man/dual_averaging_scale_adapter.Rd b/man/dual_averaging_scale_adapter.Rd
diff --git a/man/scale_adapter.Rd → man/simple_scale_adapter.Rd b/man/scale_adapter.Rd → man/simple_scale_adapter.Rd
diff --git a/man/variance_adapter.Rd → man/variance_shape_adapter.Rd b/man/variance_adapter.Rd → man/variance_shape_adapter.Rd