Merge pull request #58 from rapidsurveys:dev

create step-by-step vignette'; fix #42
rapidsurveys · Jan 10, 2025 · fe931be · fe931be
2 parents 907f18b + 2fec164
commit fe931be
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 29 deletions.
diff --git a/R/boot_bw.R b/R/boot_bw.R
@@ -89,7 +89,9 @@ boot_bw_parallel <- function(x, w, statistic,
                              params, outputColumns = params,
                              replicates = 400,
                              strata = NULL,
-                             cores = parallelly::availableCores(omit = 1)) {    
+                             cores = parallelly::availableCores(omit = 1)) {  
+  cli::cli_h2("Resampling in parallel")
+
   ## Setup parallelism ----
   cli::cli_progress_step("Setting up {.strong {cores}} parallel operations")
   cl <- parallel::makeCluster(cores)
@@ -153,7 +155,6 @@ boot_bw_parallel <- function(x, w, statistic,
 
   ## Create list output and append class ----
   boot <- list(
-    statistic = deparse(substitute(statistic)),
     params = params,
     replicates = replicates,
     strata = strata,
@@ -176,10 +177,8 @@ boot_bw_sequential <- function(x, w, statistic,
                                params, outputColumns = params,
                                replicates = 400,
                                strata = NULL) {
-  x_name <- deparse(substitute(x))
-  stat_name <- deparse(substitute(statistic))
-
   cli::cli_h2("Resampling sequentially")
+
   ## Resample ----
   if (is.null(strata)) {
     cli::cli_h3("Resampling with {.strong {replicates}} replicates")
@@ -244,7 +243,6 @@ boot_bw_sequential <- function(x, w, statistic,
 
   ## Create list output and append class ----
   boot <- list(
-    statistic = deparse(substitute(statistic)),
     params = params,
     replicates = replicates,
     strata = strata,
@@ -264,6 +262,8 @@ boot_bw_sequential <- function(x, w, statistic,
 #' 
 
 boot_bw_weight <- function(w) {
+  w_name <- as.character(substitute(w))
+
   req_names <- c("psu", "pop", "weight", "cumWeight")
   names_check <- req_names %in% names(w)
   names_in <- req_names[names_check]
@@ -280,7 +280,7 @@ boot_bw_weight <- function(w) {
       w$cumWeight <- cumsum(w$weight)
     } else {
       cli::cli_abort(
-        "{.arg w} doesn't have the needed variables or they are not named appropriately"
+        "{.strong {w_name}} doesn't have the needed variables or they are not named appropriately"
       )
     }
   }

diff --git a/R/utils.R b/R/utils.R
@@ -8,14 +8,16 @@ check_params <- function(x, params) {
   params_in <- params[which(params %in% names(x))]
   params_out <- params[which(!params %in% names(x))]
 
+  x_name <- as.character(substitute(x))
+
   if (length(params_in) == 0) {
     if (length(params) > 1) {
       cli::cli_abort(
-        "{.val {params}} are not variables in {.arg x}"
+        "{.strong {params}} are not variables in {.strong {x_name}}"
       )
     } else {
       cli::cli_abort(
-        "{.val {params}} is not a variable in {.arg x}"
+        "{.strong {params}} is not a variable in {.strong {x_name}}"
       )
     }
   } else {
@@ -33,25 +35,25 @@ check_params <- function(x, params) {
       if (length(params_in) == 1) {
         cli::cli_bullets(
           c(
-            "v" = "{.val {params_in}} is a variable in {.arg x}",
+            "v" = "{.strong {params_in}} is a variable in {.strong {x_name}}",
             "!" = ifelse(
               length(params_out) > 1,
-              "{.val {params_out}} are not variables in {.arg x}",
-              "{.val {params_out}} is not a variable in {.arg x}"
+              "{.strong {params_out}} are not variables in {.strong {x_name}}",
+              "{.strong {params_out}} is not a variable in {.strong {x_name}}"
             ),
-            "i" = "Returning {.val {params_in}}"
+            "i" = "Returning {.strong {params_in}}"
           )
         )
       } else {
         cli::cli_bullets(
           c(
-            "v" = "{.val {params_in}} are variables in {.arg x}",
+            "v" = "{.strong {params_in}} are variables in {.strong {x_name}}",
             "!" = ifelse(
               length(params_out) > 1,
-              "{.val {params_out}} are not variables in {.arg x}",
-              "{.val {params_out}} is not a variable in {.arg x}"
+              "{.strong {params_out}} are not variables in {.strong {x_name}}",
+              "{.strong {params_out}} is not a variable in {.strong {x_name}}"
             ),
-            "i" = "Returning {.val {params_in}}"
+            "i" = "Returning {.strong {params_in}}"
           )
         )
       }
@@ -72,19 +74,21 @@ check_data <- function(x) {
   data_name_check <- "psu" %in% names(x)
   data_structure_check <- ncol(x) > 1
 
+  x_name <- as.character(substitute(x))
+
   if (data_name_check) {
     if (data_structure_check) {
       cli::cli_alert_success(
-        "{.arg x} has the appropriate/expected data structure"
+        "{.strong {x_name}} has the appropriate/expected data structure"
       )
     } else {
       cli::cli_abort(
-        "{.var x} doesn't have variables with data to estimate"
+        "{.strong {x_name}} doesn't have variables with data to estimate"
       )
     }
   } else {
     cli::cli_abort(
-      "{.var x} doesn't have a {.var psu} variable or has a different name"
+      "{.strong {x_name}} doesn't have a {.var psu} variable or has a different name"
     )
   }
 }
@@ -130,17 +134,17 @@ check_est_df <- function(est_df, strata) {
   data_name_in <- c(strata, "est", "se")[which(data_name_check)]
   data_name_out <- c(strata, "est", "se")[which(!data_name_check)]
 
-  arg_name <- deparse(substitute(est_df))
+  arg_name <- as.character(substitute(est_df))
 
   message_out <- ifelse(
     length(data_name_out) == 1,
-    "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
-    "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
+    "{.strong {arg_name}} doesn't have a {.strong {data_name_out}} variable or has a different name",
+    "{.strong {arg_name}} doesn't have {.strong {data_name_out}} variables or have different names"
   )
 
   if (all(data_name_check)) {
     cli::cli_alert_success(
-      "{.arg est_df} has the appropriate/expected variables"
+      "{.strong {arg_name}} has the appropriate/expected variables"
     )
   } else {
     cli::cli_abort(message_out)
@@ -159,17 +163,17 @@ check_pop_df <- function(pop_df) {
   data_name_in <- c("strata", "pop")[which(data_name_check)]
   data_name_out <- c("strata", "pop")[which(!data_name_check)]
 
-  arg_name <- deparse(substitute(pop_df))
+  arg_name <- as.character(substitute(pop_df))
 
   message_out <- ifelse(
     length(data_name_out) == 1,
-    "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
-    "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
+    "{.strong {arg_name}} doesn't have a {.strong {data_name_out}} variable or has a different name",
+    "{.strong {arg_name}} doesn't have {.strong {data_name_out}} variables or have different names"
   )
 
   if (all(data_name_check)) {
     cli::cli_alert_success(
-      "{.arg pop_df} has the appropriate/expected variables"
+      "{.strong {arg_name}} has the appropriate/expected variables"
     )
   } else {
     cli::cli_abort(message_out)

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -1,4 +1,5 @@
 Arimond
+Bakool
 Boateng
 CMD
 CodeFactor
@@ -21,6 +22,7 @@ PSUs
 RapidSurveys
 Ruel
 Sankar
+Shabelle
 Siling
 Sodani
 codecov

diff --git a/vignettes/bbw-estimation.Rmd b/vignettes/bbw-estimation.Rmd
@@ -18,8 +18,101 @@ knitr::opts_chunk$set(
 
 ```{r setup, echo = FALSE}
 library(bbw)
+set.seed(1977)
 ```
 
-The `{bbw}` package was developed primarily as a tool for analysing complex sample survey data. 
+The `{bbw}` package was developed primarily as a tool for analysing complex sample survey data. It was developed specifically for use with the [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and the [Simple Spatial Survey Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543).
 
+The [`indicatorsHH`](https://rapidsurveys.io/bbw/reference/indicatorsHH.html) is a survey dataset collected from a RAM survey in Bakool, Bay, and Middle Shabelle regions of Somalia. The [`villageData`](https://rapidsurveys.io/bbw/reference/villageData.html) contains the list of villages/clusters that were sampled in the survey that collected the `indicatorsHH` dataset. These is a good set of data to demonstrate the use of the `{bbw}` package to perform blocked weighted bootstrap estimation. This demonstration focuses on the original set of functions included in the `{bbw}` package.
+
+## Bootstrap resampling
+
+There are two functions in the `{bbw}` package that can be used for bootstrap resampling. The `bootBW()` is the original bootstrap resampling function of the package. It can be used as follows:
+
+```{r orig-bootstrap-1}
+boot_df <- bootBW(
+  x = indicatorsHH, w = villageData, statistic = bootClassic,
+  params = c("anc1", "anc2")
+)
+```
+
+This call to `bootBW()` takes in the survey dataset `indicatorsHH` as its first argument (`x`). This dataset is expected to have a variable labelled as `psu` which identifies the primary sampling unit from which data was collected during the survey and then additional variables for the indicators to be estimated. The second argument (`w`) is for the dataset of the list of primary sampling units that were sampled in the survey to collect the survey data specified in `x`. This dataset, which in this case is `villageData`, should have at least a variable labelled `psu` which identified the primary sampling unit that matches the same variable in the survey dataset and a variable labelled `pop` for the population size of the primary sampling unit. The `statistic` argument specified the type of statistic to apply to the bootstrap replicates. There are two of these functions available from the `{bbw}` package - `bootClassic()` and the `bootPROBIT()`. For this example, the `bootClassic()` function is used to get the mean value of the bootstrap replicates. This is generally useful for binomial type of indicators and for continuous variables of which to get the mean of. The `params` argument takes in values of the indicator names in `x` to be estimated. In this example, two indicator names for antenatal care are specified. Finally, the argument for `replicates` specify the number of replicate bootstraps to be performed. The default of 400 replicates is used here. This results in the following (showing first 10 rows):
+
+```{r orig-bootstrap-2}
+head(boot_df, 10)
+```
+
+The result is a `data.frame()` of bootstrap replicates with number of rows equal to the number or replicates and number of columns equal to the number of `params` specified. Hence, `boot_df` has 400 rows and 2 columns.
+
+## Bootstrap estimation
+
+Using `boot_df` containing bootstrap replicates of the indicators `anc1` and `anc2`, estimating each indicator with a 95% confidence interval using the *percentile bootstrap method*. This can be simply done using the `quantile()` function from the `stats` package as follows:
+
+```{r orig-bootstrap-3}
+est_df <- lapply(
+  X = boot_df,
+  FUN = quantile,
+  probs = c(0.5, 0.025, 0.975)
+) |>
+  do.call(rbind, args = _)
+```
+
+The `quantile()` function is used to get the 50th percentile (for the estimate) and the 2.5th and the 97.5th percentile of the bootstrap replicates to get the lower confidence limit and the upper confidence limits (respectively) of the indicator estimate. This gives the following results:
+
+```{r orig-bootstrap-4}
+est_df
+```
+
+## Stratified bootstrap resampling
+
+Note that the `indicatorsHH` dataset has geographical stratification. Specifically, the survey from which this data was collected was designed to be representative of three regions in Somalia with the regions identified through the `region` variable in `indicatorsHH`. Because of this the more appropriate bootstrap resampling approach would be to resample within each region. To do this using the original `bootBW()` function would require restructuring the survey dataset by region and then passing the region-stratified datasets individually to the `bootBW()` function. This may look something like this:
+
+```{r orig-bootstrap-5}
+## Split indicators by region ----
+indicators_by_region <- split(indicatorsHH, f = indicatorsHH$region)
+
+## Split psus by region ----
+psus_by_region <- split(villageData, f = villageData$region)
+
+## Bootstrap
+boot_df <- Map(
+  f = bootBW, 
+  x = indicators_by_region, 
+  w = psus_by_region, 
+  statistic = rep(list(get("bootClassic")), length(indicators_by_region)), 
+  params = rep(list(c("anc1", "anc2")), length(indicators_by_region))
+)
+```
+
+The `bootBW()` function only accepts single `data.frame` inputs for `x` and `w` arguments. Hence, to resample data from within region, the datasets will have to be split into separate `data.frame` inputs per region and then `bootBW()` applied to each separately. In the example above, this is done by concatenating each of the inputs to `bootBW()` into a list and then using the `Map()` function is sent to `bootBW()` sequentially. This produces a list of the `data.frame` bootstrap resample for each region (shown below):
+
+```{r orig-bootstrap-6}
+boot_df
+```
+
+To estimate the per region results from this bootstrap resampling, the following can be implemented:
+
+```{r orig-bootstrap-7}
+est_df <- lapply(
+  X = boot_df, 
+  FUN = function(x) lapply(
+    x, FUN = quantile, probs = c(0.5, 0.025, 0.975)
+  ) |> 
+    do.call(rbind, args = _)
+)
+
+est_df <- data.frame(
+  region = names(est_df),
+  indicators = lapply(est_df, FUN = row.names) |> unlist(),
+  do.call(rbind, args = est_df)
+)
+
+row.names(est_df) <- NULL
+```
+
+which results in the following output:
+
+```{r orig-bootstrap-8}
+est_df
+```