Skip to content

Commit

Permalink
Merge pull request #58 from rapidsurveys:dev
Browse files Browse the repository at this point in the history
create step-by-step vignette'; fix #42
  • Loading branch information
ernestguevarra authored Jan 10, 2025
2 parents 907f18b + 2fec164 commit fe931be
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 29 deletions.
14 changes: 7 additions & 7 deletions R/boot_bw.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ boot_bw_parallel <- function(x, w, statistic,
params, outputColumns = params,
replicates = 400,
strata = NULL,
cores = parallelly::availableCores(omit = 1)) {
cores = parallelly::availableCores(omit = 1)) {
cli::cli_h2("Resampling in parallel")

## Setup parallelism ----
cli::cli_progress_step("Setting up {.strong {cores}} parallel operations")
cl <- parallel::makeCluster(cores)
Expand Down Expand Up @@ -153,7 +155,6 @@ boot_bw_parallel <- function(x, w, statistic,

## Create list output and append class ----
boot <- list(
statistic = deparse(substitute(statistic)),
params = params,
replicates = replicates,
strata = strata,
Expand All @@ -176,10 +177,8 @@ boot_bw_sequential <- function(x, w, statistic,
params, outputColumns = params,
replicates = 400,
strata = NULL) {
x_name <- deparse(substitute(x))
stat_name <- deparse(substitute(statistic))

cli::cli_h2("Resampling sequentially")

## Resample ----
if (is.null(strata)) {
cli::cli_h3("Resampling with {.strong {replicates}} replicates")
Expand Down Expand Up @@ -244,7 +243,6 @@ boot_bw_sequential <- function(x, w, statistic,

## Create list output and append class ----
boot <- list(
statistic = deparse(substitute(statistic)),
params = params,
replicates = replicates,
strata = strata,
Expand All @@ -264,6 +262,8 @@ boot_bw_sequential <- function(x, w, statistic,
#'

boot_bw_weight <- function(w) {
w_name <- as.character(substitute(w))

req_names <- c("psu", "pop", "weight", "cumWeight")
names_check <- req_names %in% names(w)
names_in <- req_names[names_check]
Expand All @@ -280,7 +280,7 @@ boot_bw_weight <- function(w) {
w$cumWeight <- cumsum(w$weight)
} else {
cli::cli_abort(
"{.arg w} doesn't have the needed variables or they are not named appropriately"
"{.strong {w_name}} doesn't have the needed variables or they are not named appropriately"
)
}
}
Expand Down
46 changes: 25 additions & 21 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ check_params <- function(x, params) {
params_in <- params[which(params %in% names(x))]
params_out <- params[which(!params %in% names(x))]

x_name <- as.character(substitute(x))

if (length(params_in) == 0) {
if (length(params) > 1) {
cli::cli_abort(
"{.val {params}} are not variables in {.arg x}"
"{.strong {params}} are not variables in {.strong {x_name}}"
)
} else {
cli::cli_abort(
"{.val {params}} is not a variable in {.arg x}"
"{.strong {params}} is not a variable in {.strong {x_name}}"
)
}
} else {
Expand All @@ -33,25 +35,25 @@ check_params <- function(x, params) {
if (length(params_in) == 1) {
cli::cli_bullets(
c(
"v" = "{.val {params_in}} is a variable in {.arg x}",
"v" = "{.strong {params_in}} is a variable in {.strong {x_name}}",
"!" = ifelse(
length(params_out) > 1,
"{.val {params_out}} are not variables in {.arg x}",
"{.val {params_out}} is not a variable in {.arg x}"
"{.strong {params_out}} are not variables in {.strong {x_name}}",
"{.strong {params_out}} is not a variable in {.strong {x_name}}"
),
"i" = "Returning {.val {params_in}}"
"i" = "Returning {.strong {params_in}}"
)
)
} else {
cli::cli_bullets(
c(
"v" = "{.val {params_in}} are variables in {.arg x}",
"v" = "{.strong {params_in}} are variables in {.strong {x_name}}",
"!" = ifelse(
length(params_out) > 1,
"{.val {params_out}} are not variables in {.arg x}",
"{.val {params_out}} is not a variable in {.arg x}"
"{.strong {params_out}} are not variables in {.strong {x_name}}",
"{.strong {params_out}} is not a variable in {.strong {x_name}}"
),
"i" = "Returning {.val {params_in}}"
"i" = "Returning {.strong {params_in}}"
)
)
}
Expand All @@ -72,19 +74,21 @@ check_data <- function(x) {
data_name_check <- "psu" %in% names(x)
data_structure_check <- ncol(x) > 1

x_name <- as.character(substitute(x))

if (data_name_check) {
if (data_structure_check) {
cli::cli_alert_success(
"{.arg x} has the appropriate/expected data structure"
"{.strong {x_name}} has the appropriate/expected data structure"
)
} else {
cli::cli_abort(
"{.var x} doesn't have variables with data to estimate"
"{.strong {x_name}} doesn't have variables with data to estimate"
)
}
} else {
cli::cli_abort(
"{.var x} doesn't have a {.var psu} variable or has a different name"
"{.strong {x_name}} doesn't have a {.var psu} variable or has a different name"
)
}
}
Expand Down Expand Up @@ -130,17 +134,17 @@ check_est_df <- function(est_df, strata) {
data_name_in <- c(strata, "est", "se")[which(data_name_check)]
data_name_out <- c(strata, "est", "se")[which(!data_name_check)]

arg_name <- deparse(substitute(est_df))
arg_name <- as.character(substitute(est_df))

message_out <- ifelse(
length(data_name_out) == 1,
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
"{.strong {arg_name}} doesn't have a {.strong {data_name_out}} variable or has a different name",
"{.strong {arg_name}} doesn't have {.strong {data_name_out}} variables or have different names"
)

if (all(data_name_check)) {
cli::cli_alert_success(
"{.arg est_df} has the appropriate/expected variables"
"{.strong {arg_name}} has the appropriate/expected variables"
)
} else {
cli::cli_abort(message_out)
Expand All @@ -159,17 +163,17 @@ check_pop_df <- function(pop_df) {
data_name_in <- c("strata", "pop")[which(data_name_check)]
data_name_out <- c("strata", "pop")[which(!data_name_check)]

arg_name <- deparse(substitute(pop_df))
arg_name <- as.character(substitute(pop_df))

message_out <- ifelse(
length(data_name_out) == 1,
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
"{.strong {arg_name}} doesn't have a {.strong {data_name_out}} variable or has a different name",
"{.strong {arg_name}} doesn't have {.strong {data_name_out}} variables or have different names"
)

if (all(data_name_check)) {
cli::cli_alert_success(
"{.arg pop_df} has the appropriate/expected variables"
"{.strong {arg_name}} has the appropriate/expected variables"
)
} else {
cli::cli_abort(message_out)
Expand Down
2 changes: 2 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Arimond
Bakool
Boateng
CMD
CodeFactor
Expand All @@ -21,6 +22,7 @@ PSUs
RapidSurveys
Ruel
Sankar
Shabelle
Siling
Sodani
codecov
Expand Down
95 changes: 94 additions & 1 deletion vignettes/bbw-estimation.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,101 @@ knitr::opts_chunk$set(

```{r setup, echo = FALSE}
library(bbw)
set.seed(1977)
```

The `{bbw}` package was developed primarily as a tool for analysing complex sample survey data.
The `{bbw}` package was developed primarily as a tool for analysing complex sample survey data. It was developed specifically for use with the [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and the [Simple Spatial Survey Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543).

The [`indicatorsHH`](https://rapidsurveys.io/bbw/reference/indicatorsHH.html) is a survey dataset collected from a RAM survey in Bakool, Bay, and Middle Shabelle regions of Somalia. The [`villageData`](https://rapidsurveys.io/bbw/reference/villageData.html) contains the list of villages/clusters that were sampled in the survey that collected the `indicatorsHH` dataset. These is a good set of data to demonstrate the use of the `{bbw}` package to perform blocked weighted bootstrap estimation. This demonstration focuses on the original set of functions included in the `{bbw}` package.

## Bootstrap resampling

There are two functions in the `{bbw}` package that can be used for bootstrap resampling. The `bootBW()` is the original bootstrap resampling function of the package. It can be used as follows:

```{r orig-bootstrap-1}
boot_df <- bootBW(
x = indicatorsHH, w = villageData, statistic = bootClassic,
params = c("anc1", "anc2")
)
```

This call to `bootBW()` takes in the survey dataset `indicatorsHH` as its first argument (`x`). This dataset is expected to have a variable labelled as `psu` which identifies the primary sampling unit from which data was collected during the survey and then additional variables for the indicators to be estimated. The second argument (`w`) is for the dataset of the list of primary sampling units that were sampled in the survey to collect the survey data specified in `x`. This dataset, which in this case is `villageData`, should have at least a variable labelled `psu` which identified the primary sampling unit that matches the same variable in the survey dataset and a variable labelled `pop` for the population size of the primary sampling unit. The `statistic` argument specified the type of statistic to apply to the bootstrap replicates. There are two of these functions available from the `{bbw}` package - `bootClassic()` and the `bootPROBIT()`. For this example, the `bootClassic()` function is used to get the mean value of the bootstrap replicates. This is generally useful for binomial type of indicators and for continuous variables of which to get the mean of. The `params` argument takes in values of the indicator names in `x` to be estimated. In this example, two indicator names for antenatal care are specified. Finally, the argument for `replicates` specify the number of replicate bootstraps to be performed. The default of 400 replicates is used here. This results in the following (showing first 10 rows):

```{r orig-bootstrap-2}
head(boot_df, 10)
```

The result is a `data.frame()` of bootstrap replicates with number of rows equal to the number or replicates and number of columns equal to the number of `params` specified. Hence, `boot_df` has 400 rows and 2 columns.

## Bootstrap estimation

Using `boot_df` containing bootstrap replicates of the indicators `anc1` and `anc2`, estimating each indicator with a 95% confidence interval using the *percentile bootstrap method*. This can be simply done using the `quantile()` function from the `stats` package as follows:

```{r orig-bootstrap-3}
est_df <- lapply(
X = boot_df,
FUN = quantile,
probs = c(0.5, 0.025, 0.975)
) |>
do.call(rbind, args = _)
```

The `quantile()` function is used to get the 50th percentile (for the estimate) and the 2.5th and the 97.5th percentile of the bootstrap replicates to get the lower confidence limit and the upper confidence limits (respectively) of the indicator estimate. This gives the following results:

```{r orig-bootstrap-4}
est_df
```

## Stratified bootstrap resampling

Note that the `indicatorsHH` dataset has geographical stratification. Specifically, the survey from which this data was collected was designed to be representative of three regions in Somalia with the regions identified through the `region` variable in `indicatorsHH`. Because of this the more appropriate bootstrap resampling approach would be to resample within each region. To do this using the original `bootBW()` function would require restructuring the survey dataset by region and then passing the region-stratified datasets individually to the `bootBW()` function. This may look something like this:

```{r orig-bootstrap-5}
## Split indicators by region ----
indicators_by_region <- split(indicatorsHH, f = indicatorsHH$region)
## Split psus by region ----
psus_by_region <- split(villageData, f = villageData$region)
## Bootstrap
boot_df <- Map(
f = bootBW,
x = indicators_by_region,
w = psus_by_region,
statistic = rep(list(get("bootClassic")), length(indicators_by_region)),
params = rep(list(c("anc1", "anc2")), length(indicators_by_region))
)
```

The `bootBW()` function only accepts single `data.frame` inputs for `x` and `w` arguments. Hence, to resample data from within region, the datasets will have to be split into separate `data.frame` inputs per region and then `bootBW()` applied to each separately. In the example above, this is done by concatenating each of the inputs to `bootBW()` into a list and then using the `Map()` function is sent to `bootBW()` sequentially. This produces a list of the `data.frame` bootstrap resample for each region (shown below):

```{r orig-bootstrap-6}
boot_df
```

To estimate the per region results from this bootstrap resampling, the following can be implemented:

```{r orig-bootstrap-7}
est_df <- lapply(
X = boot_df,
FUN = function(x) lapply(
x, FUN = quantile, probs = c(0.5, 0.025, 0.975)
) |>
do.call(rbind, args = _)
)
est_df <- data.frame(
region = names(est_df),
indicators = lapply(est_df, FUN = row.names) |> unlist(),
do.call(rbind, args = est_df)
)
row.names(est_df) <- NULL
```

which results in the following output:

```{r orig-bootstrap-8}
est_df
```

0 comments on commit fe931be

Please sign in to comment.