diff --git a/.gitignore b/.gitignore index 4685db3..13dfd2b 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ README.html docs +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index fd5cb3d..bce8a9a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bbw Type: Package Title: Blocked Weighted Bootstrap -Version: 0.2.4.9000 +Version: 0.2.5.9000 Authors@R: c( person("Mark", "Myatt", email = "mark@brixtonhealth.com", role = c("aut", "cph")), diff --git a/NAMESPACE b/NAMESPACE index 98d618e..5c86049 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(boot_bw_sample_clusters) export(boot_bw_sample_within_clusters) export(boot_bw_sequential) export(boot_bw_weight) +export(estimate_total) export(recode) importFrom(car,bcPower) importFrom(car,powerTransform) diff --git a/R/bootProbit.R b/R/bootProbit.R index aa49063..fd38f42 100644 --- a/R/bootProbit.R +++ b/R/bootProbit.R @@ -46,7 +46,7 @@ bootPROBIT <- function(x, params, threshold = THRESHOLD) { d <- car::bcPower(d, lambda) threshold <- car::bcPower(threshold, lambda) m <- mean(d, na.rm = TRUE) - s <- stats::sd(d, na.rm = T) + s <- stats::sd(d, na.rm = TRUE) ## PROBIT estimate ---- x <- stats::pnorm(q = threshold, mean = m, sd = s) diff --git a/R/boot_bw_estimate.R b/R/boot_bw_estimate.R index 7156ddc..e4970cc 100644 --- a/R/boot_bw_estimate.R +++ b/R/boot_bw_estimate.R @@ -51,5 +51,49 @@ boot_bw_estimate <- function(boot_df) { row.names(est) <- NULL ## Return est ---- + est +} + + +#' +#' Boot estimate +#' +#' @keywords internal +#' + +boot_percentile <- function(boot_df) { + if (is.data.frame(boot_df)) { + est <- lapply( + X = boot_df, + FUN = stats::quantile, + probs = c(0.5, 0.025, 0.975), + na.rm = TRUE + ) |> + do.call(rbind, args = _) |> + as.data.frame() + + se <- lapply( + X = boot_df, + FUN = stats::sd, + na.rm = TRUE + ) |> + do.call(rbind, args = _) |> + as.data.frame() + + est <- data.frame(est, se) + + names(est) <- c("est", "lcl", "ucl", "se") + } else { + est <- stats::quantile( + x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE + ) |> + rbind() |> + data.frame( + se = stats::sd(x = boot_df, na.rm = TRUE) + ) + + names(est) <- c("est", "lcl", "ucl", "se") + } + est } \ No newline at end of file diff --git a/R/data.R b/R/data.R index 6760f2b..629b313 100644 --- a/R/data.R +++ b/R/data.R @@ -139,3 +139,34 @@ #' @source Mother and child health and nutrition survey in 3 regions of Somalia #' "indicatorsCH2" + + +#' +#' Somalia regional population in 2022 +#' +#' A data.frame with 19 rows and 18 columns: +#' +#' **Variable** | **Description** +#' :--- | :--- +#' `region` | Region name +#' `total` | Total population +#' `urban` | Total urban population +#' `rural` | Total rural population +#' `idp` | Total IDP population +#' `urban_stressed` | Total urban population - stressed +#' `rural_stressed` | Total rural population - stressed +#' `idp_stressed` | Total IDP population - stressed +#' `urban_crisis` | Total urban population - crisis +#' `rural_crisis` | Total rural population - crisis +#' `idp_crisis` | Total IDP population - crisis +#' `urban_emergency` | Total urban population - emergency +#' `rural_emergency` | Total rural population - emergency +#' `idp_emergency` | Total IDP population - emergency +#' `urban_catastrophe` | Total urban population - catastrophe +#' `rural_catastrophe` | Total rural population - catastrophe +#' `idp_catastrophe` | Total IDP population - catastrophe +#' `percent_at_least_crisis` | Percentage of population that are at least in crisis +#' +#' @source +#' +"somalia_population" diff --git a/R/post_strat_estimation.R b/R/post_strat_estimation.R new file mode 100644 index 0000000..ac154ec --- /dev/null +++ b/R/post_strat_estimation.R @@ -0,0 +1,127 @@ +#' +#' Post-stratification analysis +#' +#' @param est_df A [data.frame()] of stratified indicator estimates to get +#' overall estimates of. `est_df` should have a variable named `est` for +#' the values of the indicator estimate, a variable named `strata` for +#' information on the stratification or grouping of the estimates, and a +#' variable named `se` for the standard errors for the values of the +#' indicator estimate. This is usually produced via a call to +#' [boot_bw_estimate()]. +#' @param pop_df A [data.frame()] with at least two variables: `strata` for the +#' stratification/grouping information that matches `strata` in `est_df` and +#' `pop` for information on population for the given `strata`. +#' +#' @returns A vector of values for the overall estimate, overall 95% lower +#' confidence limit, and overall 95% upper confidence limit for each of the +#' `strata` in `est_df`. +#' +#' @examples +#' est_df <- boot_bw( +#' x = indicatorsHH, w = villageData, statistic = bootClassic, +#' params = "anc1", strata = "region", replicates = 9 +#' ) |> +#' boot_bw_estimate() +#' +#' ## Add population ---- +#' pop_df <- somalia_population |> +#' subset(select = c(region, total)) +#' +#' names(pop_df) <- c("strata", "pop") +#' +#' estimate_total(est_df, pop_df) +#' +#' @export +#' + +estimate_total <- function(est_df, pop_df) { + ## Check the data ---- + check_est_df(est_df) + check_pop_df(pop_df) + + ## Merge estimates with population data ---- + est_pop_df <- merge(pop_df, est_df, by = "strata", all.y = TRUE) + + ## Get total estimates ---- + if (length(unique(est_pop_df$indicator)) > 1) { + est_pop_df <- split(x = est_pop_df, f = est_pop_df$indicator) + + total_est_df <- lapply( + X = est_pop_df, + FUN = estimate_total_ + ) |> + do.call(rbind, args = _) |> + as.data.frame() + } else { + total_est_df <- estimate_total_(est_pop_df) + } + + ## Return estimates ---- + total_est_df +} + + +#' +#' Estimate post-stratification weighted totals +#' +#' @keywords internal +#' + +estimate_total_ <- function(est_pop_df) { + with(est_pop_df, { + data.frame( + strata = "Overall", + indicator = unique(indicator), + est = calc_total_estimate(est, pop), + lcl = calc_total_ci(est, pop, se, "lcl"), + ucl = calc_total_ci(est, pop, se, "ucl"), + se = calc_total_sd(se, pop) + ) + }) +} + + +#' +#' Calculate total estimate +#' +#' @keywords internal +#' + +calc_total_estimate <- function(est, pop) { + sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE) +} + + +#' +#' Calculate total sd +#' +#' @keywords internal +#' + +calc_total_sd <- function(se, pop) { + sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE) +} + + +#' +#' Calculate confidence limits +#' +#' @keywords internal +#' + +calc_total_ci <- function(est, pop, se, ci = c("lcl", "ucl")) { + ci <- match.arg(ci) + + operator <- ifelse(ci == "lcl", "-", "+") + + str2expression( + paste0( + "sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE) ", + operator, + " 1.96 * sqrt(sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE))" + ) + ) |> + eval() +} + + diff --git a/R/utils.R b/R/utils.R index 4e117f1..f808871 100644 --- a/R/utils.R +++ b/R/utils.R @@ -120,32 +120,58 @@ tidy_boot <- function(boot, w, strata, outputColumns) { #' -#' Boot estimate +#' Check est_df #' #' @keywords internal #' -boot_percentile <- function(boot_df) { - if (is.data.frame(boot_df)) { - est <- lapply( - X = boot_df, - FUN = stats::quantile, - probs = c(0.5, 0.025, 0.975), - na.rm = TRUE - ) |> - do.call(rbind, args = _) |> - data.frame() +check_est_df <- function(est_df) { + data_name_check <- c("strata", "est", "se") %in% names(est_df) + data_name_in <- c("strata", "est", "se")[which(data_name_check)] + data_name_out <- c("strata", "est", "se")[which(!data_name_check)] + + arg_name <- deparse(substitute(est_df)) + + message_out <- ifelse( + length(data_name_out) == 1, + "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name", + "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names" + ) - names(est) <- c("est", "lcl", "ucl") + if (all(data_name_check)) { + cli::cli_alert_success( + "{.arg est_df} has the appropriate/expected variables" + ) } else { - est <- stats::quantile( - x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE - ) |> - rbind() |> - data.frame() - - names(est) <- c("est", "lcl", "ucl") + cli::cli_abort(message_out) } +} + + +#' +#' Check pop_df +#' +#' @keywords internal +#' + +check_pop_df <- function(pop_df) { + data_name_check <- c("strata", "pop") %in% names(pop_df) + data_name_in <- c("strata", "pop")[which(data_name_check)] + data_name_out <- c("strata", "pop")[which(!data_name_check)] + + arg_name <- deparse(substitute(pop_df)) - est + message_out <- ifelse( + length(data_name_out) == 1, + "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name", + "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names" + ) + + if (all(data_name_check)) { + cli::cli_alert_success( + "{.arg pop_df} has the appropriate/expected variables" + ) + } else { + cli::cli_abort(message_out) + } } \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 1f8d0e3..b95580a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -29,7 +29,7 @@ knitr::opts_chunk$set( ## Overview -The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in **Standardized Monitoring and Assessment of Relief and Transitions** or **SMART** surveys) or *posterior weighting* (e.g. as used in **Rapid Assessment Method** or **RAM** and **Simple Spatial Sampling Method** or **S3M** surveys) is implemented. +The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in [Standardized Monitoring and Assessment of Relief and Transitions (SMART)](https://smartmethodology.org/) surveys) or *posterior weighting* (e.g. as used in [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and [Simple Spatial Sampling Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543) surveys) is implemented. ## Installation @@ -61,16 +61,16 @@ With RAM and S3M surveys, the sample is complex in the sense that it is an unwei * **Blocked**: The block corresponds to the primary sampling unit (*PSU = cluster*). PSUs are resampled with replacement. Observations within the resampled PSUs are also sampled with replacement. -* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#FIG1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates. +* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#fig1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates. - +

```{r, echo = FALSE, eval = TRUE, out.width = "50%", fig.alt = "Roulette wheel algorithm", fig.align = "center"} knitr::include_graphics("man/figures/rouletteWheel.png") ```

-In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the *fitness proportional selection* technique used in *evolutionary computing*. +In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the [*fitness proportional selection*](https://en.wikipedia.org/wiki/Fitness_proportionate_selection) technique used in *evolutionary computing*. A total of `m` PSUs are sampled with replacement for each bootstrap replicate (where `m` is the number of PSUs in the survey sample). diff --git a/data-raw/population.R b/data-raw/population.R new file mode 100644 index 0000000..e563947 --- /dev/null +++ b/data-raw/population.R @@ -0,0 +1,34 @@ +# Somalia regional population estimates 2022 + +.url <- "https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf" + +somalia_population <- pdftools::pdf_text(.url) |> + stringr::str_split(pattern = "\n") |> + (\(x) x[[1]])() |> + (\(x) x[c(8:15, 18:19, 22:29, 31)])() |> + stringr::str_remove_all(pattern = ",") |> + stringr::str_split(pattern = "[ ]{2,}") |> + do.call(rbind, args = _) |> + data.frame() + +names(somalia_population) <- c( + "region", "total", "urban", "rural", "idp", + "urban_stressed", "rural_stressed", "idp_stressed", + "urban_crisis", "rural_crisis", "idp_crisis", + "urban_emergency", "rural_emergency", "idp_emergency", + "urban_catastrophe", "rural_catastrophe", "idp_catastrophe", + "percent_at_least_crisis" +) + +somalia_population[ , c("total", "urban", "rural", "idp", + "urban_stressed", "rural_stressed", "idp_stressed", + "urban_crisis", "rural_crisis", "idp_crisis", + "urban_emergency", "rural_emergency", "idp_emergency", + "urban_catastrophe", "rural_catastrophe", "idp_catastrophe", + "percent_at_least_crisis")] <- lapply( + X = subset(somalia_population, select = -region), + FUN = as.numeric + ) |> + do.call(cbind, args = _) + +usethis::use_data(somalia_population, overwrite = TRUE, compress = "xz") \ No newline at end of file diff --git a/data/somalia_population.rda b/data/somalia_population.rda new file mode 100644 index 0000000..16192a1 Binary files /dev/null and b/data/somalia_population.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index c0a9b84..873492e 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,21 +1,31 @@ Arimond +Boateng CMD CodeFactor DL +Fairhurst FeFol +GJ Gelbach Handwashing ICFI +IDP IYCF JB LCL Lifecycle ORCID +PLoS PSU PSUs RapidSurveys Ruel +Sankar +Siling +Sodani codecov +df doi psu +sd th diff --git a/man/boot_percentile.Rd b/man/boot_percentile.Rd index 5d44d41..f2ac7de 100644 --- a/man/boot_percentile.Rd +++ b/man/boot_percentile.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/boot_bw_estimate.R \name{boot_percentile} \alias{boot_percentile} \title{Boot estimate} diff --git a/man/calc_total_ci.Rd b/man/calc_total_ci.Rd new file mode 100644 index 0000000..a53fd1c --- /dev/null +++ b/man/calc_total_ci.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_strat_estimation.R +\name{calc_total_ci} +\alias{calc_total_ci} +\title{Calculate confidence limits} +\usage{ +calc_total_ci(est, pop, se, ci = c("lcl", "ucl")) +} +\description{ +Calculate confidence limits +} +\keyword{internal} diff --git a/man/calc_total_estimate.Rd b/man/calc_total_estimate.Rd new file mode 100644 index 0000000..52e27ed --- /dev/null +++ b/man/calc_total_estimate.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_strat_estimation.R +\name{calc_total_estimate} +\alias{calc_total_estimate} +\title{Calculate total estimate} +\usage{ +calc_total_estimate(est, pop) +} +\description{ +Calculate total estimate +} +\keyword{internal} diff --git a/man/calc_total_sd.Rd b/man/calc_total_sd.Rd new file mode 100644 index 0000000..aff216f --- /dev/null +++ b/man/calc_total_sd.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_strat_estimation.R +\name{calc_total_sd} +\alias{calc_total_sd} +\title{Calculate total sd} +\usage{ +calc_total_sd(se, pop) +} +\description{ +Calculate total sd +} +\keyword{internal} diff --git a/man/check_est_df.Rd b/man/check_est_df.Rd new file mode 100644 index 0000000..3e69d8f --- /dev/null +++ b/man/check_est_df.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_est_df} +\alias{check_est_df} +\title{Check est_df} +\usage{ +check_est_df(est_df) +} +\description{ +Check est_df +} +\keyword{internal} diff --git a/man/check_pop_df.Rd b/man/check_pop_df.Rd new file mode 100644 index 0000000..d54b1a7 --- /dev/null +++ b/man/check_pop_df.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_pop_df} +\alias{check_pop_df} +\title{Check pop_df} +\usage{ +check_pop_df(pop_df) +} +\description{ +Check pop_df +} +\keyword{internal} diff --git a/man/estimate_total.Rd b/man/estimate_total.Rd new file mode 100644 index 0000000..fba15d1 --- /dev/null +++ b/man/estimate_total.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_strat_estimation.R +\name{estimate_total} +\alias{estimate_total} +\title{Post-stratification analysis} +\usage{ +estimate_total(est_df, pop_df) +} +\arguments{ +\item{est_df}{A \code{\link[=data.frame]{data.frame()}} of stratified indicator estimates to get +overall estimates of. \code{est_df} should have a variable named \code{est} for +the values of the indicator estimate, a variable named \code{strata} for +information on the stratification or grouping of the estimates, and a +variable named \code{se} for the standard errors for the values of the +indicator estimate. This is usually produced via a call to +\code{\link[=boot_bw_estimate]{boot_bw_estimate()}}.} + +\item{pop_df}{A \code{\link[=data.frame]{data.frame()}} with at least two variables: \code{strata} for the +stratification/grouping information that matches \code{strata} in \code{est_df} and +\code{pop} for information on population for the given \code{strata}.} +} +\value{ +A vector of values for the overall estimate, overall 95\% lower +confidence limit, and overall 95\% upper confidence limit for each of the +\code{strata} in \code{est_df}. +} +\description{ +Post-stratification analysis +} +\examples{ +est_df <- boot_bw( + x = indicatorsHH, w = villageData, statistic = bootClassic, + params = "anc1", strata = "region", replicates = 9 +) |> + boot_bw_estimate() + +## Add population ---- +pop_df <- somalia_population |> + subset(select = c(region, total)) + +names(pop_df) <- c("strata", "pop") + +estimate_total(est_df, pop_df) + +} diff --git a/man/estimate_total_.Rd b/man/estimate_total_.Rd new file mode 100644 index 0000000..d67ef09 --- /dev/null +++ b/man/estimate_total_.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_strat_estimation.R +\name{estimate_total_} +\alias{estimate_total_} +\title{Estimate post-stratification weighted totals} +\usage{ +estimate_total_(est_pop_df) +} +\description{ +Estimate post-stratification weighted totals +} +\keyword{internal} diff --git a/man/somalia_population.Rd b/man/somalia_population.Rd new file mode 100644 index 0000000..1c34386 --- /dev/null +++ b/man/somalia_population.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{somalia_population} +\alias{somalia_population} +\title{Somalia regional population in 2022} +\format{ +An object of class \code{data.frame} with 19 rows and 18 columns. +} +\source{ +\url{https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf} +} +\usage{ +somalia_population +} +\description{ +A data.frame with 19 rows and 18 columns: +} +\details{ +\tabular{ll}{ + \strong{Variable} \tab \strong{Description} \cr + \code{region} \tab Region name \cr + \code{total} \tab Total population \cr + \code{urban} \tab Total urban population \cr + \code{rural} \tab Total rural population \cr + \code{idp} \tab Total IDP population \cr + \code{urban_stressed} \tab Total urban population - stressed \cr + \code{rural_stressed} \tab Total rural population - stressed \cr + \code{idp_stressed} \tab Total IDP population - stressed \cr + \code{urban_crisis} \tab Total urban population - crisis \cr + \code{rural_crisis} \tab Total rural population - crisis \cr + \code{idp_crisis} \tab Total IDP population - crisis \cr + \code{urban_emergency} \tab Total urban population - emergency \cr + \code{rural_emergency} \tab Total rural population - emergency \cr + \code{idp_emergency} \tab Total IDP population - emergency \cr + \code{urban_catastrophe} \tab Total urban population - catastrophe \cr + \code{rural_catastrophe} \tab Total rural population - catastrophe \cr + \code{idp_catastrophe} \tab Total IDP population - catastrophe \cr + \code{percent_at_least_crisis} \tab Percentage of population that are at least in crisis \cr +} +} +\keyword{datasets} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 6a2da63..142cea0 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -56,6 +56,10 @@ reference: - bootClassic - bootPROBIT + - title: Stratified weighted estimation + contents: + - estimate_total + - title: Utility contents: - recode @@ -66,3 +70,4 @@ reference: - indicatorsCH2 - indicatorsHH - villageData + - somalia_population diff --git a/tests/testthat/test-post_strat_estimation.R b/tests/testthat/test-post_strat_estimation.R new file mode 100644 index 0000000..2a9387c --- /dev/null +++ b/tests/testthat/test-post_strat_estimation.R @@ -0,0 +1,44 @@ +# Tests for post-stratification estimation functions --------------------------- + +test_that("estimate_total works as expected", { + boot_df <- boot_bw( + indicatorsHH, villageData, statistic = bootClassic, params = "anc1", + replicates = 9 + ) + + est_df <- boot_bw_estimate(boot_df) + + pop_df <- somalia_population |> + subset(select = c(region, total)) + + names(pop_df) <- c("strata", "pop") + + expect_s3_class(estimate_total(est_df, pop_df), "data.frame") + + boot_df <- boot_bw( + indicatorsHH, villageData, statistic = bootClassic, params = "anc1", + replicates = 9, strata = "region" + ) + + est_df <- boot_bw_estimate(boot_df) + + expect_s3_class(estimate_total(est_df, pop_df), "data.frame") + + boot_df <- boot_bw( + indicatorsHH, villageData, statistic = bootClassic, + params = c("anc1", "anc2"), replicates = 9, strata = "region" + ) + + est_df <- boot_bw_estimate(boot_df) + + expect_s3_class(estimate_total(est_df, pop_df), "data.frame") + + est_dfx <- est_df + names(est_dfx) <- c("strat", "indicator", "es", "lcl", "ucl", "sse") + + pop_dfx <- pop_df + names(pop_dfx) <- c("strat", "pop") + + expect_error(estimate_total(est_dfx, pop_df)) + expect_error(estimate_total(est_df, pop_dfx)) +}) \ No newline at end of file diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/bbw-estimation.Rmd b/vignettes/bbw-estimation.Rmd new file mode 100644 index 0000000..6a21476 --- /dev/null +++ b/vignettes/bbw-estimation.Rmd @@ -0,0 +1,21 @@ +--- +title: "Blocked weighted bootstrap estimation" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Blocked weighted bootstrap estimation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, echo = FALSE} +library(bbw) +``` + +The `{bbw}` package was developed primarily as a tool for analysing complex survey data. diff --git a/vignettes/bbw.Rmd b/vignettes/bbw.Rmd index cbcef2c..b0fe1fc 100644 --- a/vignettes/bbw.Rmd +++ b/vignettes/bbw.Rmd @@ -16,7 +16,9 @@ knitr::opts_chunk$set( ) ``` -The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in **Standardized Monitoring and Assessment of Relief and Transitions** or **SMART** surveys) or *posterior weighting* (e.g. as used in **Rapid Assessment Method** or **RAM** and **Simple Spatial Sampling Method** or **S3M** surveys). +The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling (PPS)* as used in [Standardized Monitoring and Assessment of Relief and Transitions (SMART)](https://smartmethodology.org/) surveys) or *posterior weighting* (e.g. as used in [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and [Simple Spatial Sampling Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543)surveys). + +## Features of blocked weighted bootstrap The bootstrap technique is described in this [article](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)). The blocked weighted bootstrap used in RAM and S3M is a modification to the *percentile bootstrap* to include *blocking* and *weighting* to account for a *complex sample design*. @@ -24,24 +26,31 @@ With RAM and S3M surveys, the sample is complex in the sense that it is an unwei * **Blocked**: The block corresponds to the primary sampling unit (`PSU = cluster`). *PSU*s are resampled with replacement. Observations within the resampled PSUs are also sampled with replacement. -* **Weighted**: RAM and S3M samples do not use *population proportional sampling (PPS)* to weight the sample prior to data collection (e.g. as is done with **SMART** surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [Figure 1](#FIG1) below) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates. +* **Weighted**: RAM and S3M samples do not use *population proportional sampling (PPS)* to weight the sample prior to data collection (e.g. as is done with **SMART** surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [Figure 1](#fig1) below) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
- + **Figure 1:** Roulette wheel algorithm

-![Roulette wheel algorithm](../man/figures/rouletteWheel.png) +```{r, echo = FALSE, eval = TRUE, out.width = "50%", fig.alt = "Roulette wheel algorithm", fig.align = "center"} +knitr::include_graphics("../man/figures/rouletteWheel.png") +```

+
+## Posterior weighting through resampling + In the case of prior weighting by *PPS* all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the [fitness proportional selection](https://en.wikipedia.org/wiki/Fitness_proportionate_selection) technique used in *evolutionary computing*. A total of `m` PSUs are sampled with replacement for each bootstrap replicate (where `m` is the number of PSUs in the survey sample). The required statistic is applied to each replicate. The reported estimate consists of the 0.025th (*95\% LCL*), 0.5th (*point estimate*), and 0.975th (*95\% UCL*) quantiles of the distribution of the statistic across all survey replicates. +## Development history + Early versions of the `{bbw}` did not resample observations within PSUs following:
@@ -50,6 +59,18 @@ Early versions of the `{bbw}` did not resample observations within PSUs followin
-and used a large number (e.g. `3999`) survey replicates. Current versions of the `{bbw}` resample observations within PSUs and use a smaller number of survey replicates (e.g. `n = 400`). This is a more computationally efficient approach. +and used a large number (e.g. `3999`) survey replicates. Next versions (up to *v0.2.0*) of the `{bbw}` resample observations within PSUs and use a smaller number of survey replicates (e.g. `n = 400`). This is a more computationally efficient approach and is demonstrated in the following: + +
+ +> Aaron GJ, Sodani PR, Sankar R, Fairhurst J, Siling K, Guevarra E, et al. (2016) Household Coverage of Fortified Staple Food Commodities in Rajasthan, India. PLoS ONE 11(10): e0163176. https://doi.org/10.1371/journal.pone.0163176 + +> Aaron GJ, Strutt N, Boateng NA, Guevarra E, Siling K, Norris A, et al. (2016) Assessing Program Coverage of Two Approaches to Distributing a Complementary Feeding Supplement to Infants and Young Children in Ghana. PLoS ONE 11(10): e0162462. https://doi.org/10.1371/journal.pone.0162462 + +
+ +The current version (*v0.3.0*) now includes further improvements in computational efficiency using a vectorised algorithm and provides an option for parallel computation which when used reduces the computational overhead by at least 80%. + +## Advantages The main reason to use `{bbw}` is that the bootstrap allows a wider range statistics to be calculated than model-based techniques without resort to grand assumptions about the sampling distribution of the required statistic. A good example for this is the confidence interval on the difference between two medians which might be used for many socio-economic variables. The `{bbw}` also allows for a wider range of hypothesis tests to be used with complex sample survey data.