Merge pull request #49 from rapidsurveys:dev

create post-strat estimator; fix #19; fix #48
rapidsurveys · Jan 8, 2025 · 6a83135 · 6a83135
2 parents 7cbb0b7 + 771ddd6
commit 6a83135
Show file tree

Hide file tree

Showing 26 changed files with 558 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@
 
 README.html
 docs
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bbw
 Type: Package
 Title: Blocked Weighted Bootstrap
-Version: 0.2.4.9000
+Version: 0.2.5.9000
 Authors@R: c(
     person("Mark", "Myatt", 
            email = "[email protected]", role = c("aut", "cph")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(boot_bw_sample_clusters)
 export(boot_bw_sample_within_clusters)
 export(boot_bw_sequential)
 export(boot_bw_weight)
+export(estimate_total)
 export(recode)
 importFrom(car,bcPower)
 importFrom(car,powerTransform)

diff --git a/R/bootProbit.R b/R/bootProbit.R
@@ -46,7 +46,7 @@ bootPROBIT <- function(x, params, threshold = THRESHOLD) {
   d <- car::bcPower(d, lambda)
   threshold <- car::bcPower(threshold, lambda)
   m <- mean(d, na.rm = TRUE)
-  s <- stats::sd(d, na.rm = T)
+  s <- stats::sd(d, na.rm = TRUE)
 
   ## PROBIT estimate ----
   x <- stats::pnorm(q = threshold, mean = m, sd = s)

diff --git a/R/boot_bw_estimate.R b/R/boot_bw_estimate.R
@@ -51,5 +51,49 @@ boot_bw_estimate <- function(boot_df) {
   row.names(est) <- NULL
 
   ## Return est ----
+  est
+}
+
+
+#'
+#' Boot estimate
+#' 
+#' @keywords internal
+#' 
+
+boot_percentile <- function(boot_df) {
+  if (is.data.frame(boot_df)) {
+    est <- lapply(
+      X = boot_df,
+      FUN = stats::quantile,
+      probs = c(0.5, 0.025, 0.975),
+      na.rm = TRUE
+    ) |>
+      do.call(rbind, args = _) |>
+      as.data.frame()
+
+    se <- lapply(
+      X = boot_df,
+      FUN = stats::sd,
+      na.rm = TRUE
+    ) |>
+      do.call(rbind, args = _) |>
+      as.data.frame()
+
+    est <- data.frame(est, se)
+
+    names(est) <- c("est", "lcl", "ucl", "se")
+  } else {
+    est <- stats::quantile(
+      x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
+    ) |>
+      rbind() |>
+      data.frame(
+        se = stats::sd(x = boot_df, na.rm = TRUE)
+      )
+
+    names(est) <- c("est", "lcl", "ucl", "se")
+  }
+
   est
 }
diff --git a/R/data.R b/R/data.R
@@ -139,3 +139,34 @@
 #' @source Mother and child health and nutrition survey in 3 regions of Somalia
 #'
 "indicatorsCH2"
+
+
+#'
+#' Somalia regional population in 2022
+#' 
+#' A data.frame with 19 rows and 18 columns:
+#' 
+#' **Variable** | **Description**
+#' :--- | :---
+#' `region` | Region name
+#' `total` | Total population
+#' `urban` | Total urban population
+#' `rural` | Total rural population
+#' `idp` | Total IDP population
+#' `urban_stressed` | Total urban population - stressed
+#' `rural_stressed` | Total rural population - stressed
+#' `idp_stressed` | Total IDP population - stressed
+#' `urban_crisis` | Total urban population - crisis
+#' `rural_crisis` | Total rural population - crisis
+#' `idp_crisis` | Total IDP population - crisis
+#' `urban_emergency` | Total urban population - emergency
+#' `rural_emergency` | Total rural population - emergency
+#' `idp_emergency` | Total IDP population - emergency
+#' `urban_catastrophe` | Total urban population - catastrophe
+#' `rural_catastrophe` | Total rural population - catastrophe
+#' `idp_catastrophe` | Total IDP population - catastrophe
+#' `percent_at_least_crisis` | Percentage of population that are at least in crisis
+#' 
+#' @source <https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf>
+#' 
+"somalia_population"
diff --git a/R/post_strat_estimation.R b/R/post_strat_estimation.R
@@ -0,0 +1,127 @@
+#'
+#' Post-stratification analysis
+#' 
+#' @param est_df A [data.frame()] of stratified indicator estimates to get
+#'   overall estimates of. `est_df` should have a variable named `est` for
+#'   the values of the indicator estimate, a variable named `strata` for
+#'   information on the stratification or grouping of the estimates, and a
+#'   variable named `se` for the standard errors for the values of the
+#'   indicator estimate. This is usually produced via a call to
+#'   [boot_bw_estimate()].
+#' @param pop_df A [data.frame()] with at least two variables: `strata` for the
+#'   stratification/grouping information that matches `strata` in `est_df` and
+#'   `pop` for information on population for the given `strata`.
+#' 
+#' @returns A vector of values for the overall estimate, overall 95% lower
+#'   confidence limit, and overall 95% upper confidence limit for each of the
+#'   `strata` in `est_df`.
+#' 
+#' @examples
+#' est_df <- boot_bw(
+#'   x = indicatorsHH, w = villageData, statistic = bootClassic, 
+#'   params = "anc1", strata = "region", replicates = 9
+#' ) |>
+#'   boot_bw_estimate()
+#' 
+#' ## Add population ----
+#' pop_df <- somalia_population |>
+#'   subset(select = c(region, total))
+#' 
+#' names(pop_df) <- c("strata", "pop")
+#' 
+#' estimate_total(est_df, pop_df)
+#' 
+#' @export
+#' 
+
+estimate_total <- function(est_df, pop_df) {
+  ## Check the data ----
+  check_est_df(est_df)
+  check_pop_df(pop_df)
+
+  ## Merge estimates with population data ----
+  est_pop_df <- merge(pop_df, est_df, by = "strata", all.y = TRUE)
+
+  ## Get total estimates ----
+  if (length(unique(est_pop_df$indicator)) > 1) {
+    est_pop_df <- split(x = est_pop_df, f = est_pop_df$indicator)
+
+    total_est_df <- lapply(
+      X = est_pop_df,
+      FUN = estimate_total_
+    ) |>
+      do.call(rbind, args = _) |>
+      as.data.frame()
+  } else {
+    total_est_df <- estimate_total_(est_pop_df)
+  }
+
+  ## Return estimates ----
+  total_est_df
+}
+
+
+#'
+#' Estimate post-stratification weighted totals
+#' 
+#' @keywords internal
+#' 
+
+estimate_total_ <- function(est_pop_df) {
+  with(est_pop_df, {
+    data.frame(
+      strata = "Overall",
+      indicator = unique(indicator),
+      est = calc_total_estimate(est, pop),
+      lcl = calc_total_ci(est, pop, se, "lcl"),
+      ucl = calc_total_ci(est, pop, se, "ucl"),
+      se = calc_total_sd(se, pop)
+    )
+  })
+}
+
+
+#'
+#' Calculate total estimate
+#' 
+#' @keywords internal
+#' 
+
+calc_total_estimate <- function(est, pop) {
+  sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE)
+}
+
+
+#'
+#' Calculate total sd
+#' 
+#' @keywords internal
+#' 
+
+calc_total_sd <- function(se, pop) {
+  sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE)
+}
+
+
+#'
+#' Calculate confidence limits
+#' 
+#' @keywords internal
+#' 
+
+calc_total_ci <- function(est, pop, se, ci = c("lcl", "ucl")) {
+  ci <- match.arg(ci)
+
+  operator <- ifelse(ci == "lcl", "-", "+")
+
+  str2expression(
+    paste0(
+      "sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE) ", 
+      operator,
+      " 1.96 * sqrt(sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE))"
+    )
+  ) |>
+    eval()
+}
+
+
diff --git a/R/utils.R b/R/utils.R
@@ -120,32 +120,58 @@ tidy_boot <- function(boot, w, strata, outputColumns) {
 
 
 #'
-#' Boot estimate
+#' Check est_df
 #' 
 #' @keywords internal
 #' 
 
-boot_percentile <- function(boot_df) {
-  if (is.data.frame(boot_df)) {
-    est <- lapply(
-      X = boot_df,
-      FUN = stats::quantile,
-      probs = c(0.5, 0.025, 0.975),
-      na.rm = TRUE
-    ) |>
-      do.call(rbind, args = _) |>
-      data.frame()
+check_est_df <- function(est_df) {
+  data_name_check <- c("strata", "est", "se") %in% names(est_df)
+  data_name_in <- c("strata", "est", "se")[which(data_name_check)]
+  data_name_out <- c("strata", "est", "se")[which(!data_name_check)]
+
+  arg_name <- deparse(substitute(est_df))
+
+  message_out <- ifelse(
+    length(data_name_out) == 1,
+    "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
+    "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
+  )
 
-    names(est) <- c("est", "lcl", "ucl")
+  if (all(data_name_check)) {
+    cli::cli_alert_success(
+      "{.arg est_df} has the appropriate/expected variables"
+    )
   } else {
-    est <- stats::quantile(
-      x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
-    ) |>
-      rbind() |>
-      data.frame()
-
-    names(est) <- c("est", "lcl", "ucl")
+    cli::cli_abort(message_out)
   }
+}
+
+
+#'
+#' Check pop_df
+#' 
+#' @keywords internal
+#' 
+
+check_pop_df <- function(pop_df) {
+  data_name_check <- c("strata", "pop") %in% names(pop_df)
+  data_name_in <- c("strata", "pop")[which(data_name_check)]
+  data_name_out <- c("strata", "pop")[which(!data_name_check)]
+
+  arg_name <- deparse(substitute(pop_df))
 
-  est
+  message_out <- ifelse(
+    length(data_name_out) == 1,
+    "{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
+    "{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
+  )
+
+  if (all(data_name_check)) {
+    cli::cli_alert_success(
+      "{.arg pop_df} has the appropriate/expected variables"
+    )
+  } else {
+    cli::cli_abort(message_out)
+  }
 }
diff --git a/README.Rmd b/README.Rmd
@@ -29,7 +29,7 @@ knitr::opts_chunk$set(
 <!-- badges: end -->
 
 ## Overview
-The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in **Standardized Monitoring and Assessment of Relief and Transitions** or **SMART** surveys) or *posterior weighting* (e.g. as used in **Rapid Assessment Method** or **RAM** and **Simple Spatial Sampling Method** or **S3M** surveys) is implemented.
+The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in [Standardized Monitoring and Assessment of Relief and Transitions (SMART)](https://smartmethodology.org/) surveys) or *posterior weighting* (e.g. as used in [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and [Simple Spatial Sampling Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543) surveys) is implemented.
 
 ## Installation
 
@@ -61,16 +61,16 @@ With RAM and S3M surveys, the sample is complex in the sense that it is an unwei
 
 * **Blocked**: The block corresponds to the primary sampling unit (*PSU = cluster*). PSUs are resampled with replacement. Observations within the resampled PSUs are also sampled with replacement.
 
-* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#FIG1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
+* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#fig1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
 
-<a name="FIG1"></a>
+<a name="fig1"></a>
 <p align="center">
 ```{r, echo = FALSE, eval = TRUE, out.width = "50%", fig.alt = "Roulette wheel algorithm", fig.align = "center"}
 knitr::include_graphics("man/figures/rouletteWheel.png")
 ```
 </p>
 
-In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the *fitness proportional selection* technique used in *evolutionary computing*.
+In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the [*fitness proportional selection*](https://en.wikipedia.org/wiki/Fitness_proportionate_selection) technique used in *evolutionary computing*.
 
 A total of `m` PSUs are sampled with replacement for each bootstrap replicate (where `m` is the number of PSUs in the survey sample).
 

diff --git a/data-raw/population.R b/data-raw/population.R
@@ -0,0 +1,34 @@
+# Somalia regional population estimates 2022
+
+.url <- "https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf"
+
+somalia_population <- pdftools::pdf_text(.url) |>
+  stringr::str_split(pattern = "\n") |>
+  (\(x) x[[1]])() |>
+  (\(x) x[c(8:15, 18:19, 22:29, 31)])() |>
+  stringr::str_remove_all(pattern = ",") |>
+  stringr::str_split(pattern = "[ ]{2,}") |>
+  do.call(rbind, args = _) |>
+  data.frame()
+
+names(somalia_population) <- c(
+  "region", "total", "urban", "rural", "idp", 
+  "urban_stressed", "rural_stressed", "idp_stressed",
+  "urban_crisis", "rural_crisis", "idp_crisis",
+  "urban_emergency", "rural_emergency", "idp_emergency",
+  "urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
+  "percent_at_least_crisis"
+)
+
+somalia_population[ , c("total", "urban", "rural", "idp", 
+  "urban_stressed", "rural_stressed", "idp_stressed",
+  "urban_crisis", "rural_crisis", "idp_crisis",
+  "urban_emergency", "rural_emergency", "idp_emergency",
+  "urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
+  "percent_at_least_crisis")] <- lapply(
+    X = subset(somalia_population, select = -region),
+    FUN = as.numeric
+  ) |>
+  do.call(cbind, args = _)
+
+usethis::use_data(somalia_population, overwrite = TRUE, compress = "xz")
diff --git a/data/somalia_population.rda b/data/somalia_population.rda