Skip to content

Commit

Permalink
Merge pull request #49 from rapidsurveys:dev
Browse files Browse the repository at this point in the history
create post-strat estimator; fix #19; fix #48
  • Loading branch information
ernestguevarra authored Jan 8, 2025
2 parents 7cbb0b7 + 771ddd6 commit 6a83135
Show file tree
Hide file tree
Showing 26 changed files with 558 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@

README.html
docs
inst/doc
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: bbw
Type: Package
Title: Blocked Weighted Bootstrap
Version: 0.2.4.9000
Version: 0.2.5.9000
Authors@R: c(
person("Mark", "Myatt",
email = "[email protected]", role = c("aut", "cph")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(boot_bw_sample_clusters)
export(boot_bw_sample_within_clusters)
export(boot_bw_sequential)
export(boot_bw_weight)
export(estimate_total)
export(recode)
importFrom(car,bcPower)
importFrom(car,powerTransform)
Expand Down
2 changes: 1 addition & 1 deletion R/bootProbit.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ bootPROBIT <- function(x, params, threshold = THRESHOLD) {
d <- car::bcPower(d, lambda)
threshold <- car::bcPower(threshold, lambda)
m <- mean(d, na.rm = TRUE)
s <- stats::sd(d, na.rm = T)
s <- stats::sd(d, na.rm = TRUE)

## PROBIT estimate ----
x <- stats::pnorm(q = threshold, mean = m, sd = s)
Expand Down
44 changes: 44 additions & 0 deletions R/boot_bw_estimate.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,49 @@ boot_bw_estimate <- function(boot_df) {
row.names(est) <- NULL

## Return est ----
est
}


#'
#' Boot estimate
#'
#' @keywords internal
#'

boot_percentile <- function(boot_df) {
if (is.data.frame(boot_df)) {
est <- lapply(
X = boot_df,
FUN = stats::quantile,
probs = c(0.5, 0.025, 0.975),
na.rm = TRUE
) |>
do.call(rbind, args = _) |>
as.data.frame()

se <- lapply(
X = boot_df,
FUN = stats::sd,
na.rm = TRUE
) |>
do.call(rbind, args = _) |>
as.data.frame()

est <- data.frame(est, se)

names(est) <- c("est", "lcl", "ucl", "se")
} else {
est <- stats::quantile(
x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
) |>
rbind() |>
data.frame(
se = stats::sd(x = boot_df, na.rm = TRUE)
)

names(est) <- c("est", "lcl", "ucl", "se")
}

est
}
31 changes: 31 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,34 @@
#' @source Mother and child health and nutrition survey in 3 regions of Somalia
#'
"indicatorsCH2"


#'
#' Somalia regional population in 2022
#'
#' A data.frame with 19 rows and 18 columns:
#'
#' **Variable** | **Description**
#' :--- | :---
#' `region` | Region name
#' `total` | Total population
#' `urban` | Total urban population
#' `rural` | Total rural population
#' `idp` | Total IDP population
#' `urban_stressed` | Total urban population - stressed
#' `rural_stressed` | Total rural population - stressed
#' `idp_stressed` | Total IDP population - stressed
#' `urban_crisis` | Total urban population - crisis
#' `rural_crisis` | Total rural population - crisis
#' `idp_crisis` | Total IDP population - crisis
#' `urban_emergency` | Total urban population - emergency
#' `rural_emergency` | Total rural population - emergency
#' `idp_emergency` | Total IDP population - emergency
#' `urban_catastrophe` | Total urban population - catastrophe
#' `rural_catastrophe` | Total rural population - catastrophe
#' `idp_catastrophe` | Total IDP population - catastrophe
#' `percent_at_least_crisis` | Percentage of population that are at least in crisis
#'
#' @source <https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf>
#'
"somalia_population"
127 changes: 127 additions & 0 deletions R/post_strat_estimation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#'
#' Post-stratification analysis
#'
#' @param est_df A [data.frame()] of stratified indicator estimates to get
#' overall estimates of. `est_df` should have a variable named `est` for
#' the values of the indicator estimate, a variable named `strata` for
#' information on the stratification or grouping of the estimates, and a
#' variable named `se` for the standard errors for the values of the
#' indicator estimate. This is usually produced via a call to
#' [boot_bw_estimate()].
#' @param pop_df A [data.frame()] with at least two variables: `strata` for the
#' stratification/grouping information that matches `strata` in `est_df` and
#' `pop` for information on population for the given `strata`.
#'
#' @returns A vector of values for the overall estimate, overall 95% lower
#' confidence limit, and overall 95% upper confidence limit for each of the
#' `strata` in `est_df`.
#'
#' @examples
#' est_df <- boot_bw(
#' x = indicatorsHH, w = villageData, statistic = bootClassic,
#' params = "anc1", strata = "region", replicates = 9
#' ) |>
#' boot_bw_estimate()
#'
#' ## Add population ----
#' pop_df <- somalia_population |>
#' subset(select = c(region, total))
#'
#' names(pop_df) <- c("strata", "pop")
#'
#' estimate_total(est_df, pop_df)
#'
#' @export
#'

estimate_total <- function(est_df, pop_df) {
## Check the data ----
check_est_df(est_df)
check_pop_df(pop_df)

## Merge estimates with population data ----
est_pop_df <- merge(pop_df, est_df, by = "strata", all.y = TRUE)

## Get total estimates ----
if (length(unique(est_pop_df$indicator)) > 1) {
est_pop_df <- split(x = est_pop_df, f = est_pop_df$indicator)

total_est_df <- lapply(
X = est_pop_df,
FUN = estimate_total_
) |>
do.call(rbind, args = _) |>
as.data.frame()
} else {
total_est_df <- estimate_total_(est_pop_df)
}

## Return estimates ----
total_est_df
}


#'
#' Estimate post-stratification weighted totals
#'
#' @keywords internal
#'

estimate_total_ <- function(est_pop_df) {
with(est_pop_df, {
data.frame(
strata = "Overall",
indicator = unique(indicator),
est = calc_total_estimate(est, pop),
lcl = calc_total_ci(est, pop, se, "lcl"),
ucl = calc_total_ci(est, pop, se, "ucl"),
se = calc_total_sd(se, pop)
)
})
}


#'
#' Calculate total estimate
#'
#' @keywords internal
#'

calc_total_estimate <- function(est, pop) {
sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE)
}


#'
#' Calculate total sd
#'
#' @keywords internal
#'

calc_total_sd <- function(se, pop) {
sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE)
}


#'
#' Calculate confidence limits
#'
#' @keywords internal
#'

calc_total_ci <- function(est, pop, se, ci = c("lcl", "ucl")) {
ci <- match.arg(ci)

operator <- ifelse(ci == "lcl", "-", "+")

str2expression(
paste0(
"sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE) ",
operator,
" 1.96 * sqrt(sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE))"
)
) |>
eval()
}


66 changes: 46 additions & 20 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,32 +120,58 @@ tidy_boot <- function(boot, w, strata, outputColumns) {


#'
#' Boot estimate
#' Check est_df
#'
#' @keywords internal
#'

boot_percentile <- function(boot_df) {
if (is.data.frame(boot_df)) {
est <- lapply(
X = boot_df,
FUN = stats::quantile,
probs = c(0.5, 0.025, 0.975),
na.rm = TRUE
) |>
do.call(rbind, args = _) |>
data.frame()
check_est_df <- function(est_df) {
data_name_check <- c("strata", "est", "se") %in% names(est_df)
data_name_in <- c("strata", "est", "se")[which(data_name_check)]
data_name_out <- c("strata", "est", "se")[which(!data_name_check)]

arg_name <- deparse(substitute(est_df))

message_out <- ifelse(
length(data_name_out) == 1,
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
)

names(est) <- c("est", "lcl", "ucl")
if (all(data_name_check)) {
cli::cli_alert_success(
"{.arg est_df} has the appropriate/expected variables"
)
} else {
est <- stats::quantile(
x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
) |>
rbind() |>
data.frame()

names(est) <- c("est", "lcl", "ucl")
cli::cli_abort(message_out)
}
}


#'
#' Check pop_df
#'
#' @keywords internal
#'

check_pop_df <- function(pop_df) {
data_name_check <- c("strata", "pop") %in% names(pop_df)
data_name_in <- c("strata", "pop")[which(data_name_check)]
data_name_out <- c("strata", "pop")[which(!data_name_check)]

arg_name <- deparse(substitute(pop_df))

est
message_out <- ifelse(
length(data_name_out) == 1,
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
)

if (all(data_name_check)) {
cli::cli_alert_success(
"{.arg pop_df} has the appropriate/expected variables"
)
} else {
cli::cli_abort(message_out)
}
}
8 changes: 4 additions & 4 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ knitr::opts_chunk$set(
<!-- badges: end -->

## Overview
The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in **Standardized Monitoring and Assessment of Relief and Transitions** or **SMART** surveys) or *posterior weighting* (e.g. as used in **Rapid Assessment Method** or **RAM** and **Simple Spatial Sampling Method** or **S3M** surveys) is implemented.
The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in [Standardized Monitoring and Assessment of Relief and Transitions (SMART)](https://smartmethodology.org/) surveys) or *posterior weighting* (e.g. as used in [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and [Simple Spatial Sampling Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543) surveys) is implemented.

## Installation

Expand Down Expand Up @@ -61,16 +61,16 @@ With RAM and S3M surveys, the sample is complex in the sense that it is an unwei

* **Blocked**: The block corresponds to the primary sampling unit (*PSU = cluster*). PSUs are resampled with replacement. Observations within the resampled PSUs are also sampled with replacement.

* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#FIG1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#fig1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.

<a name="FIG1"></a>
<a name="fig1"></a>
<p align="center">
```{r, echo = FALSE, eval = TRUE, out.width = "50%", fig.alt = "Roulette wheel algorithm", fig.align = "center"}
knitr::include_graphics("man/figures/rouletteWheel.png")
```
</p>

In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the *fitness proportional selection* technique used in *evolutionary computing*.
In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the [*fitness proportional selection*](https://en.wikipedia.org/wiki/Fitness_proportionate_selection) technique used in *evolutionary computing*.

A total of `m` PSUs are sampled with replacement for each bootstrap replicate (where `m` is the number of PSUs in the survey sample).

Expand Down
34 changes: 34 additions & 0 deletions data-raw/population.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Somalia regional population estimates 2022

.url <- "https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf"

somalia_population <- pdftools::pdf_text(.url) |>
stringr::str_split(pattern = "\n") |>
(\(x) x[[1]])() |>
(\(x) x[c(8:15, 18:19, 22:29, 31)])() |>
stringr::str_remove_all(pattern = ",") |>
stringr::str_split(pattern = "[ ]{2,}") |>
do.call(rbind, args = _) |>
data.frame()

names(somalia_population) <- c(
"region", "total", "urban", "rural", "idp",
"urban_stressed", "rural_stressed", "idp_stressed",
"urban_crisis", "rural_crisis", "idp_crisis",
"urban_emergency", "rural_emergency", "idp_emergency",
"urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
"percent_at_least_crisis"
)

somalia_population[ , c("total", "urban", "rural", "idp",
"urban_stressed", "rural_stressed", "idp_stressed",
"urban_crisis", "rural_crisis", "idp_crisis",
"urban_emergency", "rural_emergency", "idp_emergency",
"urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
"percent_at_least_crisis")] <- lapply(
X = subset(somalia_population, select = -region),
FUN = as.numeric
) |>
do.call(cbind, args = _)

usethis::use_data(somalia_population, overwrite = TRUE, compress = "xz")
Binary file added data/somalia_population.rda
Binary file not shown.
Loading

0 comments on commit 6a83135

Please sign in to comment.