diff --git a/DESCRIPTION b/DESCRIPTION index 301772c..84dc090 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: SUMMER Type: Package Title: Small-Area-Estimation Unit/Area Models and Methods for Estimation in R Version: 1.4.0 -Date: 2023-10-24 +Date: 2024-02-29 Authors@R: c( person(given = "Zehang R", family = "Li", email= "lizehang@gmail.com", role = c("cre","aut")), person(given = "Bryan D", family = "Martin", email= "bmartin6@uw.edu", role = "aut"), diff --git a/NAMESPACE b/NAMESPACE index 5548879..4e1c377 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -77,6 +77,7 @@ importFrom(ggplot2,ylab) importFrom(grDevices,colorRampPalette) importFrom(graphics,text) importFrom(haven,as_factor) +importFrom(methods,as) importFrom(methods,is) importFrom(sf,st_as_sf) importFrom(sf,st_centroid) diff --git a/R/getSmoothedDensity.R b/R/getSmoothedDensity.R index e1d14c8..7039f4c 100644 --- a/R/getSmoothedDensity.R +++ b/R/getSmoothedDensity.R @@ -20,6 +20,7 @@ #' #' @return ridge plot of the density, and if \code{save.density = TRUE}, also a data frame of the calculated densities #' @seealso \code{\link{plot.SUMMERproj}} +#' @importFrom methods as #' @author Zehang Richard Li #' @examples #' \dontrun{ diff --git a/inst/doc/small-area-estimation.html b/inst/doc/small-area-estimation.html index 8654197..75d19e4 100644 --- a/inst/doc/small-area-estimation.html +++ b/inst/doc/small-area-estimation.html @@ -12,7 +12,7 @@ - +
In this vignette, we illustrate the use of the
smoothArea and smoothUnit functions for
generic small area estimation of outcomes other than under-five
-mortality rates.
First, we load the necessary packages and the necessary data. The
-required package INLA is not in a standard repository, so
-we install if it is not available. The survey package will
-be used to generate direct estimates, while dplyr and
-tidyr will be used to facilitate data manipulation.
sae
+vignette, we compare our results from SUMMER with those
+obtained using the sae package (for a survey of other
+packages for small area estimation, see (kreutzmann2019?)).
+First, we load the necessary packages and data. The required package
+INLA is not available via a standard repository, so we
+include code for installation if it is not found. The
+survey package will be used to generate direct estimates,
+while dplyr and tidyr will be used for data
+manipulation.
library(sae)
library(SUMMER)
library(survey)
library(dplyr)
library(tidyr)
-if (!isTRUE(requireNamespace("INLA", quietly = TRUE))) {
- install.packages("INLA", repos=c(getOption("repos"),
- INLA="https://inla.r-inla-download.org/R/stable"), dep=TRUE)
-}First, we compare our results with those obtained using the
-sae package using examples presented in the
-sae vignette. Molina and Marhuenda
-(2015) generate an artificial dataset on income and other related
-variables to illustrate the use of area level models in the
-sae package. In their simulated example, the objective is
-to estimate prevalence of poverty in Spanish counties.
In their vignette for the sae package, (molina2015?) generate an
+artificial dataset on income and other related variables to illustrate
+the use of area level models. In this example, the objective is to
+estimate prevalence of poverty in Spanish counties.
data("incomedata")
data("sizeprov")
data("sizeprovedu")
-povertyline <- 0.6*median(incomedata$income) # 6557.143
-incomedata$poor <- as.integer(incomedata$income < povertyline)The incomedata data frame contains information on
-17199 observations of individuals in 52
-Spanish provinces. Income values and sampling weights are provided for
-each individual along with covariate information including age group and
-education level. Molina and Marhuenda
-(2015) define the poverty line z, and calculate an
-indicator variable with value 1 if the corresponding income value is
-below the poverty line and 0 otherwise.
saeThe sae::direct function computes the Horvitz-Thompson
-estimator of domain means, given by,
\[\widehat{\overline{Y}}_{d}^{\text{DIR}}=\frac{1}{N_d}\sum_{k\in -S_d}w_{k}y_{k}\]
-where \(N_d\) is the population size
-of domain \(d\), \(S_d\) is the set of sampled observations in
-domain \(d\), \(w_{k}\) is the sampling weight for unit
-\(k\), and \(y_{k}\) is the observation for unit \(k\), for all \(k
-\in S_d\). The sae::direct function also estimates
-standard deviation and coefficient of variation for each domain.
The incomedata data frame contains information on 17199
+observations of individuals in 52 Spanish provinces. Income values and
+sampling weights are provided for each individual along with covariate
+information including age group and education level. (molina2015?) define the
+poverty line and calculate an indicator variable (which we name
+in_poverty) with value 1 if the corresponding income value
+is below the poverty line and 0 otherwise.
saeBefore considering model-based methods for small area estimation, we
+compute direct weighted estimators for the desired small area means. The
+sae::direct function computes the Horvitz-Thompson
+estimator of domain means given by
\[ +\widehat{\overline{Y}}_{i}^{\text{DIR}}=\frac{1}{N_i}\sum_{j\in +S_i}w_{j}y_{j} +\]
+where \(N_i\) is the population size
+of domain \(i\), \(S_i\) is the set of sampled observations in
+domain \(i\), \(w_{j}\) is the sampling weight for unit
+\(j\), and \(y_{j}\) is the observation for unit \(j\), for all \(j
+\in S_i\). The sae::direct function also estimates
+standard deviation and coefficient of variation for each domain. Note
+that \(N_i\) is assumed known and is
+provided in the data frame sizeprov. The domains of
+interest are identified via the provlab variable.
Popn <- sizeprov[, c("provlab", "Nd")]
-sae.DIR <- sae::direct(y = incomedata$poor, dom = incomedata$provlab,
- sweight = incomedata$weight, domsize = Popn)
-knitr::kable(head(sae.DIR), digits = 3)| Domain | -SampSize | -Direct | -SD | -CV | -
|---|---|---|---|---|
| Alava | -96 | -0.255 | -0.048 | -19 | -
| Albacete | -173 | -0.141 | -0.030 | -22 | -
| Alicante | -539 | -0.205 | -0.022 | -11 | -
| Almeria | -198 | -0.265 | -0.041 | -15 | -
| Avila | -58 | -0.055 | -0.026 | -46 | -
| Badajoz | -494 | -0.210 | -0.023 | -11 | -
surveysurveyWe can similarly use the survey::svyby function to
compute the Horvitz-Thompson estimates:
incomedata$pop <- sum(sizeprov$Nd[match(incomedata$provlab, sizeprov$provlab)])
-design <- survey::svydesign(ids = ~1, weights = ~weight, data = incomedata, fpc = ~pop)
-
-# estimate area totals
-svy.DIR <- survey::svyby(~poor, ~provlab, design, svytotal)
-
-# calculate corresponding area mean estimates
-svy.DIR$prov_pop <- sizeprov$Nd[match(svy.DIR$provlab, sizeprov$provlab)]
-svy.DIR$poor_mean = svy.DIR$poor/svy.DIR$prov_pop
-svy.DIR$poor_mean_se = svy.DIR$se/svy.DIR$prov_pop
-knitr::kable(head(svy.DIR) %>% select(provlab, poor_mean, poor_mean_se), digits = 3)| - | provlab | -poor_mean | -poor_mean_se | -
|---|---|---|---|
| Alava | -Alava | -0.255 | -0.048 | -
| Albacete | -Albacete | -0.141 | -0.030 | -
| Alicante | -Alicante | -0.205 | -0.022 | -
| Almeria | -Almeria | -0.265 | -0.041 | -
| Avila | -Avila | -0.055 | -0.026 | -
| Badajoz | -Badajoz | -0.210 | -0.023 | -
The Fay-Herriot model is a basic area level model used to obtain -small area estimators Fay and Herriot -(1979). The Fay-Herriot model is often specified as a two-stage -model; the first stage is the sampling model and the second stage is the -linking model.
-First stage:
-Let \(\widehat\theta^{\text{DIR}}_d\) be a direct -estimator of \(\theta_d\): \[\widehat\theta^{\text{DIR}}_d=\theta_d+\epsilon_d;\hspace{2em}\epsilon_d\sim_{ind}N(0,V_d),\hspace{2em}d=1,\ldots, -D,\]
-where \(V_d\) is the known -sampling variance of the direct estimator \(\widehat{\theta}^{\text{DIR}}_d\).
-Second stage:
-\[\theta_d = -\mathbf{x}_d^T\boldsymbol\beta+u_d,\hspace{2em}u_d\sim_{ind}N(0,\sigma_u^2)\hspace{2em}d=1,\ldots, -D,\]
+The basic area level model, also called the Fay-Herriot model, treats +direct estimates of small area quantities as response data and +explicitly models differences between areas using covariate information +and random effects (fay1979?). The Fay-Herriot +model can be viewed as a two-stage model: in the first stage, a +sampling model represents the sampling variability of a direct +estimator and in the second stage, a linking model describes +the between area differences in small area quantities.
+Sampling model:
+Let \(\widehat\theta^{\text{DIR}}_i\) be a direct +estimator of an area level mean or total \(\theta_i\). The sampling model treats \(\widehat\theta^{\text{DIR}}_i\) as a noisy +observation of the true finite population quantity \(\theta_i\):
+\[ +\widehat\theta^{\text{DIR}}_i=\theta_i+\epsilon_i;\hspace{2em}\epsilon_i\sim_{ind}N(0,V_i),\hspace{2em}i=1,\ldots, +M +\]
+where \(V_i\) is the known +sampling variance of the direct estimator \(\widehat{\theta}^{\text{DIR}}_i\).
+Linking model:
+\[ +\theta_i = +\textbf{x}_i^T\boldsymbol\beta+u_i,\hspace{2em}u_i\sim_{ind}N(0,\sigma_u^2)\hspace{2em}i=1,\ldots, +M, +\]
where \(\sigma_u^2\) (between-area residual variance) is estimated. In this basic Fay-Herriot model, the -area-specific random effects \(u_d\) -are assumed to be independent between areas.
-saeBelow, we provide a quantile-quantile plot comparing the direct estimates to a Gaussian distribution. Here the observed quantiles align well with those from a Gaussian distribution, which lends some support to the basic IID model.
-par(pty="s")
-muD <- mean(sae.DIR$Direct)
-sdD <- sd(sae.DIR$Direct)
-qqnorm((sae.DIR$Direct-muD)/sdD,main="")
-abline(0,1,col="red")The sae::mseFH function calculates the empirical best
-linear unbiased predictors (EBLUP) for all domain means as well as their
-estimated MSEs.
par(pty = "s")
+mu.DIR <- mean(sae.DIR$Direct)
+sd.DIR <- sd(sae.DIR$Direct)
+qqnorm((sae.DIR$Direct - mu.DIR) / sd.DIR, main = "")
+abline(0, 1, col = "red")saeAs described by (molina2015?), the
+sae::mseFH function fits the basic area level model (via
+REML by default) and calculates the empirical best linear unbiased
+predictors (EBLUP) for all domain means as well as their estimated
+MSEs.
sae.FH <- sae::mseFH(sae.DIR$Direct~1, sae.DIR$SD^2)
+sae.FH.table <- data.frame(
+ Domain = sae.DIR$Domain,
+ EBLUP = sae.FH$est$eblup,
+ RMSE = sqrt(sae.FH$mse)
+)
+head(sae.FH.table)
+## Domain EBLUP RMSE
+## 1 Alava 0.23407448 0.03836098
+## 2 Albacete 0.15335213 0.02744856
+## 3 Alicante 0.20511853 0.02051886
+## 4 Almeria 0.24498009 0.03427432
+## 5 Avila 0.07797403 0.02372809
+## 6 Badajoz 0.20928033 0.02186672SUMMERThe smoothArea function compute direct estimates and
-then produces smoothed estimates using a Bayesian Fay-Herriot model. The
-main arguments of interest are:
SUMMERThe SUMMER package adopts a Bayesian approach to
+inference using models such as the basic area level model, carrying out
+computation via the INLA package. The
+smoothArea function computes direct estimates and then
+produces smoothed estimates using a Bayesian Fay-Herriot model. The main
+arguments of interest are:
formula: describing the response variable and any
+formula: Describing the response variable and any
area-level covariatesdomain a one-sided formula with the variable containing
+domain A one-sided formula with the variable containing
domain labels on the right. The domain labels variable should be
contained in the dataset used to generate the design.design: a survey.design object containing
+design: A survey.design object containing
survey data and specifying the survey design.In addition, other commonly used optional arguments include:
adj.mat: Optional adjacency matrix if a spatial
smoothing model is desired.direct.est: If desired, the direct estimates may be
-specified and smoothing will be applied directly to the direct
+transform: If "logit" is specified, a
+logit transform will be applied to the direct estimates and an
+appropriate transformation will be applied to the estimated sampling
+variances before smoothing.direct.est: The direct estimates may be specified
+directly and smoothing will be applied directly to these user-provided
estimates.X.domain: Areal covariates data frame.domain.size: Domain size data frame used for computing
-direct estimates if domain sizes are known.X.domain: Data frame of area level covariates.domain.size: Data frame of domain sizes used for
+computing direct estimates if domain sizes are known.Other optional arguments can be specified to change the priors and are described further in the documentation.
-We obtain direct estimates and smoothed estimates below, fitting the -Fay-Herriot model to the untransformed direct estimates.
-# fit the model
+For the artificial poverty rate example, we fit the Fay-Herriot model
+and obtain the following smoothed estimates.
+# specify known domain sizes
domain.size <- sizeprov[, c("provlab", "Nd")]
colnames(domain.size)[2] <- "size"
-summer.FH <- smoothArea(formula = poor~1,
- domain = ~provlab,
- design = design,
- domain.size = domain.size)
+# fit model and obtain svysae object
+summer.FH <- smoothArea(formula = in_poverty~1,
+ domain = ~provlab,
+ design = design,
+ domain.size = domain.size,
+ return.samples = T)
+## Warning in smoothArea(formula = in_poverty ~ 1, domain = ~provlab, design =
+## design, : Direct estimates appear to be proportions. You may want to consider
+## using transform = 'logit'.
+
+summer.FH.table <- data.frame(
+ Domain = sae.DIR$Domain,
+ Median = summer.FH$iid.model.est$median,
+ SE = sqrt(summer.FH$iid.model.est$var)
+)
+head(summer.FH.table)
+## Domain Median SE
+## 1 Alava 0.23666203 0.03719127
+## 2 Albacete 0.15085556 0.02667922
+## 3 Alicante 0.20531685 0.01909936
+## 4 Almeria 0.24577825 0.03769929
+## 5 Avila 0.07652956 0.02222395
+## 6 Badajoz 0.20746467 0.02282758The fitted parameters from sae (obtained via
likelihood-based methods) and estimated parameter posterior distribution
from SUMMER (obtained from Bayesian methods, implemented
via INLA) are in reasonable agreement. The estimated
intercept \(\beta_0\) from
-sae is 0.2 ; the posterior median of \(\beta_0\) from SUMMER is
-0.2. In the absence of strong priors, fixed effects are
-usually in close agreement, with the posterior being symmetric. The
-estimated precision \(1/\sigma_u^2\)
-from sae is 281.35 , while the posterior
-median of \(1/\sigma_u^2\) from
-SUMMER is 273.24. The differences are larger
-here, but the posterior for the variance is skewed, and we would expect
-the posterior median to be smaller than the REML estimate. The area
-estimates and measures of uncertainty are in close agreement,
-however.
sae is 0.202 ; the posterior median of \(\beta_0\) from SUMMER is
+0.202. In the absence of strong priors, fixed effects are usually in
+close agreement, with the posterior being symmetric. The estimated
+precision \(1/\sigma_u^2\) from
+sae is 281.34787 , while the posterior median of \(1/\sigma_u^2\) from SUMMER is
+273.241562. The differences are larger here, but the posterior for the
+variance is skewed, and we would expect the posterior median to be
+smaller than the REML estimate. The area estimates and measures of
+uncertainty are in close agreement, however.
We first illustrate the shrinkage of the EBLUP estimates, and the reduced uncertainty:
par(mfrow = c(1, 2))
@@ -655,13 +622,13 @@ Implementing Fay-Herriot model in SUMMER
ylim=c(min(sae.DIR$SD^2, sae.FH$mse),max(sae.DIR$SD^2, sae.FH$mse)),
main = "Estimates of uncertainty")
abline(0,1,col="red")Now compare EBLUP and HB, using posterior variance from
SUMMER and estimated MSE from sae to measure
uncertainty:
par(mfrow = c(1, 2))
plot(sae.FH$est$eblup, summer.FH$iid.model.est$median,
- xlab = "SAE package",ylab = "SUMMER package",
+ xlab = "sae package",ylab = "SUMMER package",
main = "Small area estimates")
abline(0,1,col="red")
plot(sae.FH$mse,
@@ -670,233 +637,219 @@ Implementing Fay-Herriot model in SUMMER
ylab = "SUMMER package model variance",
main = "Estimates of mse/variance")
abline(0,1,col="red")Molina and Marhuenda (2015) also use a
-real dataset on milk expenditures Arora and
-Lahiri (1997) to illustrate the implementation of area level
-models in the sae package. The milk dataset
-contains information on direct estimators for area level expenditures on
-milk based on data from the dairy survey component of the Consumer
-Expenditure Survey conducted by the U.S. Census Bureau. Weights are
-based on inverse sampling probabilities and adjusted for
-non-response/post-stratification.
| SmallArea | -ni | -yi | -SD | -CV | -MajorArea | -Var | -
|---|---|---|---|---|---|---|
| 1 | -191 | -1.10 | -0.16 | -0.15 | -1 | -0.03 | -
| 2 | -633 | -1.07 | -0.08 | -0.07 | -1 | -0.01 | -
| 3 | -597 | -1.10 | -0.08 | -0.07 | -1 | -0.01 | -
| 4 | -221 | -0.63 | -0.11 | -0.17 | -1 | -0.01 | -
| 5 | -195 | -0.75 | -0.12 | -0.16 | -1 | -0.01 | -
| 6 | -191 | -0.98 | -0.14 | -0.14 | -1 | -0.02 | -
Here, SmallArea denotes areas of inferential interest,
-yi gives the direct estimates, ni the sample
-sizes, SD and CV give the estimated standard
-deviation and coefficient of variation, and MajorArea
-corresponds to major areas (US regions defined by You and Chapman (2006)).
saeThe SUMMER package includes functions to generate
+additional diagnostic plots based on samples from the model posterior.
+The compareEstimates() function generates a heatmap of the
+posterior pairwise probabilities of one area’s mean exceeding another’s
+mean.
When geographic polygon data for each domain is available as a
+shapefile, the sf package can be used to load this data in
+R. The mapEstimates() function can then be used to provide
+a summary map of posterior medians or posterior variances.
SUMMERSince the milk dataset only provides area level direct
-estimates and not unit level data, we use the direct.est
-argument as input.
# Major Area fixed effects
-Xmat <- milk[, c("SmallArea", "MajorArea")]
-Xmat$MajorArea <- as.factor(Xmat$MajorArea)
-
-# format direct estimates for SUMMER
-milk.dir <- milk[, c("SmallArea", "yi", "Var")]
-# Fit the model with Major Area intercepts
-summer.FH.milk<- smoothArea(formula = yi~MajorArea,
- direct.est = milk.dir,
- X.domain = Xmat,
- domain = ~SmallArea)Again note that the posterior median for \(\sigma_u^2\) is larger than the REML -estimate.
-# random effects precision
-1 / sae.FH.milk$est$fit$refvar
-## [1] 54
-summer.FH.milk$iid.model.fit$summary.hyperpar[,c(4)]
-## [1] 54
-
-# estimated fixed effects
-sae.FH.milk$est$fit$estcoef
-## beta std.error tvalue pvalue
-## X(Intercept) 0.97 0.069 14.0 2.8e-44
-## Xas.factor(MajorArea)2 0.13 0.103 1.3 2.0e-01
-## Xas.factor(MajorArea)3 0.23 0.092 2.5 1.4e-02
-## Xas.factor(MajorArea)4 -0.24 0.082 -3.0 3.1e-03
-summer.FH.milk$iid.model.fit$summary.fixed[,c(1:5)]
-## mean sd 0.025quant 0.5quant 0.97quant
-## (Intercept) 0.97 0.071 0.829 0.97 1.103
-## MajorArea2 0.13 0.106 -0.073 0.13 0.334
-## MajorArea3 0.23 0.094 0.041 0.23 0.404
-## MajorArea4 -0.24 0.083 -0.406 -0.24 -0.083Again, we observe good agreement.
-par(mfrow = c(1, 2))
-plot(sae.FH.milk$est$eblup[as.numeric(summer.FH.milk$iid.model.est$domain)],
- summer.FH.milk$iid.model.est$median,
- xlab = "sae package",
- ylab = "SUMMER package",
- main = "Small area estimates")
-abline(0,1,color="red")
-plot(sae.FH.milk$mse[as.numeric(summer.FH.milk$iid.model.est$domain)],
- summer.FH.milk$iid.model.est$var,
- xlab = "sae package mse",
- ylab = "SUMMER package model variance",
- main = "Estimates of mse/variance")
-abline(0,1,col="red")The sae package also provides tools for implementing a
spatial version of the Fay-Herriot model which assumes that the vector
of area specific effects follows a first order simultaneous
-autoregressive, or SAR(1), process: \[\mathbf{u}=\rho_1\mathbf{Wu}+\boldsymbol\epsilon,\hspace{1em}\boldsymbol\epsilon\sim
-N(\mathbf{0}_D,\sigma_I^2\mathbf{I}_D),\] where \(\mathbf{I}_D\) is the identity matrix for
-the \(D\) areas and \(\mathbf{0}_D\) is a vector of zeroes of
+autoregressive, or SAR(1), process: \[\textbf{u}=\rho_1\textbf{Wu}+\boldsymbol\epsilon,\hspace{1em}\boldsymbol\epsilon\sim
+N(\textbf{0}_i,\sigma_I^2\textbf{I}_i),\] where \(\textbf{I}_i\) is the identity matrix for
+the \(D\) areas and \(\textbf{0}_i\) is a vector of zeroes of
size \(D\). Additionally, \(\rho_1\in(-1,1)\) is an autoregression
-parameter and \(\mathbf{W}\) is an
+parameter and \(\textbf{W}\) is an
adjacency matrix (with rows standardized to sum to 1).
The sae::mseSFH function estimates the unknown variance
parameters, the resulting EBLUP small area estimators, and then uses
bootstrap methods to estimate the MSE of the estimators.
To illustrate the use of this function, Molina
-and Marhuenda (2015) consider a synthetic dataset concerning
-grape production surface area for 274 Italian municipalities. Below we
-load the relevant objects from the sae package. The
-grapes dataset containes direct estimators of the mean
-surface area in hectares for grape production in each municipality
-(grapehect), the sampling variance of these direct
-estimators (var), and relevant covariates including number
-of working dats and overall agrarian surface area. The
-grapesprox object contains the relevant adjacency matrix
-representing the municipalities’ neighborhood structure.
To illustrate the use of this function, (molina2015?) consider a
+synthetic dataset concerning grape production surface area for 274
+Italian municipalities. Below we load the relevant objects from the
+sae package. The grapes dataset containes
+direct estimators of the mean surface area in hectares for grape
+production in each municipality (grapehect), the sampling
+variance of these direct estimators (var), and relevant
+covariates including number of working dats and overall agrarian surface
+area. The grapesprox object contains the relevant adjacency
+matrix representing the municipalities’ neighborhood structure.
saesae.FH.grapes <- sae::mseSFH(grapehect ~ area + workdays - 1, var, grapesprox, data = grapes)
-
-results <- data.frame(DIR = grapes$grapehect,
- eblup.SFH = sae.FH.grapes$est$eblup,
- mse = sae.FH.grapes$mse)
-# reorder results for comparison later
-results$area_name <- paste0('area_', rownames(results))sae.FH.grapes <- sae::mseSFH(grapehect ~ area + workdays - 1, var, grapesprox, data = grapes)
+
+results <- data.frame(DIR = grapes$grapehect,
+ eblup.SFH = sae.FH.grapes$est$eblup,
+ mse = sae.FH.grapes$mse)
+# reorder results for comparison later
+results$area_name <- paste0('area_', rownames(results))SUMMERThe smoothSurvey function also allows the use of a model
+
The smoothArea function also allows the use of a model
with spatially correlated area effects, but the default implementation
-assumes a BYM2 model for \(\mathbf{u}\)
+assumes a BYM2 model for \(\textbf{u}\)
rather than a simultaneous autoregressive model as in the SFH model
implemented in sae.
# create area_name as SUMMER requires rownames of A_mat to match area variable
-grapes$area_name <- paste0('area_', rownames(grapes))
-Amat_grapes <- as.matrix(grapesprox)
-rownames(Amat_grapes) <- colnames(Amat_grapes) <- grapes$area_name
-X_grapes <- grapes[,c('area_name', 'area', 'workdays')]
-
-# format direct estimates for SUMMER
-grapes.dir <- grapes[, c(5, 1, 4)]
-# scale direct estimates for use with INLA
-grapes.dir$grapehect <- grapes.dir$grapehect / 10
-grapes.dir$var <- grapes.dir$var/ 100
-summer.FH.grapes<- smoothArea(formula = grapehect~area + workdays,
- direct.est = grapes.dir, X.domain = X_grapes,
- domain = ~area_name, adj.mat = Amat_grapes)# create area_name as SUMMER requires rownames of adj.mat to match area variable
+grapes$area_name <- paste0('area_', rownames(grapes))
+adj.mat.grapes <- as.matrix(grapesprox)
+rownames(adj.mat.grapes) <- colnames(adj.mat.grapes) <- grapes$area_name
+X_grapes <- grapes[,c('area_name', 'area', 'workdays')]
+
+# format direct estimates for SUMMER
+grapes.dir <- grapes[, c(5, 1, 4)]
+# scale direct estimates for use with INLA
+grapes.dir$grapehect <- grapes.dir$grapehect / 10
+grapes.dir$var <- grapes.dir$var/ 100
+summer.FH.grapes <- smoothArea(formula = grapehect~area + workdays,
+ direct.est = grapes.dir, X.domain = X_grapes,
+ domain = ~area_name, adj.mat = adj.mat.grapes)
+plot_list <- plot(summer.FH.grapes, return_list = T)Despite the differing models, we again observe good agreement with -the estimates, though less so with the estimates of uncertainty, which -is interesting.
-# rearrange output to be the same order
-results <- results[match(summer.FH.grapes$bym2.model.est$domain, results$area_name), ]
-par(mfrow = c(1, 2))
-plot(results$eblup.SFH,
- summer.FH.grapes$bym2.model.est$median * 10,
- xlab = "sae package",
- ylab = "SUMMER package",
- main = "Small area estimates")
-abline(0, 1, col = 'red')
-plot(results$mse,
- summer.FH.grapes$bym2.model.est$var * 100,
- xlab = "sae package mse",
- ylab = "SUMMER package model variance",
- main = "Estimates of mse/variance")
-abline(0, 1, col = 'red')summer.bym2.est <-
+ summer.FH.grapes$bym2.model.est[match(rownames(adj.mat.grapes), summer.FH.grapes$bym2.model.est$domain),]
+par(mfrow = c(1, 2))
+plot(results$eblup.SFH,
+ summer.bym2.est$median * 10,
+ xlab = "sae package",
+ ylab = "SUMMER package",
+ main = "Small area estimates")
+abline(0, 1, col = 'red')
+plot(results$mse,
+ summer.bym2.est$var * 100,
+ xlab = "sae package mse",
+ ylab = "SUMMER package model variance",
+ main = "Estimates of mse/variance")
+abline(0, 1, col = 'red')Below, we provide an example comparing spatial models from
+sae and SUMMER using data from the Behavioral
+Risk Factor Surveillance System (BRFSS).
library(ggplot2)
+library(patchwork)
+data(BRFSS)
+data(KingCounty)
+BRFSS <- subset(BRFSS, !is.na(BRFSS$diab2))
+BRFSS <- subset(BRFSS, !is.na(BRFSS$hracode))
+head(BRFSS)
+## age pracex educau zipcode sex street1 street2 seqno year
+## 1 30 White college grad 98001 male NA NA 2009000041 2009
+## 2 26 White college grad 98107 female NA NA 2009000309 2009
+## 3 33 Black college grad 98133 male NA NA 2009000404 2009
+## 4 25 White some college 98058 male NA NA 2009000531 2009
+## 5 23 White some college 98102 male NA NA 2009000675 2009
+## 6 19 Asian some college 98106 male NA NA 2009000694 2009
+## hispanic mracex strata hracode tract rwt_llcp genhlth2 fmd obese
+## 1 non-Hisp White 53019 Auburn-North NA 2107.463 0 0 0
+## 2 non-Hisp White 53019 Ballard NA 2197.322 0 1 0
+## 3 non-Hisp Black 53019 NW Seattle NA 3086.511 0 0 0
+## 4 non-Hisp White 53019 Renton-South NA 3184.740 1 1 1
+## 5 non-Hisp White 53019 Capitol Hill/E.lake NA 3184.740 0 0 0
+## 6 non-Hisp Asian 53019 North Highline NA 4391.304 0 0 0
+## smoker1 diab2 aceindx2 zipout streetx ethn age4 ctmiss
+## 1 0 0 NA 98001 0 1 3 1
+## 2 0 0 NA 98107 0 1 3 1
+## 3 0 0 NA 98133 0 2 3 1
+## 4 0 0 NA 98058 0 1 3 1
+## 5 0 0 NA 98102 0 1 4 1
+## 6 0 0 NA 98106 0 3 4 1
+mat <- getAmat(KingCounty, KingCounty$HRA2010v2_)
+design <- svydesign(ids = ~1, weights = ~rwt_llcp,
+ strata = ~strata, data = BRFSS)
+direct <- svyby(~diab2, ~hracode, design, svymean)saeBelow, we use sae to smooth the logit-transformed direct
+estimates.
direct$var <- direct$se ^ 2
+direct$logit.diab2 <- SUMMER::logit(direct$diab2)
+direct$logit.var <- direct$var / (direct$diab2 ^ 2 * (1 - direct$diab2) ^ 2)
+SFH.brfss <- sae::mseSFH(logit.diab2 ~ 1, logit.var, mat, data = direct)
+
+results <- data.frame(domain = direct$hracode,
+ eblup.SFH = SUMMER::expit(SFH.brfss$est$eblup),
+ mse = SFH.brfss$mse)SUMMERBelow, we fit two versions of the spatial area levelmodel in
+SUMMER. If we change pc.u and
+pc.alpha from the default value \(u=1,\alpha=0.01\) to \(u=0.1,\alpha=0.01\), we assign more prior
+mass on smaller variance of the random effects, inducing more
+smoothing.
summer.brfss <- smoothArea(diab2~1, domain= ~hracode,
+ design = design,
+ transform = "logit",
+ adj.mat = mat, level = 0.95)
+summer.brfss.alt <- smoothArea(diab2~1, domain= ~hracode,
+ design = design,
+ transform = "logit",
+ adj.mat = mat, level = 0.95,
+ pc.u = 0.1, pc.alpha = 0.01)Finally, we use SUMMER::mapPlot to compare median
+estimates and uncertainty estimates obtained via sae and
+SUMMER.
toplot <- summer.brfss$bym2.model.est
+toplot$logit.var <- toplot$var /
+ (summer.brfss$bym2.model.est$median ^ 2 *
+ (1 - summer.brfss$bym2.model.est$median) ^ 2)
+toplot$median.alt <- summer.brfss.alt$bym2.model.est$median
+toplot$logit.var.alt <- summer.brfss.alt$bym2.model.est$var /
+ (summer.brfss.alt$bym2.model.est$median ^ 2 *
+ (1 - summer.brfss.alt$bym2.model.est$median) ^ 2)
+toplot$median.sae <- results$eblup.SFH
+toplot$mse.sae <- results$mse
+variables <- c("median", "median.alt", "median.sae",
+ "logit.var", "logit.var.alt", "mse.sae")
+names <- c("Median (default prior)", "Median (new prior)", "EBLUP (sae)",
+ "Variance (default prior)", "Variance (new prior)", "MSE (sae)")
+mapPlot(data = toplot, geo = KingCounty,
+ variables=variables[1:3],
+ labels = names[1:3], by.data = "domain",
+ by.geo = "HRA2010v2_", size = 0.1) mapPlot(data = toplot, geo = KingCounty,
+ variables=variables[4:6], labels = names[4:6],
+ by.data = "domain", by.geo = "HRA2010v2_", size = 0.1) par(mfrow = c(1, 2))
+range1 <- range(c(direct$diab2,toplot$median.alt))
+plot(direct$diab2,toplot$median,
+ xlab = "direct estimates",
+ ylab = "model-based estimates",
+ main = "Small area estimates", col = 'red', pch = 16,
+ xlim=range1,ylim=range1)
+points(direct$diab2,toplot$median.sae, col = 'blue', pch = 16)
+points(direct$diab2,toplot$median.alt, col = 'cyan', pch = 16)
+legend('topleft', pch = 16, col = c('red', 'cyan', 'blue'),
+ legend = c("SUMMER",'SUMMER (new prior)', "sae"),bty="n")
+abline(0,1)
+range2 <- range(c(direct$logit.var,toplot$mse.sae,toplot$logit.var.alt))
+plot(direct$logit.var,toplot$logit.var,
+ xlab = "direct estimate var.",
+ ylab = "model-based uncertainty",
+ main = "Small area estimates", col = 'red', pch = 16,
+ xlim=range2,ylim=range2)
+points(direct$logit.var,toplot$mse.sae, col = 'blue', pch = 16)
+points(direct$logit.var,toplot$logit.var.alt, col = 'cyan', pch = 16)
+legend('topleft', pch = 16, col = c('red', 'cyan','blue'),
+ legend = c("SUMMER var.", 'SUMMER var. (new prior)', "sae mse"),bty="n")
+abline(0,1)Nested error model: \[y_{dk}=\mathbf{x}_{dk}^T\boldsymbol\beta+u_d+\epsilon_{dk},\hspace{1em}u_d\sim_{ind}N(0,\sigma_u^2),\hspace{1em}\epsilon_{dk}\sim_{ind}N(0,\sigma_\epsilon^2)\]
Here \(u_d\) are area random effects and \(\epsilon_{dk}\) are unit level -errors.
-This model assumes there is no sample selection -bias.
+errors. This model assumes the sampling design is ignorable.The sae package conducts estimation of domain means by
first estimating variance parameters \(\sigma^2_u\) and \(\sigma^2_\epsilon\). Next, given known
variance parameters, domain means \(\theta_d\) are predicted by calculating the
@@ -951,46 +902,50 @@
MeanSoyBeansPixPerSeg provide the known county means of the
auxiliary variables.
We load the sample data:
- -## County CornHec SoyBeansHec CornPix SoyBeansPix
-## 1 1 166 8.1 374 55
-## 2 2 96 106.0 209 218
-## 3 3 76 103.6 253 250
-## 4 4 185 6.5 432 96
-## 5 4 116 63.8 367 178
-## 6 5 162 43.5 361 137
-## 7 5 152 71.4 288 206
-## 8 5 162 42.5 369 165
-## 9 6 93 105.3 206 218
-## 10 6 150 76.5 316 221
+
+## County CornHec SoyBeansHec CornPix SoyBeansPix
+## 1 1 165.76 8.09 374 55
+## 2 2 96.32 106.03 209 218
+## 3 3 76.08 103.60 253 250
+## 4 4 185.35 6.47 432 96
+## 5 4 116.43 63.82 367 178
+## 6 5 162.08 43.50 361 137
Next, we load the population auxiliary information:
-data("cornsoybeanmeans")
-Xmean <-
- data.frame(cornsoybeanmeans[, c("CountyIndex",
- "MeanCornPixPerSeg",
- "MeanSoyBeansPixPerSeg")])
-Popn <-
- data.frame(cornsoybeanmeans[, c("CountyIndex",
- "PopnSegments")])
-head(Xmean)data("cornsoybeanmeans")
+Xmean <-
+ data.frame(cornsoybeanmeans[, c("CountyIndex",
+ "MeanCornPixPerSeg",
+ "MeanSoyBeansPixPerSeg")])
+head(Xmean)## CountyIndex MeanCornPixPerSeg MeanSoyBeansPixPerSeg
-## 1 1 295 190
-## 2 2 300 197
-## 3 3 290 205
-## 4 4 291 220
-## 5 5 318 188
-## 6 6 257 247
+## 1 1 295.29 189.70
+## 2 2 300.40 196.65
+## 3 3 289.60 205.28
+## 4 4 290.74 220.22
+## 5 5 318.21 188.06
+## 6 6 257.17 247.13
+
+## CountyIndex PopnSegments
+## 1 1 545
+## 2 2 566
+## 3 3 394
+## 4 4 424
+## 5 5 564
+## 6 6 570
The sae::pbmseBHF function obtains EBLUPs under the
nested error model and then uses a parametric bootstrap approach to
estimate MSEs.
cornsoybean <- cornsoybean[-33, ] # remove outlier
-sae.bhf <-
- pbmseBHF(CornHec ~ CornPix + SoyBeansPix,
- dom = County, meanxpop = Xmean,
- popnsize = Popn, B = 200,
- data = cornsoybean)SUMMER::smoothSurvey provides the ability to fit unit
+
cornsoybean <- cornsoybean[-33, ] # remove outlier
+sae.bhf <-
+ pbmseBHF(CornHec ~ CornPix + SoyBeansPix,
+ dom = County, meanxpop = Xmean,
+ popnsize = Popn, B = 200,
+ data = cornsoybean)SUMMER::smoothUnit provides the ability to fit unit
level models with unit level covariates for Gaussian response variables.
Below we use the is.unit argument to specify a unit level
model and then provide the column names of the unit level covariates in
@@ -1003,157 +958,44 @@
cornsoybean$id <- 1:dim(cornsoybean)[1]
-Xsummer <- Xmean
-colnames(Xsummer) = c("County", "CornPix", "SoyBeansPix")
-des0 <- svydesign(ids = ~1, data = cornsoybean)
-summer.bhf.unit <- smoothUnit(formula = CornHec ~ CornPix + SoyBeansPix,
- family = "gaussian",
- domain = ~County,
- design = des0, X.pop = Xsummer,
- pc.u = 1000, pc.alpha = 0.01, level = 0.95)cornsoybean$id <- 1:dim(cornsoybean)[1]
+Xsummer <- Xmean
+colnames(Xsummer) = c("County", "CornPix", "SoyBeansPix")
+des0 <- svydesign(ids = ~1, data = cornsoybean)
+## Warning in svydesign.default(ids = ~1, data = cornsoybean): No weights or
+## probabilities supplied, assuming equal probability
+summer.bhf.unit <- smoothUnit(formula = CornHec ~ CornPix + SoyBeansPix,
+ family = "gaussian",
+ domain = ~County,
+ design = des0, X.pop = Xsummer,
+ pc.u = 1000, pc.alpha = 0.01, level = 0.95)
+## Warning in smoothUnit(formula = CornHec ~ CornPix + SoyBeansPix, family =
+## "gaussian", : No spatial information provided, using iid domain effectsBelow, we plot comparisons of the sae and
SUMMER results.
par(mfrow = c(1, 2))
-range1 <- range(c(sae.bhf$est$eblup$eblup,summer.bhf.unit$median))
-plot(sae.bhf$est$eblup$eblup,summer.bhf.unit$iid.model.est$median,
- xlab = "sae package",
- ylab = "SUMMER unit-level model package",
- main = "Small area estimates",
- xlim=range1,ylim=range1)
-abline(0,1)
-range2 <- range(c(sae.bhf$mse$mse, summer.bhf.unit$var))
-plot(sae.bhf$mse$mse, summer.bhf.unit$iid.model.est$var,
- xlab = "sae package mse",
- ylab = "SUMMER unit-level model package model variance",
- main = "Estimates of mse/variance",
- xlim=range2,ylim=range2)
-abline(0,1)Below, we provide an example comparing spatial models from
-sae and SUMMER using data from the Behavioral
-Risk Factor Surveillance System (BRFSS).
library(patchwork)
-data(BRFSS)
-data(KingCounty)
-BRFSS <- subset(BRFSS, !is.na(BRFSS$diab2))
-BRFSS <- subset(BRFSS, !is.na(BRFSS$hracode))
-mat <- getAmat(KingCounty, KingCounty$HRA2010v2_)
-design <- svydesign(ids = ~1, weights = ~rwt_llcp,
- strata = ~strata, data = BRFSS)
-direct <- svyby(~diab2, ~hracode, design, svymean)saeBelow, we use sae to smooth the logit-transformed direct
-estimates.
direct$var <- direct$se ^ 2
-direct$logit.diab2 <- SUMMER::logit(direct$diab2)
-direct$logit.var <- direct$var / (direct$diab2 ^ 2 * (1 - direct$diab2) ^ 2)
-SFH.brfss <- sae::mseSFH(logit.diab2 ~ 1, logit.var, mat, data = direct)
-
-results <- data.frame(region = direct$hracode,
- eblup.SFH = SUMMER::expit(SFH.brfss$est$eblup),
- mse = SFH.brfss$mse)SUMMERBelow, we fit two versions of the spatial area levelmodel in
-SUMMER. If we change pc.u and
-pc.alpha from the default value \(u=1,\alpha=0.01\) to \(u=0.1,\alpha=0.01\), we assign more prior
-mass on smaller variance of the random effects, inducing more
-smoothing.
summer.brfss <- smoothArea(diab2~1, domain= ~hracode,
- design = design,
- adj.mat = mat)
-summer.brfss.alt <- smoothArea(diab2~1, domain= ~hracode,
- design = design,
- adj.mat = mat, level = 0.95,
- pc.u = 0.1, pc.alpha = 0.01)Finally, we use SUMMER::mapPlot to compare median
-estimates and uncertainty estimates obtained via sae and
-SUMMER.
toplot <- summer.brfss$bym2.model.est
-toplot$logit.var <- toplot$var /
- (summer.brfss$bym2.model.est$median ^ 2 *
- (1 - summer.brfss$bym2.model.est$median) ^ 2)
-toplot$median.alt <- summer.brfss.alt$bym2.model.est$median
-toplot$logit.var.alt <- summer.brfss.alt$bym2.model.est$var /
- (summer.brfss.alt$bym2.model.est$median ^ 2 *
- (1 - summer.brfss.alt$bym2.model.est$median) ^ 2)
-toplot$median.sae <- results$eblup.SFH
-toplot$mse.sae <- results$mse
-variables <- c("median", "median.alt", "median.sae",
- "logit.var", "logit.var.alt", "mse.sae")
-names <- c("Median (default prior)", "Median (new prior)", "EBLUP (sae)",
- "Variance (default prior)", "Variance (new prior)", "MSE (sae)")
-g1 <- mapPlot(data = toplot, geo = KingCounty,
- variables=variables[1:3],
- labels = names[1:3], by.data = "domain",
- by.geo = "HRA2010v2_", size = 0.1)
-g2 <- mapPlot(data = toplot, geo = KingCounty,
- variables=variables[4:6], labels = names[4:6],
- by.data = "domain", by.geo = "HRA2010v2_", size = 0.1)
-g1 / g2par(mfrow = c(1, 2))
-range1 <- range(c(direct$diab2,toplot$median.alt))
-plot(direct$diab2,toplot$median,
- xlab = "direct estimates",
- ylab = "model-based estimates",
- main = "Small area estimates", col = 'red', pch = 16,
- xlim=range1,ylim=range1)
-points(direct$diab2,toplot$median.sae, col = 'blue', pch = 16)
-points(direct$diab2,toplot$median.alt, col = 'cyan', pch = 16)
-legend('topleft', pch = 16, col = c('red', 'cyan', 'blue'),
- legend = c("SUMMER",'SUMMER (new prior)', "sae"),bty="n")
-abline(0,1)
-range2 <- range(c(direct$logit.var,toplot$mse.sae,toplot$logit.var.alt))
-plot(direct$logit.var,toplot$logit.var,
- xlab = "direct estimate var.",
- ylab = "model-based uncertainty",
- main = "Small area estimates", col = 'red', pch = 16,
- xlim=range2,ylim=range2)
-points(direct$logit.var,toplot$mse.sae, col = 'blue', pch = 16)
-points(direct$logit.var,toplot$logit.var.alt, col = 'cyan', pch = 16)
-legend('topleft', pch = 16, col = c('red', 'cyan','blue'),
- legend = c("SUMMER var.", 'SUMMER var. (new prior)',"sae mse"),bty="n")
-abline(0,1)par(mfrow = c(1, 2))
+range1 <- range(c(sae.bhf$est$eblup$eblup,summer.bhf.unit$median))
+plot(sae.bhf$est$eblup$eblup,summer.bhf.unit$iid.model.est$median,
+ xlab = "sae package",
+ ylab = "SUMMER unit-level model package",
+ main = "Small area estimates",
+ xlim=range1,ylim=range1)
+abline(0,1)
+range2 <- range(c(sae.bhf$mse$mse, summer.bhf.unit$var))
+plot(sae.bhf$mse$mse, summer.bhf.unit$iid.model.est$var,
+ xlab = "sae package mse",
+ ylab = "SUMMER unit-level model package model variance",
+ main = "Estimates of mse/variance",
+ xlim=range2,ylim=range2)
+abline(0,1)