diff --git a/DESCRIPTION b/DESCRIPTION index d861ec7..7b04812 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: SUMMER Type: Package Title: Small-Area-Estimation Unit/Area Models and Methods for Estimation in R -Version: 1.3.1 -Date: 2022-07-24 +Version: 1.4.0 +Date: 2023-10-24 Authors@R: c( person(given = "Zehang R", family = "Li", email= "lizehang@gmail.com", role = c("cre","aut")), person(given = "Bryan D", family = "Martin", email= "bmartin6@uw.edu", role = "aut"), @@ -15,17 +15,17 @@ Authors@R: c( person(given = "Geir-Arne", family = "Fuglstad", email = "geir-arne.fuglstad@ntnu.no", role = "aut"), person(given = "Andrea", family = "Riebler", email = "andrea.riebler@ntnu.no", role = "aut") ) -Description: Provides methods for spatial and spatio-temporal smoothing of demographic and health indicators using survey data, with particular focus on estimating and projecting under-five mortality rates, described in Mercer et al. (2015) , Li et al. (2019) and Li et al. (2020) . +Description: Provides methods for spatial and spatio-temporal smoothing of demographic and health indicators using survey data, with particular focus on estimating and projecting under-five mortality rates, described in Mercer et al. (2015) , Li et al. (2019) , Wu et al. (DHS Spatial Analysis Reports No. 21, 2021), and Li et al. (2023) . URL: https://github.com/richardli/SUMMER BugReports: https://github.com/richardli/SUMMER/issues Depends: R (>= 3.5) License: GPL (>= 2) -Imports: survey, stats, spdep, survival, ggplot2, scales, utils, Matrix, reshape2, viridis, sp, sf, shadowtext, ggridges, methods, data.table, RColorBrewer, grDevices +Imports: survey, stats, spdep, survival, ggplot2, scales, utils, Matrix, reshape2, viridis, sp, sf, shadowtext, ggridges, methods, data.table, RColorBrewer, grDevices, fields, terra Encoding: UTF-8 LazyData: true RoxygenNote: 7.2.3 Additional_repositories: https://inla.r-inla-download.org/R/testing/ -Suggests: INLA, knitr, rmarkdown, readstata13, patchwork, rdhs, R.rsp, fields, sae, dplyr, tidyr +Suggests: INLA, knitr, rmarkdown, readstata13, patchwork, rdhs, R.rsp, sae, dplyr, tidyr, raster VignetteBuilder: R.rsp, knitr NeedsCompilation: no Config/build/clean-inst-doc: FALSE diff --git a/NAMESPACE b/NAMESPACE index cb41b25..44d5b6a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,26 +9,40 @@ S3method(summary,SUMMERmodel.svy) S3method(summary,SUMMERprojlist) export(Benchmark) export(ChangeRegion) +export(adjustPopMat) export(aggregateSurvey) +export(areaPopToArea) +export(calibrateByRegion) export(expit) export(fitGeneric) export(fitINLA) export(fitINLA2) export(getAdjusted) export(getAmat) +export(getAreaName) export(getBirths) export(getCounts) export(getDiag) export(getDirect) export(getDirectList) +export(getPoppsub) export(getSmoothed) export(getSmoothedDensity) export(hatchPlot) export(logit) +export(logitNormMean) +export(makePopIntegrationTab) export(mapPlot) export(mapPoints) +export(pixelPopToArea) +export(poppRegionFromPopMat) +export(projKenya) export(ridgePlot) export(rst) +export(setThresholdsByRegion) +export(simPopCustom) +export(simPopSPDE) +export(simSPDE) export(simhyper) export(smoothArea) export(smoothCluster) @@ -40,6 +54,10 @@ import(Matrix) import(RColorBrewer) import(data.table) importFrom(Matrix,Diagonal) +importFrom(fields,fields.rdist.near) +importFrom(fields,in.poly) +importFrom(fields,make.surface.grid) +importFrom(fields,rdist) importFrom(ggplot2,aes) importFrom(ggplot2,geom_errorbar) importFrom(ggplot2,geom_line) @@ -60,12 +78,19 @@ importFrom(sf,st_centroid) importFrom(sf,st_coordinates) importFrom(sf,st_crs) importFrom(shadowtext,geom_shadowtext) +importFrom(sp,"proj4string<-") +importFrom(sp,CRS) +importFrom(sp,SpatialPoints) +importFrom(sp,as.SpatialPolygons.PolygonsList) +importFrom(sp,coordinates) +importFrom(sp,over) importFrom(sp,plot) importFrom(sp,proj4string) importFrom(spdep,nb2mat) importFrom(spdep,poly2nb) importFrom(stats,aggregate) importFrom(stats,as.formula) +importFrom(stats,cor) importFrom(stats,dgamma) importFrom(stats,dnorm) importFrom(stats,median) @@ -79,4 +104,6 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,var) +importFrom(terra,extract) +importFrom(terra,gdal) importFrom(viridis,viridis_pal) diff --git a/NEWS.md b/NEWS.md index b20d75e..f910617 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,16 @@ # SUMMER - changes + + +Version 1.4.0 +========================== ++ New major functions for population and prevalence simulation based on population frame and population density information, along with methods for aggregating pixel level prevalences and populations to the areal level. + + +Version 1.3.0 +========================== ++ New SAE functions and vignette. + + Version 1.2.1 ========================== + `smoothSurvey` now can return posterior draws. diff --git a/R/.DS_Store b/R/.DS_Store new file mode 100755 index 0000000..5008ddf Binary files /dev/null and b/R/.DS_Store differ diff --git a/R/getAreaName.R b/R/getAreaName.R new file mode 100755 index 0000000..5dbf01b --- /dev/null +++ b/R/getAreaName.R @@ -0,0 +1,142 @@ +#' Determines which administrative areas contain the given points +#' +#' For any points not in an area, they are assigned the nearest area using +#' fields::fields.rdist.near or fields::rdist depending on the number of points +#' and the maximum memory in bytes with a warning. +#' +#' @param pts 2 column matrix of lon/lat coordinates +#' @param shapefile A SpatialPolygonsDataFrame object +#' @param areaNameVar The column name in \code{slot(shapefile, "data")} +#' corresponding to the area level of interest +#' @param delta Argument passed to fields::fields.rdist.near in fields package +#' @param mean.neighbor Argument passed to fields::fields.rdist.near in fields +#' package +#' @param maxBytes Maximum allowed memory in bytes (default is 3Gb). Determines +#' whether to call fields::fields.rdist.near which saves memory bute requires +#' delta and mean.neighbor inputs to be specified or fields::rdist. +#' +#' @details delta and mean.neighbor arguments only used when some points +#' are not in areas, perhaps due to inconsistencies in shapefiles. +#' +#' @author John Paige +#' +#' @seealso \code{\link{projKenya}}, \code{\link[fields]{fields.rdist.near}} +#' +#' @examples +#' \dontrun{ +#' # download Kenya GADM shapefiles from SUMMERdata github repository +#' githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +#' download.file(githubURL,"kenyaMaps.rda") +#' +#' # load it in +#' load("kenyaMaps.rda") +#' +#' # use the shapefile data to see what Admin1 and 2 areas the +#' # points (0, 37) and (0.5, 38) are in +#' # (these are longitude/latitude coordinates) +#' pts = cbind(c(37, 38), c(0, .5)) +#' head(slot(adm1, "data")) +#' admin1Areas = getAreaName(pts, adm1, "NAME_1") +#' admin2Areas = getAreaName(pts, adm2, "NAME_2") +#' } +#' +#' @return A list of area IDs, area names, whether or not +#' points are in multiple areas, and whether or not points +#' are in no areas and assigned to the nearest one. +#' +#' @importFrom fields fields.rdist.near +#' @importFrom fields rdist +#' @importFrom fields in.poly +#' @export +getAreaName = function(pts, shapefile, areaNameVar="NAME_1", + delta=.05, mean.neighbor=50, maxBytes=3*2^30) { + # require(fields) + + if(nrow(pts) == 1) { + # must work around the fact that fields::in.poly() doesn't work for matrix with 1 row + pts = rbind(pts, pts) + oneRow = TRUE + } else { + oneRow = FALSE + } + + # data(kenyaMaps, envir = environment()) + areaNames = shapefile@data[[areaNameVar]] + + # get area map polygons and set helper function for testing if pts are in the constituencies + polys = shapefile@polygons + inRegion = function(i) { + countyPolys = polys[[i]]@Polygons + inside = sapply(1:length(countyPolys), function(x) { + fields::in.poly(pts, countyPolys[[x]]@coords, inflation=0) + }) + insideAny = apply(inside, 1, any) + + return(insideAny*i) + } + out = sapply(1:length(polys), inRegion) + if(oneRow) { + out = matrix(out[1,], nrow=1) + } + multipleRegs = apply(out, 1, function(vals) {sum(vals != 0) > 1}) + areaID = apply(out, 1, function(vals) {match(1, vals != 0)}) + areaNameVec = areaNames[areaID] + + # for all points not in a area polygon, determine the nearest area + insideAny = apply(out, 1, function(x) {any(x != 0)}) + if(any(!insideAny)) { + warning("Some points not inside any areas. Assigning them to nearest area") + problemPointsI = which(!insideAny) + + bytesUsed = length(problemPointsI) * nrow(pts) + if(bytesUsed > maxBytes) { + # get nearby points (points within delta lon/lat units), remove self matches + nearbyPoints = fields::fields.rdist.near(matrix(pts[problemPointsI,], ncol=2), pts, + delta=delta, mean.neighbor=mean.neighbor) + selfI = nearbyPoints$ra == 0 + nearbyPoints$ind = nearbyPoints$ind[!selfI,] + nearbyPoints$ra = nearbyPoints$ra[!selfI] + nearbyI = lapply(sort(unique(nearbyPoints$ind[,1])), function(x) { + nearbyPoints$ind[nearbyPoints$ind[,1] == x,2] + }) + } else { + # get all points, remove self matches + dists = fields::rdist(matrix(pts[problemPointsI,], ncol=2), pts) + dists[cbind(1:length(problemPointsI), problemPointsI)] = Inf + nearbyI = apply(dists, 1, which.min) + } + + # get nearby constituencies, counties, and distances + nearbyAreas = lapply(nearbyI, function(x) {areaNameVec[x]}) + nearbyLengths = sapply(nearbyI, function(x) {length(x)}) + + if(bytesUsed > maxBytes) { + nearbyDistances = c() + startI = 1 + for(i in 1:length(nearbyI)) { + endI = startI + nearbyLengths[i] - 1 + nearbyDistances = c(nearbyDistances, list(nearbyPoints$ra[startI:endI])) + startI = endI + 1 + } + } else { + nearbyDistances = dists[cbind(1:nrow(dists), nearbyI)] + } + + + # sort nearby constituencies and indices by distance + for(i in 1:length(nearbyI)) { + thisDistances = nearbyDistances[[i]] + sortI = sort(thisDistances, index.return=TRUE)$ix + nearbyDistances[[i]] = nearbyDistances[[i]][sortI] + nearbyAreas[[i]] = nearbyAreas[[i]][sortI] + nearbyI[[i]] = nearbyI[[i]][sortI] + } + + # get nearest non-NA area and assign it + closestArea = sapply(nearbyAreas, function(x) {x[match(TRUE, !is.na(x))]}) + areaNameVec[problemPointsI] = closestArea + } + + list(areaID=areaID, areaNames=areaNameVec, + multipleRegs=multipleRegs, notInAnyAreas=!insideAny) +} \ No newline at end of file diff --git a/R/kenyaPopulationData.R b/R/kenyaPopulationData.R new file mode 100755 index 0000000..2da2a9d --- /dev/null +++ b/R/kenyaPopulationData.R @@ -0,0 +1,27 @@ +#' Kenya 2009 Census Frame and Related Datasets +#' +#' Datasets related to the 2009 census frame for Kenya based on the 2009 Kenya Population and Housing +#' Census. General population totals are estimated for 2014. Based on 2014 population density +#' estimates interpolated with exponential growth rate between 2010 and 2015 from WorldPop data. +#' +#' @format A number of data.frames with information about the 2009 Kenya Population and Housing Census and the population in Kenya at the time of the 2014 Demographic Health Survey. Some of the +#' data.frames have been adjusted to contain information about neonatals born from 2010-2014 rather than general population in 2014. +#' The dataset names are: easpaKenya, easpaKenyaNeonatal, +#' poppaKenya, and poppsubKenya. +#' @references Kenya National Bureau of Statistics, Ministry of Health/Kenya, National AIDS Control Council/Kenya, Kenya Medical Research Institute, and National Council For Population And Development/Kenya, 2015. Kenya Demographic and Health Survey 2014. Rockville, Maryland, USA. URL: http://dhsprogram.com/pubs/pdf/FR308/FR308.pdf. +#' @references Stevens, F.R., Gaughan, A.E., Linard, C., Tatem, A.J., 2015. Disaggregat- ing census data for population mapping using random forests with remotely-sensed and ancillary data. PloS One 10, e0107042. +#' @references Tatem, A.J., 2017. WorldPop, open data for spatial demography. Scientific Data 4. +#' @source +#' @docType data +#' @name kenyaPopulationData +#' @usage data(kenyaPopulationData) +"easpaKenya" + +#' @rdname kenyaPopulationData +"easpaKenyaNeonatal" + +#' @rdname kenyaPopulationData +"poppaKenya" + +#' @rdname kenyaPopulationData +"poppsubKenya" \ No newline at end of file diff --git a/R/logitNormMean.R b/R/logitNormMean.R new file mode 100755 index 0000000..d380123 --- /dev/null +++ b/R/logitNormMean.R @@ -0,0 +1,57 @@ +#' Calculate the mean of a distribution whose +#' logit is Gaussian +#' +#' Adapted from logitnorm package. Calculates the mean of a distribution whose +#' logit is Gaussian. Each row of muSigmaMat is a mean and standard deviation +#' on the logit scale. +#' +#' @param muSigmaMat An n x 2 matrix where each row is \eqn{\mu}{mu} and \eqn{\sigma}{sigma} +#' on the logit scale for an independent random variable. +#' @param logisticApprox Whether or not to use logistic approximation to speed +#' up computation. See details for more information. +#' @param ... More arguments, passed to \code{integrate} function +#' +#' @return A vector of expectations of the specified random variables +#' +#' @author John Paige +#' +#' @details If \eqn{\mbox{logit}(Y) \sim N(\mu, \sigma^2)}{logit(Y) ~ N(mu, sigma^2)}, +#' This function calculates \eqn{E[Y]}{E[Y]} via either numerical integration or by +#' assuming that Y follows a logistic distribution. Under this approximation, setting +#' \eqn{k = 16 \sqrt(3) / (15 \pi)}{k = 16 * sqrt(3) / (15 * pi)}, we approximate +#' the expectation as: +#' \deqn{E[Y] = expit(\mu / \sqrt(1 + k^2 \sigma^2))}{E[Y] = expit(mu / sqrt(1 + k^2 * sigma^2))}. +#' The above logistic approximation speeds up the computation, but also sacrifices +#' some accuracy. +#' +#' @examples +#' mus = c(-5, 0, 5) +#' sigmas = rep(1, 3) +#' logitNormMean(cbind(mus, sigmas)) +#' logitNormMean(cbind(mus, sigmas), TRUE) +#' +#' @export +logitNormMean = function(muSigmaMat, logisticApprox=FALSE, ...) { + if(length(muSigmaMat) > 2) { + apply(muSigmaMat, 1, logitNormMean, logisticApprox=logisticApprox, ...) + } + else { + mu = muSigmaMat[1] + sigma = muSigmaMat[2] + if(sigma == 0) + SUMMER::expit(mu) + else { + if(any(is.na(c(mu, sigma)))) + NA + else if(!logisticApprox) { + # numerically calculate the mean + fExp <- function(x) exp(stats::plogis(x, log.p=TRUE) + stats::dnorm(x, mean = mu, sd = sigma, log=TRUE)) + stats::integrate(fExp, mu-10*sigma, mu+10*sigma, abs.tol = 0, ...)$value + } else { + # use logistic approximation + k = 16 * sqrt(3) / (15 * pi) + SUMMER::expit(mu / sqrt(1 + k^2 * sigma^2)) + } + } + } +} \ No newline at end of file diff --git a/R/popGrid.R b/R/popGrid.R new file mode 100755 index 0000000..5c042ea --- /dev/null +++ b/R/popGrid.R @@ -0,0 +1,1276 @@ +#' Generating pixellated populations, and population frames +#' +#' Functions for generating pixellated population information and +#' population frames at the `area` and `subarea` levels. +#' The `area` and `subarea` levels can be thought of as big +#' regions and little regions, where areas can be partitioned into +#' unique sets of subareas. For example, Admin-1 and Admin-2 +#' areas might be areas and subareas respectively. The population totals are either +#' tabulated at the area x urban/rural level, the subarea x urban/rural +#' level, or at the pixel level of a specified resolution. Totals +#' are calculated using population density information, shapefiles, +#' and, possibly, preexisting population frames at different +#' areal levels. Note that area names should each be unique, and similarly for +#' subarea names. +#' +#' @describeIn makePopIntegrationTab Generate pixellated `grid` of coordinates (both longitude/latitude and east/north) +#' over spatial domain of the given resolution with associated population totals, areas, subareas, +#' and urban/rural levels. For very small areas that might not +#' otherwise have a grid point in them, a custom integration point is added at their +#' centroid. Sets urbanicity classifications by thresholding input population density raster +#' using area and subarea population tables, and generates area and subarea population +#' tables from population density information if not already given. Can be used for integrating +#' predictions from the given coordinates to area and subarea levels using population weights. +#' +#' @param popMat Pixellated grid data frame with variables `area` and `pop` such as that +#' generated by \code{\link{makePopIntegrationTab}} +#' @param poppa data.frame of population per area separated by urban/rural. If `poppsub` +#' is not included, this is used for normalization of populations associated with +#' population integration points. Contains variables: +#' \describe{ +#' \item{area}{name of area} +#' \item{popUrb}{total urban (general) population of area} +#' \item{popRur}{total rural (general) population of area} +#' \item{popTotal}{total (general) population of area} +#' \item{pctUrb}{percentage of population in the area that is urban (between 0 and 100)} +#' } +#' @param poppsub data.frame of population per subarea separated by +#' urban/rural using for population normalization or urbanicity +#' classification. Often based on extra fine scale population density grid. +#' Has variables: +#' \describe{ +#' \item{subarea}{name of subarea} +#' \item{area}{name of area} +#' \item{popUrb}{total urban (general) population of subarea} +#' \item{popRur}{total rural (general) population of subarea} +#' \item{popTotal}{total (general) population of subarea} +#' \item{pctUrb}{percentage of population in the subarea that is urban (between 0 and 100)} +#' } +#' @param areapa A list with variables: +#' \describe{ +#' \item{area}{name of area} +#' \item{spatialArea}{spatial area of the subarea (e.g. in km^2)} +#' } +#' @param areapsub A list with variables: +#' \describe{ +#' \item{subarea}{name of subarea} +#' \item{spatialArea}{spatial area of the subarea (e.g. in km^2)} +#' } +#' @param kmRes The resolution of the pixelated grid in km +#' @param domainMapDat A shapefile representing the full spatial domain (e.g. country) +#' @param eastLim Range in km easting over the spatial domain under the input projection +#' @param northLim Range in km northing over the spatial domain under the input projection +#' @param mapProjection A projection function taking longitude and latitude and returning easting and +#' northing in km. Or the inverse if inverse is set to TRUE. For example, +#' \code{\link{projKenya}}. Check https://epsg.io/ for example for best projection EPSG codes +#' for specific countries +#' @param pop Population density raster +#' @param areaMapDat SpatialPolygonsDataFrame object with area level map information +#' @param subareaMapDat SpatialPolygonsDataFrame object with subarea level map information +#' @param areaNameVar The name of the area variable associated with \code{areaMapDat@data} +#' and \code{subareaMapDat@data} +#' @param subareaNameVar The name of the subarea variable associated with \code{subareaMapDat@data} +#' @param stratifyByUrban Whether to stratify the pixellated grid by urban/rural. If TRUE, +#' renormalizes population densities within areas or subareas crossed with urban/rural +#' @param poppaTarget Target population per area stratified by urban rural. Same format as poppa +#' @param adjustBy Whether to adjust population density by the `area` or `subarea` level +#' @param areaPolygonSubsetI Index in areaMapDat for a specific area to subset the grid over. This +#' option can help reduce computation time relative to constructing the whole grid +#' and subsetting afterwards +#' @param subareaPolygonSubsetI FOR EXPERIMENTAL PURPOSES ONLY. Index in subareaMapDat for a +#' specific area to subset the grid over. This +#' option can help reduce computation time relative to constructing the whole grid +#' and subsetting afterwards +#' @param customSubsetPolygons 'SpatialPolygonsDataFrame' or 'SpatialPolygons' object to subset +#' the grid over. This option can help reduce computation time relative to +#' constructing the whole grid and subsetting afterwards. `areaPolygonSubsetI` or +#' `subareaPolygonSubsetI` can be used when subsetting by areas or subareas in +#' `areaMapDat` or `subareaMapDat`. Must be in latitude/longitude projection "EPSG:4326" +#' @param returnPoppTables If TRUE, poppa and poppsub will be calculated based on the generated +#' population integration matrix and input area/subarea map data +#' @param mean.neighbor For determining what area or subarea points are nearest to if they do not +#' directly fall into an area. See \code{\link[fields]{fields.rdist.near}} for details. +#' @param delta For determining what area or subarea points are nearest to if they do not +#' directly fall into an area. See \code{\link[fields]{fields.rdist.near}} for details. +#' @param setNAsToZero If TRUE, sets NA populations to 0. +#' @param fixZeroPopDensitySubareas If TRUE, if population density in a subarea is estimated to be +#' zero, but the total population in the subarea is nonzero, population is filled into the +#' area uniformly +#' @param extractMethod Either 'bilinear' or 'simple'. see `method` from +#' \code{\link[terra]{extract}} +#' +#' @author John Paige +#' @seealso \code{\link{setThresholdsByRegion}}, \code{\link{poppRegionFromPopMat}}, \code{\link{simPopSPDE}}, \code{\link{simPopCustom}} +#' @examples +#' \dontrun{ +#' # download Kenya GADM shapefiles from SUMMERdata github repository +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "kenyaMaps.rda?raw=true") +#' tempDirectory = "~/" +#' mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +#' if(!file.exists(mapsFilename)) { +#' download.file(githubURL,mapsFilename) +#' } +#' +#' # load it in +#' out = load(mapsFilename) +#' out +#' adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' +#' # some Admin-2 areas have the same name +#' adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" +#' +#' # The spatial area of unknown 8 is so small, it causes problems unless its removed or +#' # unioned with another subarea. Union it with neighboring Kakeguria: +#' newadm2 = adm2 +#' unknown8I = which(newadm2$NAME_2 == "unknown 8") +#' newadm2$NAME_2[newadm2$NAME_2 %in% c("unknown 8", "Kapenguria")] <- +#' "Kapenguria + unknown 8" +#' admin2.IDs <- newadm2$NAME_2 +#' +#' library(maptools) +#' temp <- unionSpatialPolygons(newadm2, admin2.IDs) +#' tempData = newadm2@data[-unknown8I,] +#' tempData = tempData[order(tempData$NAME_2),] +#' newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +#' adm2 = newadm2 +#' +#' # download 2014 Kenya population density and associated TIF file +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "Kenya2014Pop/pop.rda?raw=true") +#' popFilename = paste0(tempDirectory, "/pop.rda") +#' if(!file.exists(popFilename)) { +#' download.file(githubURL,popFilename) +#' } +#' +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "Kenya2014Pop/worldpop_total_1y_2014_00_00.tif?raw=true") +#' popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' if(!file.exists(popTIFFilename)) { +#' download.file(githubURL,popTIFFilename) +#' } +#' +#' # load it in +#' require(terra) +#' out = load(popFilename) +#' out +#' +#' # make sure this is correct for re-projections +#' pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' +#' eastLim = c(-110.6405, 832.4544) +#' northLim = c(-555.1739, 608.7130) +#' +#' ## Construct poppsubKenya, a table of urban/rural general population totals +#' ## in each subarea. Technically, this is not necessary since we can load in +#' ## poppsubKenya via data(kenyaPopulationData). First, we will need to calculate +#' ## the areas in km^2 of the areas and subareas +#' +#' library(rgdal) +#' library(sp) +#' +#' # use Lambert equal area projection of areas (Admin-1) and subareas (Admin-2) +#' midLon = mean(adm1@bbox[1,]) +#' midLat = mean(adm1@bbox[2,]) +#' p4s = paste0("+proj=laea +x_0=0 +y_0=0 +lon_0=", midLon, +#' " +lat_0=", midLat, " +units=km") +#' +#' library(rgdal) +#' +#' adm1proj <- spTransform(adm1, CRS(p4s)) +#' adm2proj <- spTransform(adm2, CRS(p4s)) +#' +#' # now calculate spatial area in km^2 +#' library(rgeos) +#' admin1Areas = gArea(adm1proj, TRUE) +#' admin2Areas = gArea(adm2proj, TRUE) +#' areapaKenya = data.frame(area=adm1proj@data$NAME_1, spatialArea=admin1Areas) +#' areapsubKenya = data.frame(area=adm2proj@data$NAME_1, subarea=adm2proj@data$NAME_2, +#' spatialArea=admin2Areas) +#' +#' # Calculate general population totals at the subarea (Admin-2) x urban/rural +#' # level and using 1km resolution population grid. Assign urbanicity by +#' # thresholding population density based on estimated proportion population +#' # urban/rural, making sure total area (Admin-1) urban/rural populations in +#' # each area matches poppaKenya. +#' require(fields) +#' # NOTE: the following function will typically take ~20 minutes. Can speed up +#' # by setting kmRes to be higher, but we recommend fine resolution for +#' # this step, since it only needs to be done once. Instead of running this, +#' # you can simply run data(kenyaPopulationData) +#' system.time(poppsubKenya <- getPoppsub( +#' kmRes=1, pop=pop, domainMapDat=adm0, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, areapa=areapaKenya, areapsub=areapsubKenya, +#' areaMapDat=adm1, subareaMapDat=adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' # Now generate a general population integration table at 5km resolution, +#' # based on subarea (Admin-2) x urban/rural population totals. This takes +#' # ~1 minute +#' system.time(popMatKenya <- makePopIntegrationTab( +#' kmRes=5, pop=pop, domainMapDat=adm0, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, poppsub=poppsubKenya, +#' areaMapDat = adm1, subareaMapDat = adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' ## Adjust popMat to be target (neonatal) rather than general population density. First +#' ## creat the target population frame +#' ## (these numbers are based on IPUMS microcensus data) +#' mothersPerHouseholdUrb = 0.3497151 +#' childrenPerMotherUrb = 1.295917 +#' mothersPerHouseholdRur = 0.4787696 +#' childrenPerMotherRur = 1.455222 +#' targetPopPerStratumUrban = easpaKenya$HHUrb * mothersPerHouseholdUrb * childrenPerMotherUrb +#' targetPopPerStratumRural = easpaKenya$HHRur * mothersPerHouseholdRur * childrenPerMotherRur +#' easpaKenyaNeonatal = easpaKenya +#' easpaKenyaNeonatal$popUrb = targetPopPerStratumUrban +#' easpaKenyaNeonatal$popRur = targetPopPerStratumRural +#' easpaKenyaNeonatal$popTotal = easpaKenyaNeonatal$popUrb + easpaKenyaNeonatal$popRur +#' easpaKenyaNeonatal$pctUrb = 100 * easpaKenyaNeonatal$popUrb / easpaKenyaNeonatal$popTotal +#' easpaKenyaNeonatal$pctTotal = +#' 100 * easpaKenyaNeonatal$popTotal / sum(easpaKenyaNeonatal$popTotal) +#' +#' # Generate the target population density by scaling the current population density grid +#' # at the Admin1 x urban/rural level +#' popMatKenyaNeonatal = adjustPopMat(popMatKenya, easpaKenyaNeonatal) +#' +#' # Generate neonatal population table from the neonatal population integration matrix. +#' # This is technically not necessary for population simulation purposes, but is here +#' # for illustrative purposes +#' poppsubKenyaNeonatal = poppRegionFromPopMat(popMatKenyaNeonatal, popMatKenyaNeonatal$subarea) +#' poppsubKenyaNeonatal = cbind(subarea=poppsubKenyaNeonatal$region, +#' area=adm2@data$NAME_1[match(poppsubKenyaNeonatal$region, +#' adm2@data$NAME_2)], +#' poppsubKenyaNeonatal[,-1]) +#' print(head(poppsubKenyaNeonatal)) +#' } +#' @importFrom terra extract +#' @importFrom terra gdal +#' @importFrom sp SpatialPoints +#' @importFrom sp CRS +#' @importFrom sp over +#' @importFrom sp coordinates +#' @importFrom sp as.SpatialPolygons.PolygonsList +#' @importFrom sp proj4string<- +#' @importFrom fields make.surface.grid +#' @export +makePopIntegrationTab = function(kmRes=5, pop, domainMapDat, eastLim, northLim, mapProjection, + areaMapDat, subareaMapDat, + areaNameVar="NAME_1", subareaNameVar="NAME_2", + poppa=NULL, poppsub=NULL, stratifyByUrban=TRUE, + areapa=NULL, areapsub=NULL, customSubsetPolygons=NULL, + areaPolygonSubsetI=NULL, subareaPolygonSubsetI=NULL, + mean.neighbor=50, delta=.1, returnPoppTables=FALSE, + setNAsToZero=TRUE, fixZeroPopDensitySubareas=FALSE, + extractMethod="bilinear") { + thresholdUrbanBy = ifelse(is.null(poppsub), "area", "subarea") + + # some basic checks to make sure inputs are correct + + # make sure area and subarea names are unique + nameCounts = aggregate(areaMapDat@data[[areaNameVar]], by=list(area=areaMapDat@data[[areaNameVar]]), FUN=length) + if(any(nameCounts$x > 1)) { + stop("area names are not unique in areaMapDat") + } + if(!is.null(subareaMapDat)) { + nameCounts = aggregate(subareaMapDat@data[[subareaNameVar]], by=list(area=subareaMapDat@data[[subareaNameVar]]), FUN=length) + if(any(nameCounts$x > 1)) { + stop("subarea names are not unique in subareaMapDat") + } + } + if(!is.null(poppa)) { + nameCounts = aggregate(poppa$area, by=list(area=poppa$area), FUN=length) + if(any(nameCounts$x > 1)) { + stop("area names are not unique in poppa") + } + + # make sure area names match up + if(!all.equal(sort(poppa$area), sort(areaMapDat@data[[areaNameVar]]))) { + stop("area names in poppa do not match those in areaMapDat@data") + } + } + if(!is.null(poppsub)) { + nameCounts = aggregate(poppsub$subarea, by=list(area=poppsub$subarea), FUN=length) + if(any(nameCounts$x > 1)) { + stop("subarea names are not unique in poppsub") + } + + # make sure subarea names match up + if(!is.null(subareaMapDat)) { + if(!all.equal(sort(poppsub$subarea), sort(subareaMapDat@data[[subareaNameVar]]))) { + stop("subarea names in poppsub do not match those in subareaMapDat@data") + } + } + } + + # get a rectangular grid + eastGrid = seq(eastLim[1], eastLim[2], by=kmRes) + northGrid = seq(northLim[1], northLim[2], by=kmRes) + + if(!is.null(areaPolygonSubsetI)) { + # get range of the grid that we actually need + temp = areaMapDat@polygons[[areaPolygonSubsetI]] + allPolygons = temp@Polygons + eastNorthCoords = do.call("rbind", lapply(1:length(allPolygons), function(i) {mapProjection(allPolygons[[i]]@coords)})) + eastSubRange = range(eastNorthCoords[,1]) + northSubRange = range(eastNorthCoords[,2]) + + # subset grid to the range we need + eastGrid = eastGrid[eastGrid >= eastSubRange[1]] + eastGrid = eastGrid[eastGrid <= eastSubRange[2]] + northGrid = northGrid[northGrid >= northSubRange[1]] + northGrid = northGrid[northGrid <= northSubRange[2]] + } + + if(!is.null(subareaPolygonSubsetI)) { + # get range of the grid that we actually need + temp = subareaMapDat@polygons[[subareaPolygonSubsetI]] + allPolygons = temp@Polygons + eastNorthCoords = do.call("rbind", lapply(1:length(allPolygons), function(i) {mapProjection(allPolygons[[i]]@coords)})) + eastSubRange = range(eastNorthCoords[,1]) + northSubRange = range(eastNorthCoords[,2]) + + # subset grid to the range we need + eastGrid = eastGrid[eastGrid >= eastSubRange[1]] + eastGrid = eastGrid[eastGrid <= eastSubRange[2]] + northGrid = northGrid[northGrid >= northSubRange[1]] + northGrid = northGrid[northGrid <= northSubRange[2]] + } + + if(!is.null(customSubsetPolygons)) { + if(("SpatialPolygons" %in% class(customSubsetPolygons)) || ("SpatialPolygonsDataFrame" %in% class(customSubsetPolygons))) { + + # calculate the east/north range of each polygon + getPolygonENrange = function(pol) { + # get range of the grid that we actually need + temp = pol + allPolygons = temp@Polygons + eastNorthCoords = do.call("rbind", lapply(1:length(allPolygons), function(i) {mapProjection(allPolygons[[i]]@coords)})) + eastSubRange = range(eastNorthCoords[,1]) + northSubRange = range(eastNorthCoords[,2]) + + # subset grid to the range we need + rbind(eastSubRange, + northSubRange) + } + allRanges = lapply(getPolygonENrange, customSubsetPolygons@polygons) + + # calculate the full range based on each individual polygon range + eastings = sapply(allRanges, function(x) {x[1,]}) + northings = sapply(allRanges, function(x) {x[2,]}) + eastSubRange = range(eastings) + northSubRange = range(northings) + + # subset grid to the range we need + eastGrid = eastGrid[eastGrid >= eastSubRange[1]] + eastGrid = eastGrid[eastGrid <= eastSubRange[2]] + northGrid = northGrid[northGrid >= northSubRange[1]] + northGrid = northGrid[northGrid <= northSubRange[2]] + } else { + stop("customSubsetPolygons must be of class 'SpatialPolygons' or 'SpatialPolygonsDataFrame'") + } + } + + utmGrid = matrix(fields::make.surface.grid(list(east=eastGrid, north=northGrid)), ncol=2) + + # project coordinates into lat/lon + if(length(utmGrid) > 0) { + lonLatGrid = matrix(mapProjection(utmGrid, inverse=TRUE), ncol=2) + } else { + warning(paste0("no grid cell centroid are in the areas of interest. ", + "Integration grid will be composed entirely of custom ", + "integration points at the centroids of the areas of interest")) + lonLatGrid = utmGrid + } + + # subset grid so it's in the domain + # inDomain = in.poly(lonLatGrid, domainPoly) + # determine version of PROJ + ver = terra::gdal(lib="proj") + PROJ6 <- as.numeric(substr(ver, 1, 1)) >= 6 + + # from lon/lat coords to easting/northing + if(!PROJ6) { + lonLatCoords = sp::SpatialPoints(lonLatGrid, proj4string=sp::CRS("+proj=longlat")) + } else { + lonLatCoords = sp::SpatialPoints(lonLatGrid, proj4string=sp::CRS(SRS_string="EPSG:4326")) + } + inDomain = sp::over(lonLatCoords, domainMapDat) + inDomain = !is.na(inDomain[,1]) + utmGrid = matrix(utmGrid[inDomain,], ncol=2) + lonLatGrid = matrix(lonLatGrid[inDomain,], ncol=2) + + # compute areas associated with locations + if(length(lonLatGrid) > 0) { + if(!is.null(subareaMapDat)) { + subareas = SUMMER::getAreaName(lonLatGrid, subareaMapDat, areaNameVar=subareaNameVar, mean.neighbor=mean.neighbor, delta=delta)$areaNames + } else { + subareas = NULL + } + + if(!is.null(areaMapDat)) { + areas = SUMMER::getAreaName(lonLatGrid, areaMapDat, areaNameVar=areaNameVar, mean.neighbor=mean.neighbor, delta=delta)$areaNames + } else { + areas = NULL + } + + } else { + subareas = character(0) + areas = character(0) + } + + if(!is.null(areaPolygonSubsetI)) { + areaSubsetName = areaMapDat@data[areaPolygonSubsetI,areaNameVar] + insideArea = areas == areaSubsetName + + # subset grid and area/subarea names so they're in the area of interest + utmGrid = matrix(utmGrid[insideArea,], ncol=2) + lonLatGrid = matrix(lonLatGrid[insideArea,], ncol=2) + areas = areas[insideArea] + subareas = subareas[insideArea] + } + + if(!is.null(subareaPolygonSubsetI)) { + subareaSubsetName = subareaMapDat@data[subareaPolygonSubsetI,subareaNameVar] + insideSubarea = subareas == subareaSubsetName + + # subset grid and area/subarea names so they're in the area of interest + utmGrid = matrix(utmGrid[insideSubarea,], ncol=2) + lonLatGrid = matrix(lonLatGrid[insideSubarea,], ncol=2) + areas = areas[insideSubarea] + subareas = subareas[insideSubarea] + } + + if(!is.null(customSubsetPolygons)) { + if(!grepl("+proj=longlat", customSubsetPolygons@proj4string@projargs)) { + warning(paste0("customSubsetPolygons does not use longitude/latitude ", + "coordinate system? Using makePopIntegrationTab outside ", + "its use case, which may result in errors")) + } + + if(!PROJ6) { + lonLatCoords = sp::SpatialPoints(lonLatGrid, proj4string=sp::CRS("+proj=longlat")) + } else { + lonLatCoords = sp::SpatialPoints(lonLatGrid, proj4string=sp::CRS(SRS_string="EPSG:4326")) + } + insideCustomSubset = sp::over(lonLatCoords, customSubsetPolygons) + + # subset grid and area/subarea names so they're in the area of interest + utmGrid = matrix(utmGrid[insideCustomSubset,], ncol=2) + lonLatGrid = matrix(lonLatGrid[insideCustomSubset,], ncol=2) + areas = areas[insideCustomSubset] + + if(!is.null(subareaMapDat)) { + subareas = subareas[insideCustomSubset] + } + } + + # determine what subareas we want our grid to contain + if(!is.null(subareaMapDat)) { + allSubareas = sort(unique(subareaMapDat@data[[subareaNameVar]])) + if(!is.null(areaPolygonSubsetI)) { + allSubareas = sort(unique(subareaMapDat@data[subareaMapDat@data[[areaNameVar]] == areaSubsetName,][[subareaNameVar]])) + } + if(!is.null(subareaPolygonSubsetI)) { + allSubareas = subareaSubsetName + } + if(!is.null(customSubsetPolygons)) { + allSubareas = sort(unique(subareas)) + } + } + + # filter out parts of poppsub that are irrelevant for the subareas of + # interest + if(!is.null(poppsub)) { + poppsub = poppsub[poppsub$subarea %in% allSubareas,] + } + + if(!is.null(subareaMapDat)) { + # check to make sure every subarea has at least 2 pixels + subareasFactor = factor(subareas, levels=allSubareas) + if(length(lonLatGrid) > 0) { + out = aggregate(subareas, by=list(subarea=subareasFactor), FUN=length, drop=FALSE) + } else { + out = data.frame(subarea=sort(unique(subareaMapDat@data[[subareaNameVar]])), + x=NA) + } + noPixels = is.na(out$x) + onePixel = out$x == 1 + onePixel[is.na(onePixel)] = FALSE + onePixelNames = out$subarea[onePixel] + badSubareas = noPixels | onePixel + badSubareaNames = as.character(out$subarea[badSubareas]) + + if(any(badSubareas)) { + badSubareaString = paste(badSubareaNames, collapse=", ") + warning(paste0("The following subareas have < 2 regular grid points ", + "at the given resolution: ", badSubareaString, ", so ", + "they will be given custom integration points")) + + # get centroids of the subareas (or it's single pixel coordinates) + thisSpatialPolyList = sp::as.SpatialPolygons.PolygonsList(subareaMapDat@polygons) + centroidsLonLat = matrix(ncol=2, nrow=length(allSubareas)) + + for(i in 1:length(allSubareas)) { + thisSubarea = allSubareas[i] + if(thisSubarea %in% onePixelNames) { + thisCentroid = lonLatGrid[subareas == thisSubarea,] + } else { + subareaI = match(as.character(thisSubarea), as.character(subareaMapDat@data[[subareaNameVar]])) + thisCentroid = sp::coordinates(thisSpatialPolyList[subareaI]) + } + + centroidsLonLat[i,] = thisCentroid + } + + # sort to match results of aggregate (alphabetical order) + # sortI = sort(as.character(subareaMapDat@data[[subareaNameVar]]), index.return=TRUE)$ix + sortI = sort(as.character(allSubareas), index.return=TRUE)$ix + centroidsLonLat = centroidsLonLat[sortI,] + + # remove the one pixel for subareas with only one pixel + # (we will add it in again later, twice if stratified: both urban and rural) + onePixel = which(subareas %in% onePixelNames) + if(length(onePixel) > 0) { + lonLatGrid = lonLatGrid[-onePixel,] + utmGrid = utmGrid[-onePixel,] + subareas = subareas[-onePixel] + areas = areas[-onePixel] + } + + # add centroids of only the bad subareas + centroidsLonLat = matrix(centroidsLonLat[badSubareas,], ncol=2) + + # convert to east/north + centroidsEastNorth = projKenya(centroidsLonLat[,1], centroidsLonLat[,2]) + + # only add centroid in stratum if bad subareas have any population in the stratum. + # If poppsub not included and resolution not high enough to have multiple points + # in a subarea, treat it as entirely urban or rural + if(is.null(poppsub)) { + hasUrbanPop = rep(TRUE, sum(badSubareas)) + hasRuralPop = rep(FALSE, sum(badSubareas)) + } else { + hasUrbanPop = (poppsub$popUrb > 0)[badSubareas] + hasRuralPop = (poppsub$popRur > 0)[badSubareas] + } + + # add centroids to the matrices of pixellated grid coordinates. + # Add them twice: once for urban, once for rural + lonLatGrid = rbind(lonLatGrid, centroidsLonLat[hasUrbanPop,], centroidsLonLat[hasRuralPop,]) + utmGrid = rbind(utmGrid, centroidsEastNorth[hasUrbanPop,], centroidsEastNorth[hasRuralPop,]) + + # add associated consituencies, areas, provinces to respective vectors + # Normally, we could just caluclate what subarea/area the points are in. + # However, since centroids might not be in the associated subarea, we + # instead just assign the known subareas directly + # newSubareas = getAreaName(rbind(centroidsLonLat[hasUrbanPop,], + # centroidsLonLat[hasRuralPop,]), + # subareaMapDat, subareaNameVar, delta, mean.neighbor)$areaNames + # newAreas = getAreaName(rbind(centroidsLonLat[hasUrbanPop,], + # centroidsLonLat[hasRuralPop,]), + # areaMapDat, areaNameVar, delta, mean.neighbor)$areaNames + newSubareas = c(badSubareaNames[hasUrbanPop], badSubareaNames[hasRuralPop]) + newAreas = subareaMapDat@data[[areaNameVar]][match(newSubareas, subareaMapDat@data[[subareaNameVar]])] + + subareas = c(as.character(subareas), as.character(newSubareas)) + areas = c(as.character(areas), as.character(newAreas)) + } + } else { + badSubareas = FALSE + } + + # get population density at those coordinates + if(!PROJ6) { + # extract the raster values for each chunk of points + interpPopVals <- tryCatch( + { + terra::extract(pop, sp::SpatialPoints(lonLatGrid),method=extractMethod) + }, + error=function(cond) { + message(cond) + stop(paste0("Error extracting raster values. In case of memory limitations, see ", + "terra::terraOptions()")) + # Choose a return value in case of error + return(NA) + } + ) + } else { + sp::proj4string(pop) = sp::CRS(SRS_string="EPSG:4326") + interpPopVals <- tryCatch( + { + terra::extract(pop, sp::SpatialPoints(lonLatGrid, proj4string=sp::CRS(SRS_string="EPSG:4326")), method="bilinear") + }, + error=function(cond) { + message(cond) + stop(paste0("Error extracting raster values. In case of memory limitations, see ", + "terra::aggregate, terra::resample, terra::projectRaster, and terra::terraOptions")) + # Choose a return value in case of error + return(NA) + } + ) + } + + if(setNAsToZero) { + interpPopVals[is.na(interpPopVals)] = 0 + } + + # if requested, fix subareas with entirely zero population density + if(fixZeroPopDensitySubareas && !is.null(poppsub)) { + newPop = data.frame(list(lon=lonLatGrid[,1], lat=lonLatGrid[,2], pop=interpPopVals, area=areas, subarea=subareas, urban=NA)) + poppsubCurrent = poppRegionFromPopMat(newPop, newPop$subarea) + + zeroPopSubareas = poppsubCurrent$region[poppsubCurrent$popTotal == 0] + if(length(zeroPopSubareas) > 0) { + warning(paste0("The following subareas have entirely zero population density ", + "but nonzero total population, and their population will be filled ", + "in uniformly: ", paste(zeroPopSubareas, collapse=", "))) + # fill in population density uniformly in these subareas + for(i in 1:length(zeroPopSubareas)) { + thisSub = zeroPopSubareas[i] + thisSubI = newPop$subarea == thisSub + thisSubPop = poppsubCurrent$popTotal[poppsubCurrent$region == thisSub] + newPop$pop[thisSubI] = thisSubPop/sum(thisSubI) + } + } + + interpPop = newPop$pop + } + + if(any(badSubareas)) { + # make sure population densities in the bad subareas + # are slightly different so one will be classified as urban and + # the other as rural. They will be renormalized later based on poppsub, + # or, if poppsub doesn't exist, only new/custom "urban" points will be kept, + # so no densities are modified here + nUnits = length(interpPopVals) + nNewRural = sum(hasRuralPop) + if(nNewRural >= 1) { + interpPopVals[(nUnits-nNewRural + 1):nUnits] = interpPopVals[(nUnits-nNewRural + 1):nUnits] / 2 + } + } + + # determine which points are urban via population density thresholding + newPop = data.frame(list(lon=lonLatGrid[,1], lat=lonLatGrid[,2], pop=interpPopVals, area=areas, subarea=subareas, urban=NA)) + if(thresholdUrbanBy == "area") { + threshes = SUMMER::setThresholdsByRegion(newPop, poppa, regionType="area") + popThreshes = sapply(1:nrow(newPop), function(i) {threshes$threshes[as.character(threshes$regions) == as.character(newPop$area[i])]}) + urban = newPop$pop >= unlist(popThreshes) + newPop$urban = urban + } else { + tempSubarea = poppsub + tempPop = newPop + allSubareas = sort(unique(tempPop$subarea)) + tempSubarea = tempSubarea[tempSubarea$subarea %in% allSubareas,] + threshes = SUMMER::setThresholdsByRegion(tempPop, poppr = tempSubarea, regionType="subarea") + popThreshes = sapply(1:nrow(newPop), function(i) {threshes$threshes[as.character(threshes$regions) == as.character(newPop$subarea[i])]}) + urban = newPop$pop >= unlist(popThreshes) + newPop$urban = urban + } + + newPop$east = utmGrid[,1] + newPop$north = utmGrid[,2] + + pointAreaNames = as.character(newPop$area) + pointAreaNamesU = as.character(newPop$area) + pointAreaNamesU[!newPop$urban] = "DoNotUseThis" + pointAreaNamesR = as.character(newPop$area) + pointAreaNamesR[newPop$urban] = "DoNotUseThis" + + if(!is.null(subareaMapDat)) { + pointSubareaNames = as.character(newPop$subarea) + pointSubareaNamesU = as.character(newPop$subarea) + pointSubareaNamesU[!newPop$urban] = "DoNotUseThis" + pointSubareaNamesR = as.character(newPop$subarea) + pointSubareaNamesR[newPop$urban] = "DoNotUseThis" + } + + # if necessary, renormalize population values within subareas or areas + # crossed with urban/rural to be the correct value + if(!is.null(poppsub)) { + # In this case, areapsub and areapa are not necessary + + if(stratifyByUrban) { + pointPopUrb = calibrateByRegion(newPop$pop, pointSubareaNamesU, poppsub$subarea, poppsub$popUrb) + pointPopRur = calibrateByRegion(newPop$pop, pointSubareaNamesR, poppsub$subarea, poppsub$popRur) + newPop$pop = (pointPopUrb + pointPopRur) + } else { + newPop$pop = calibrateByRegion(newPop$pop, pointSubareaNames, poppsub$subarea, poppsub$popTotal) + } + } else if(!is.null(poppa)) { + # i.e. we don't have poppsub, so now we need to be careful about the spatial areas + # (e.g. area in km^2) associated with each point + + # determine how to use spatial areas (e.g. in km^2) of areas/subareas: + # Case 1: We don't have subareaMapDat, but do have areapa + # Case 2: We don't have subareaMapDat, and don't have areapa + # Case 3: We have subareaMapDat and areapsub + # Case 4: We have subareaMapDat but don't have areapsub (this is sketchy if + # there's any custom integration points) + + # set the areas of each point to be equal to start out. This must be + # proportional to, but not necessarily equal to, the actual area + # associated with each point + pointAreas = rep(1, nrow(newPop)) + if(is.null(subareaMapDat) && !is.null(areapa)) { + pointAreas = calibrateByRegion(pointAreas, pointAreaNames, areapa$area, areapa$spatialArea) + } else if(is.null(subareaMapDat) && is.null(areapa)) { + # do nothing, since pointAreas = rep(1, nrow(newPop)) is fine + } else if(!is.null(subareaMapDat) && !is.null(areapsub)) { + pointAreas = calibrateByRegion(pointAreas, pointSubareaNames, areapsub$subarea, areapsub$spatialArea) + } else if(!is.null(subareaMapDat) && is.null(areapsub)) { + if(any(badSubareas)) { + stop("poppsub and areapsub are NULL, but subareaMapDat is included") + } + # could continue here, since pointAreas = rep(1, nrow(newPop)) might not be disastrous, + # but it would be wrong + } + + # use the population per stratum to renormalize population densities + if(stratifyByUrban) { + popUrb = calibrateByRegion(newPop$pop*pointAreas, pointAreaNamesU, poppa$area, poppa$popUrb) + popRur = calibrateByRegion(newPop$pop*pointAreas, pointAreaNamesR, poppa$area, poppa$popRur) + + newPop$pop = popUrb + popRur + } else { + newPop$pop = calibrateByRegion(newPop$pop*pointAreas, pointAreaNames, poppa$area, poppa$popTotal) + } + } else { + stop("must either include poppsub, or poppa") + } + + # if requested by user, return the population integration matrix as well + # as the associate population per area and subarea tables + if(returnPoppTables) { + if(!is.null(areaMapDat)) { + areas = getAreaName(cbind(newPop$lon, newPop$lat), shapefile=areaMapDat, areaNameVar=areaNameVar)$areaNames + newpoppa = poppRegionFromPopMat(newPop, areas) + names(newpoppa)[1] = "area" + newpoppa$pctTotal = 100 * newpoppa$popTotal/sum(newpoppa$popTotal) + newpoppa$pctUrb = 100 * newpoppa$popUrb/newpoppa$popTotal + } else { + if(is.null(poppa)) { + warning(paste0("returnPoppTables set to TRUE, but areaMapDat is NULL, ", + "so no area level population table will be calculated")) + } + newpoppa = NULL + } + if(!is.null(subareaMapDat)) { + subareas = getAreaName(cbind(newPop$lon, newPop$lat), shapefile=subareaMapDat, areaNameVar=subareaNameVar)$areaNames + newpoppsub = poppRegionFromPopMat(newPop, subareas) + names(newpoppsub)[1] = "subarea" + + # get area associated with subareas + associatedAreas = sapply(newpoppsub$subarea, function(subareaName) {newPop$area[match(subareaName, newPop$subarea)]}) + newpoppsub$area = associatedAreas + + newpoppsub = newpoppsub[,c("subarea", "area", "popUrb", "popRur", "popTotal")] + + # calculate percent urban and percent of total population + newpoppsub$pctUrb = 100 * newpoppsub$popUrb / newpoppsub$popTotal + newpoppsub$pctTotal = 100 * newpoppsub$popTotal / sum(newpoppsub$popTotal) + } else { + if(is.null(poppsub)) { + warning(paste0("returnPoppTables set to TRUE, but subareaMapDat is NULL, ", + "so no subarea level population table will be calculated")) + } + newpoppsub = NULL + } + + return(list(popMat=newPop, poppa=newpoppa, poppsub=newpoppsub)) + } + + newPop +} + +#' @describeIn makePopIntegrationTab Generate table of estimates of population +#' totals per subarea x urban/rural combination based on population density +#' raster at `kmres` resolution "grid", including custom integration points +#' for any subarea too small to include grid points at their centroids. +#' @export +getPoppsub = function(kmRes=1, pop, domainMapDat, eastLim, northLim, mapProjection, + poppa, areapa=NULL, areapsub, subareaMapDat, subareaNameVar="NAME_2", + stratifyByUrban=TRUE, areaMapDat=NULL, areaNameVar="NAME_1", + areaPolygonSubsetI=NULL, subareaPolygonSubsetI=NULL, + customSubsetPolygons=NULL, + mean.neighbor=50, delta=.1, setNAsToZero=TRUE, fixZeroPopDensitySubareas=FALSE) { + + out = SUMMER::makePopIntegrationTab(kmRes=kmRes, pop=pop, domainMapDat=domainMapDat, + areapa=areapa, areapsub=areapsub, + eastLim=eastLim, northLim=northLim, mapProjection=mapProjection, + subareaMapDat=subareaMapDat, areaMapDat=areaMapDat, + areaNameVar=areaNameVar, subareaNameVar=subareaNameVar, + poppa=poppa, stratifyByUrban=stratifyByUrban, + areaPolygonSubsetI=areaPolygonSubsetI, + subareaPolygonSubsetI=subareaPolygonSubsetI, + customSubsetPolygons=customSubsetPolygons, + mean.neighbor=mean.neighbor, delta=delta, + setNAsToZero=setNAsToZero, + fixZeroPopDensitySubareas=fixZeroPopDensitySubareas, + returnPoppTables=TRUE) + out$poppsub +} + +#' @describeIn makePopIntegrationTab Adjust population densities in grid based on a population frame. +#' @export +adjustPopMat = function(popMat, poppaTarget=NULL, adjustBy=c("area", "subarea"), stratifyByUrban=TRUE) { + adjustBy = match.arg(adjustBy) + + # sort get population per stratum from poppaTarget + if(adjustBy == "area") { + areas=sort(unique(poppaTarget$area)) + } else { + areas=sort(unique(poppaTarget$subarea)) + } + + if(stratifyByUrban) { + targetPopPerStratumUrban = poppaTarget$popUrb + targetPopPerStratumRural = poppaTarget$popRur + + # generate 2 nArea x nPixels matrices for urban and rural strata integrating pixels with respect to population density to get area estimates + getAreaStratumIntegrationMatrix = function(getUrban=TRUE) { + areas = as.character(areas) + + if(adjustBy == "area") { + mat = t(sapply(areas, function(area) { + popMat$area == area + })) + } else { + mat = t(sapply(areas, function(area) { + popMat$subarea == area + })) + } + + mat = sweep(mat, 2, popMat$pop, "*") + + sweep(mat, 2, popMat$urban == getUrban, "*") + } + urbanIntegrationMat = getAreaStratumIntegrationMatrix() + ruralIntegrationMat = getAreaStratumIntegrationMatrix(FALSE) + + # calculate number of people per stratum by integrating the population density surface + urbanPopulations = rowSums(urbanIntegrationMat) + ruralPopulations = rowSums(ruralIntegrationMat) + + # adjust each row of the integration matrices to get the correct expected number of children per stratum + urbanIntegrationMat = sweep(urbanIntegrationMat, 1, targetPopPerStratumUrban / urbanPopulations, "*") + urbanIntegrationMat[urbanPopulations == 0,] = 0 + ruralIntegrationMat = sweep(ruralIntegrationMat, 1, targetPopPerStratumRural / ruralPopulations, "*") + ruralIntegrationMat[ruralPopulations == 0,] = 0 + + # the column sums of the matrices give the correct modified population densities + popMat$pop = colSums(urbanIntegrationMat) + colSums(ruralIntegrationMat) + } else { + targetPopPerArea = poppaTarget$popTotal + + # generate 2 nArea x nPixels matrices for urban and rural strata integrating pixels with respect to population density to get area estimates + getAreaIntegrationMatrix = function() { + areas = as.character(areas) + + if(adjustBy == "area") { + mat = t(sapply(areas, function(area) { + popMat$area == area + })) + } else { + mat = t(sapply(areas, function(area) { + popMat$subarea == area + })) + } + + mat = sweep(mat, 2, popMat$pop, "*") + mat + } + integrationMat = getAreaIntegrationMatrix() + + # calculate number of people per stratum by integrating the population density surface + totalPopulations = rowSums(integrationMat) + + # adjust each row of the integration matrices to get the correct expected number of children per stratum + urbanIntegrationMat = sweep(integrationMat, 1, targetPopPerArea / totalPopulations, "*") + urbanIntegrationMat[totalPopulations == 0,] = 0 + + # the column sums of the matrices give the correct modified population densities + popMat$pop = colSums(integrationMat) + } + + + popMat +} + +#' Calibrate the point level totals so their sum matches the regional totals +#' +#' Calibrate/normalize the point level totals so their sum matches the +#' regional totals. Technically, the totals can be at any level smaller +#' than the region level specified. +#' +#' @param pointTotals Vector of point level totals that will be calibrated/normalized +#' @param pointRegions Vector of regions associated with each point +#' @param regions Vector of region names +#' @param regionTotals Vector of desired region level totals associated with `regions` +#' +#' @return A vector of same length as pointTotals and pointRegions containing +#' the calibrated/normalized point totals that sum to the correct regional totals +#' +#' @author John Paige +#' +#' @examples +#' pointTotals = c(1, 1, 1, 2) +#' pointRegions = c("a", "a", "b", "b") +#' regionTotals = c(10, 20) +#' regions = c("a", "b") +#' calibrateByRegion(pointTotals, pointRegions, regions, regionTotals) +#' +#' @return Vector of updated point level totals, calibrated to match region totals +#' +#' @export +calibrateByRegion = function(pointTotals, pointRegions, regions, regionTotals) { + regions = as.character(regions) + pointRegions = as.character(pointRegions) + + # generate nRegions x nPoints matrix that separates points totals to each region + getRegionIntegrationMatrix = function() { + + mat = t(sapply(regions, function(regionName) {pointRegions == regionName})) + mat = sweep(mat, 2, pointTotals, "*") + mat + } + integrationMat = getRegionIntegrationMatrix() + + # calculate total per region before calibration + tempRegionTotals = rowSums(integrationMat) + + # adjust each row of the integration matrices to get the correct totals per region + integrationMat = sweep(integrationMat, 1, regionTotals / tempRegionTotals, "*") + integrationMat[tempRegionTotals == 0,] = 0 + + # check to make sure all zero population totals are in zero population regions. If + # any extra, spread the region population in the associated pointTotals + badRegions = (tempRegionTotals == 0) & (regionTotals != 0) + if(any(badRegions)) { + badRegionIs = which(badRegions) + for(i in 1:length(badRegionIs)) { + thisRegionI = badRegionIs[i] + thisRegion = pointRegions == regions[thisRegionI] + integrationMat[thisRegionI,thisRegion] = regionTotals[thisRegionI] * (1/sum(thisRegion)) + } + } + + # the column sums of the matrix give the correct calibrated regional totals + colSums(integrationMat) +} + +#' Generate a population frame of a similar format to poppa argument of \code{\link{simPopCustom}} with a custom set of regions +#' +#' @param popMat Pixellated grid data frame with variables `area` and `pop`. Assumed to be stratified by urban/rural +#' @param regions character vector of length nPixels giving a custom set of regions for which to generate +#' a population frame using population density +#' +#' @details Urbanicity thresholds are set based on that region's percent population +#' urban. Intended as a helper function of \code{\link{getPoppsub}}, but +#' can also be used for custom sets of regions (i.e. more than just 2 +#' areal levels: area and subarea). +#' +#' @author John Paige +#' +#' @seealso \code{\link{getPoppsub}} +#' +#' @examples +#' \dontrun{ +#' data(kenyaPopulationData) +#' +#' #' # download Kenya GADM shapefiles from SUMMERdata github repository +#' githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +#' tempDirectory = "~/" +#' mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +#' if(!file.exists(mapsFilename)) { +#' download.file(githubURL,mapsFilename) +#' } +#' +#' # load it in +#' out = load(mapsFilename) +#' out +#' adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' +#' # some Admin-2 areas have the same name +#' adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" +#' +#' # The spatial area of unknown 8 is so small, it causes problems unless +#' # its removed or unioned with another subarea. Union it with neighboring +#' # Kakeguria: +#' newadm2 = adm2 +#' unknown8I = which(newadm2$NAME_2 == "unknown 8") +#' newadm2$NAME_2[newadm2$NAME_2 %in% c("unknown 8", "Kapenguria")] <- "Kapenguria + unknown 8" +#' admin2.IDs <- newadm2$NAME_2 +#' +#' library(maptools) +#' temp <- unionSpatialPolygons(newadm2, admin2.IDs) +#' tempData = newadm2@data[-unknown8I,] +#' tempData = tempData[order(tempData$NAME_2),] +#' newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +#' adm2 = newadm2 +#' +#' # download 2014 Kenya population density and associated TIF file +#' githubURL <- +#' "https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/pop.rda?raw=true" +#' popFilename = paste0(tempDirectory, "/pop.rda") +#' if(!file.exists(popFilename)) { +#' download.file(githubURL,popFilename) +#' } +#' +#' githubURL <- +#' paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/", +#' "worldpop_total_1y_2014_00_00.tif?raw=true") +#' popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' if(!file.exists(popTIFFilename)) { +#' download.file(githubURL,popTIFFilename) +#' } +#' +#' # load it in +#' require(terra) +#' out = load(popFilename) +#' out +#' +#' # make sure this is correct for re-projections +#' pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' +#' eastLim = c(-110.6405, 832.4544) +#' northLim = c(-555.1739, 608.7130) +#' +#' require(fields) +#' +#' # Now generate a general population integration table at 5km resolution, +#' # based on subarea (Admin-2) x urban/rural population totals. This takes +#' # ~1 minute +#' system.time(popMatKenya <- makePopIntegrationTab( +#' kmRes=5, pop=pop, domainMapDat=adm0, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, poppsub=poppsubKenya, +#' areaMapDat = adm1, subareaMapDat = adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' out = poppRegionFromPopMat(popMatKenya, popMatKenya$area) +#' out +#' poppaKenya +#' +#' out = poppRegionFromPopMat(popMatKenya, popMatKenya$subarea) +#' out +#' poppsubKenya +#' +#' popMatKenyaUnstratified = popMatKenya +#' popMatKenyaUnstratified$urban = NULL +#' out = poppRegionFromPopMat(popMatKenyaUnstratified, popMatKenyaUnstratified$area) +#' out +#' poppaKenya +#' } +#' @return A table of population totals by region +#' +#' @export +poppRegionFromPopMat = function(popMat, regions) { + stratifyByUrban = ("urban" %in% names(popMat)) && !all(is.na(popMat$urban)) + + if(stratifyByUrban) { + # in this case, generate urban, rural, and overall totals within each region + out = aggregate(popMat$pop, by=list(region=as.character(regions), urban=popMat$urban), FUN=sum, drop=FALSE) + regions = sort(unique(out$region)) + poppr = data.frame(region=regions, popUrb=out[(length(regions) + 1):(2*length(regions)), 3], + popRur=out[1:length(regions), 3]) + poppr$popUrb[is.na(poppr$popUrb)] = 0 + poppr$popRur[is.na(poppr$popRur)] = 0 + poppr$popTotal = poppr$popUrb + poppr$popRur + } else { + # in this case, only generate overall totals within each region + out = aggregate(popMat$pop, by=list(region=as.character(regions)), FUN=sum, drop=FALSE) + regions = sort(unique(out$region)) + poppr = data.frame(region=regions, popTotal=out[1:length(regions), 2]) + poppr$popTotal[is.na(poppr$popTotal)] = 0 + } + + poppr +} + +#' Set thresholds of population density for urbanicity classifications +#' within each region of the given type +#' +#' @param popMat pixellated population density data frame with variables +#' regionType and `pop` +#' @param poppr A table with population totals by region of the given type +#' (e.g. poppa or poppsub from \code{\link{makePopIntegrationTab}}) +#' @param regionType The variable name from poppr giving the region names. +#' Defaults to "area" +#' +#' @details Thresholds are set based on that region's percent population +#' urban. Intended as a helper function of \code{\link{makePopIntegrationTab}}. +#' +#' @author John Paige +#' +#' @seealso \code{\link{makePopIntegrationTab}} +#' +#' @examples +#' \dontrun{ +#' data(kenyaPopulationData) +#' +#' #' # download Kenya GADM shapefiles from SUMMERdata github repository +#' githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +#' tempDirectory = "~/" +#' mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +#' if(!file.exists(mapsFilename)) { +#' download.file(githubURL,mapsFilename) +#' } +#' +#' # load it in +#' out = load(mapsFilename) +#' out +#' adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' +#' # some Admin-2 areas have the same name +#' adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" +#' +#' # The spatial area of unknown 8 is so small, it causes problems unless +#' # its removed or unioned with another subarea. Union it with neighboring +#' # Kakeguria: +#' newadm2 = adm2 +#' unknown8I = which(newadm2$NAME_2 == "unknown 8") +#' newadm2$NAME_2[newadm2$NAME_2 %in% c("unknown 8", "Kapenguria")] <- "Kapenguria + unknown 8" +#' admin2.IDs <- newadm2$NAME_2 +#' +#' library(maptools) +#' temp <- unionSpatialPolygons(newadm2, admin2.IDs) +#' tempData = newadm2@data[-unknown8I,] +#' tempData = tempData[order(tempData$NAME_2),] +#' newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +#' adm2 = newadm2 +#' +#' # download 2014 Kenya population density and associated TIF file +#' githubURL <- +#' "https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/pop.rda?raw=true" +#' popFilename = paste0(tempDirectory, "/pop.rda") +#' if(!file.exists(popFilename)) { +#' download.file(githubURL,popFilename) +#' } +#' +#' githubURL <- +#' paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/", +#' "worldpop_total_1y_2014_00_00.tif?raw=true") +#' popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' if(!file.exists(popTIFFilename)) { +#' download.file(githubURL,popTIFFilename) +#' } +#' +#' # load it in +#' require(terra) +#' out = load(popFilename) +#' out +#' +#' # make sure this is correct for re-projections +#' pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' +#' eastLim = c(-110.6405, 832.4544) +#' northLim = c(-555.1739, 608.7130) +#' +#' require(fields) +#' +#' # Now generate a general population integration table at 5km resolution, +#' # based on subarea (Admin-2) x urban/rural population totals. This takes +#' # ~1 minute +#' system.time(popMatKenya <- makePopIntegrationTab( +#' kmRes=5, pop=pop, domainMapDat=adm0, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, poppsub=poppsubKenya, +#' areaMapDat = adm1, subareaMapDat = adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' out = setThresholdsByRegion(popMatKenya, poppaKenya) +#' out +#' +#' out = setThresholdsByRegion(popMatKenya, poppsubKenya, regionType="subarea") +#' out +#' } +#' +#' @return A list of region names and their urbanicity thresholds in population density +#' +#' @export +setThresholdsByRegion = function(popMat, poppr, regionType="area") { + + getRegionThresh = function(regionName) { + # do the setup + thisRegion = as.character(popMat[[regionType]]) == regionName + thisPop = popMat$pop[thisRegion] + thisTot = sum(thisPop, na.rm=TRUE) + pctUrb = poppr$pctUrb[poppr[[regionType]] == regionName]/100 + pctRural = 1 - pctUrb + + # must round to avoid numerical inaccuracies + if(round(pctUrb, 10) == 1) { + return(-Inf) + } else if(round(pctUrb, 10) == 0) { + return(Inf) + } + + # calculate threshold by integrating ecdf via sorted value cumulative sum + sortedPop = sort(thisPop) + cumsumPop = cumsum(sortedPop) + threshI = match(1, cumsumPop >= thisTot*pctRural) + if((threshI != 1) && (threshI != length(thisPop))) { + thresh = sortedPop[threshI] + } else { + # make sure not all pixels are urban or all are rural + if(threshI == 1) { + thresh = mean(c(sortedPop[1], sortedPop[2]), na.rm=TRUE) + } else { + thresh = mean(c(sortedPop[length(thisPop)], sortedPop[length(thisPop)-1]), na.rm=TRUE) + } + } + + thresh + } + + # compute threshold for each area + regions = poppr[[regionType]] + threshes = sapply(regions, getRegionThresh) + + list(regions=regions, threshes=threshes) +} + + + diff --git a/R/projKenya.R b/R/projKenya.R new file mode 100755 index 0000000..fe29e1a --- /dev/null +++ b/R/projKenya.R @@ -0,0 +1,80 @@ +#' Map projection for Kenya +#' +#' Projection specifically chosen for Kenya. Project from lat/lon to northing/easting +#' in kilometers. Uses epsg=21097 with km units. May not work on all systems due to +#' differences in the behavior between different PROJ and GDAL versions. +#' +#' @param lon either longitude or, if inverse == TRUE, easting in km +#' @param lat either latitude or, if inverse == TRUE, northing in km +#' @param inverse if FALSE, projects from lon/lat to easting/northing. Else from easting/northing to lon/lat +#' @return A 2 column matrix of easting/northing coordinates in km if inverse == FALSE. Otherwise, a 2 column matrix of longitude/latitude coordinates. +#' @author John Paige +#' @examples +#' eastLim = c(-110.6405, 832.4544) +#' northLim = c(-555.1739, 608.7130) +#' coordMatrixEN = cbind(eastLim, northLim) +#' coordMatrixLL = projKenya(coordMatrixEN, inverse=TRUE) +#' +#' coordMatrixLL +#' # if the coordMatrixLL isn't the following, projKenya may not support +#' # your installation of GDAL and/or PROJ: +#' # east north +#' # [1,] 33.5 -5.0 +#' # [2,] 42.0 5.5 +#' +#' projKenya(coordMatrixLL, inverse=FALSE) +#' # regardless of your PROJ/GDAL installations, the result of the +#' # above line of could should be: +#' # lon lat +#' # [1,] -110.6405 -555.1739 +#' # [2,] 832.4544 608.7130 +#' +#' @importFrom terra gdal +#' @importFrom sp SpatialPoints +#' @importFrom sp CRS +#' @export +projKenya = function(lon, lat=NULL, inverse=FALSE) { + if(is.null(lat)) { + lat = lon[,2] + lon = lon[,1] + } + + # determine version of PROJ + ver = terra::gdal(lib="proj") + PROJ6 <- as.numeric(substr(ver, 1, 1)) >= 6 + + if(!inverse) { + # from lon/lat coords to easting/northing + if(!PROJ6) { + lonLatCoords = sp::SpatialPoints(cbind(lon, lat), proj4string=sp::CRS("+proj=longlat")) + } else { + lonLatCoords = sp::SpatialPoints(cbind(lon, lat), proj4string=sp::CRS(SRS_string="EPSG:4326")) + } + coordsEN = sp::spTransform(lonLatCoords, sp::CRS("+init=epsg:21097 +units=m")) + + out = attr(coordsEN, "coords") + colnames(out) = c("east", "north") + + # convert coordinates from m to km + out = out/1000 + } + else { + # from easting/northing coords to lon/lat + + # first convert from km to m + east = lon*1000 + north = lat*1000 + + coordsEN = sp::SpatialPoints(cbind(east, north), proj4string=sp::CRS("+init=epsg:21097 +units=m")) + if(!PROJ6) { + lonLatCoords = sp::spTransform(coordsEN, sp::CRS("+proj=longlat")) + } else { + lonLatCoords = sp::spTransform(coordsEN, sp::CRS(SRS_string="EPSG:4326")) + } + + out = attr(lonLatCoords, "coords") + colnames(out) = c("lon", "lat") + } + + out +} \ No newline at end of file diff --git a/R/simPop.R b/R/simPop.R new file mode 100755 index 0000000..e6a6ea8 --- /dev/null +++ b/R/simPop.R @@ -0,0 +1,1926 @@ +# Functions for simulating populations + +#' Simulate populations and areal prevalences +#' +#' Given a spatial risk model, simulate populations and population prevalences at the +#' enumeration area level (represented as points), and aggregate to the pixel and +#' administrative areal level. +#' +#' @param nsim Number of simulations +#' @param easpa data.frame of enumeration area, households, and target population per area stratified by urban/rural with variables: +#' \describe{ +#' \item{area}{name of area} +#' \item{EAUrb}{number of urban enumeration areas in the area} +#' \item{EARur}{number of rural enumeration areas in the area} +#' \item{EATotal}{total number of enumeration areas in the area} +#' \item{HHUrb}{number of urban households in the area} +#' \item{HHRur}{number of rural households in the area} +#' \item{HHTotal}{total number of households in the area} +#' \item{popUrb}{total urban (target) population of area} +#' \item{popRur}{total rural (target) population of area} +#' \item{popTotal}{total (general) population of area} +#' } +#' @param popMat Pixellated grid data frame with variables `lon`, `lat`, `pop`, `area`, `subareas` (if subareaLevel is TRUE), `urban` (if stratifyByUrban is TRUE), `east`, and `north` +#' @param targetPopMat Same as popMat, but `pop` variable gives target rather than general population +#' @param poppsub data.frame of population per subarea separated by +#' urban/rural using for population density grid normalization or urbanicity +#' classification. Often based on extra fine scale population density grid. Has variables: +#' @param spdeMesh Triangular mesh for the SPDE +#' @param margVar Marginal variance of the spatial process, excluding cluster effects. +#' If 0, no spatial component is included +#' @param effRange Effective spatial range for the SPDE model +#' @param beta0 Intercept of logit model for risk +#' @param gamma Effect of urban on logit scale for logit model for risk +#' @param sigmaEpsilon Standard deviation on the logit scale for iid Gaussian EA level random effects in the risk model +#' @param seed Random number generator seed +#' @param inla.seed Seed input to inla.qsample. 0L sets seed intelligently, +#' > 0 sets a specific seed, < 0 keeps existing RNG +#' @param nHHSampled Number of households sampled per enumeration area. Default is 25 to match DHS surveys +#' @param stratifyByUrban Whether or not to stratify simulations by urban/rural classification +#' @param subareaLevel Whether or not to aggregate the population by subarea +#' @param doFineScaleRisk Whether or not to calculate the fine scale risk at each aggregation level in addition to the prevalence +#' @param doSmoothRisk Whether or not to calculate the smooth risk at each aggregation level in addition to the prevalence +#' @param doSmoothRiskLogisticApprox Whether to use logistic approximation when calculating smooth risk. See +#' \code{\link{logitNormMean}} for details. +#' @param min1PerSubarea If TRUE, ensures there is at least 1 EA per subarea. If subareas are particularly unlikely to +#' have enumeration areas since they have a very low proportion of the population in an area, then setting this to TRUE may be +#' computationally intensive. +#' @param logitRiskDraws nIntegrationPoints x nsim dimension matrix of draws from the pixel leve risk field on logit scale, leaving out +#' potential nugget/cluster/EA level effects. +#' @param sigmaEpsilonDraws nsim length vector of draws of cluster effect logit scale SD (joint draws with logitRiskDraws) +#' @param validationPixelI CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of pixels for which we want to simulate populations (used for pixel level validation) +#' @param validationClusterI CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of cluster for which we want to simulate populations (used for cluster level validation) +#' @param clustersPerPixel CURRENTLY FOR TESTING PURPOSES ONLY Used for pixel level validation. Fixes the number of EAs per pixel. +#' @param returnEAinfo If TRUE, returns information on every individual EA (BAU) for each simulated population +#' @param epsc nEAs x nsim matrix of simulated EA (BAU) level iid effects representing fine scale variation in +#' risk. If NULL, they are simulated as iid Gaussian on a logit scale with +#' SD given by sigmaEpsilonDraws +#' list(pixelPop=outPixelLevel, subareaPop=outSubareaLevel, areaPop=outAreaLevel, logitRiskDraws=logitRiskDraws) +#' @return The simulated population aggregated to the enumeration area, +#' pixel, subarea (generally Admin2), and area (generally Admin1) levels. Output includes: +#' \item{pixelPop}{A list of pixel level population aggregates} +#' \item{subareaPop}{A list of `subarea` level population aggregates} +#' \item{areaPop}{A list of `area` level population aggregates} +#' Each of these contains population numerator and denominator as well as prevalence and risk +#' information aggregated to the appropriate level. +#' +#' @details For population simulation and aggregation, we consider three models: smooth +#' risk, fine scale risk, and the fine scale prevalence. All will be described in detail +#' in a paper in preparation. In the smooth risk model, pixel level risks are integrated +#' with respect to target population density when producing areal estimates on a prespecified +#' set of integration points. The target population may be, for example, neonatals rather +#' than the general population. In the fine scale models, enumeration areas (EAs) are simulated as +#' point locations and iid random effects in the EA level risk are allowed. EAs and populations are dispersed conditional on the (possibly +#' approximately) known number of EAs, households, and target population at a particular +#' areal level (these we call `areas`) using multilevel multinomial sampling, first +#' sampling the EAs, then distributing households among the EAs, then the target population +#' among the households. Any areal level below the `areas` we call `subareas`. For instance, +#' the `areas` might be Admin-1 if that is the smallest level at which the number of EAs, +#' households, and people is known, and the `subareas` might be Admin-2. The multilevel +#' multinomial sampling may be stratified by urban/rural within the areas if the number of +#' EAs, households, and people is also approximately known at that level. +#' +#' Within each EA we assume a fixed probability of an event occurring, which is the fine scale `risk`. +#' The fine scale `prevalence` is the empirical proportion of events within that EA. We assume EA +#' level logit scale iid N(0, sigmaEpsilon^2) random effects in the risk model. When averaged +#' with equal weights over all EAs in an areal unit, this forms the fine scale risk. When +#' instead the population numerators and denominators are aggregated, and are used to +#' calculate the empirical proportion of events occurring in an areal unit, the resulting +#' quantity is the fine scale prevalence in that areal unit. +#' +#' Note that these functions can be used for either simulating populations for simulation +#' studies, or for generating predictions accounting for uncertainty in EA locations +#' and fine scale variation occuring at the EA level due to EA level iid random effects. +#' Required, however, is a seperately fit EA level spatial risk model +#' and information on the spatial population density and the population frame. +#' +#' @author John Paige +#' @references In preparation +#' @seealso \code{\link{simPopCustom}}, \code{\link{makePopIntegrationTab}}, \code{\link{adjustPopMat}}, \code{\link{simSPDE}}. +#' @examples +#' \dontrun{ +#' ## In this script we will create 5km resolution pixellated grid over Kenya, +#' ## and generate tables of estimated (both target and general) population +#' ## totals at the area (e.g. Admin-1) and subarea (e.g. Admin-2) levels. Then +#' ## we will use that to simulate populations of +#' +#' # download Kenya GADM shapefiles from SUMMERdata github repository +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "kenyaMaps.rda?raw=true") +#' tempDirectory = "~/" +#' mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +#' if(!file.exists(mapsFilename)) { +#' download.file(githubURL,mapsFilename) +#' } +#' +#' # load it in +#' out = load(mapsFilename) +#' out +#' adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +#' adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +#' +#' # some Admin-2 areas have the same name +#' adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & +#' (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +#' adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & +#' (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" +#' +#' # The spatial area of unknown 8 is so small, it causes problems unless its removed or +#' # unioned with another subarea. Union it with neighboring Kakeguria: +#' newadm2 = adm2 +#' unknown8I = which(newadm2$NAME_2 == "unknown 8") +#' newadm2$NAME_2[newadm2$NAME_2 %in% c("unknown 8", "Kapenguria")] <- +#' "Kapenguria + unknown 8" +#' admin2.IDs <- newadm2$NAME_2 +#' +#' library(maptools) +#' temp <- unionSpatialPolygons(newadm2, admin2.IDs) +#' tempData = newadm2@data[-unknown8I,] +#' tempData = tempData[order(tempData$NAME_2),] +#' newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +#' adm2 = newadm2 +#' +#' # download 2014 Kenya population density and associated TIF file +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "Kenya2014Pop/pop.rda?raw=true") +#' popFilename = paste0(tempDirectory, "/pop.rda") +#' if(!file.exists(popFilename)) { +#' download.file(githubURL,popFilename) +#' } +#' +#' githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", +#' "Kenya2014Pop/worldpop_total_1y_2014_00_00.tif?raw=true") +#' popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' if(!file.exists(popTIFFilename)) { +#' download.file(githubURL,popTIFFilename) +#' } +#' +#' # load it in +#' require(raster) +#' out = load(popFilename) +#' out +#' +#' # make sure this is correct for re-projections +#' pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +#' +#' eastLim = c(-110.6405, 832.4544) +#' northLim = c(-555.1739, 608.7130) +#' +#' ## Construct poppsubKenya, a table of urban/rural general population totals +#' ## in each subarea. Technically, this is not necessary since we can load in +#' ## poppsubKenya via data(kenyaPopulationData). First, we will need to calculate +#' ## the areas in km^2 of the areas and subareas +#' +#' library(rgdal) +#' library(sp) +#' +#' # use Lambert equal area projection of areas (Admin-1) and subareas (Admin-2) +#' midLon = mean(adm1@bbox[1,]) +#' midLat = mean(adm1@bbox[2,]) +#' p4s = paste0("+proj=laea +x_0=0 +y_0=0 +lon_0=", midLon, +#' " +lat_0=", midLat, " +units=km") +#' +#' library(rgdal) +#' +#' adm1proj <- spTransform(adm1, CRS(p4s)) +#' adm2proj <- spTransform(adm2, CRS(p4s)) +#' +#' # now calculate spatial area in km^2 +#' library(rgeos) +#' admin1Areas = gArea(adm1proj, TRUE) +#' admin2Areas = gArea(adm2proj, TRUE) +#' areapaKenya = data.frame(area=adm1proj@data$NAME_1, spatialArea=admin1Areas) +#' areapsubKenya = data.frame(area=adm2proj@data$NAME_1, subarea=adm2proj@data$NAME_2, +#' spatialArea=admin2Areas) +#' +#' # Calculate general population totals at the subarea (Admin-2) x urban/rural +#' # level and using 1km resolution population grid. Assign urbanicity by +#' # thresholding population density based on estimated proportion population +#' # urban/rural, making sure total area (Admin-1) urban/rural populations in +#' # each area matches poppaKenya. +#' require(fields) +#' # NOTE: the following function will typically take ~20 minutes. Can speed up +#' # by setting kmRes to be higher, but we recommend fine resolution for +#' # this step, since it only needs to be done once. +#' system.time(poppsubKenya <- getPoppsub( +#' kmRes=1, pop=pop, domainPoly=kenyaPoly, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, areapa=areapaKenya, areapsub=areapsubKenya, +#' areaMapDat=adm1, subareaMapDat=adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' # Now generate a general population integration table at 5km resolution, +#' # based on subarea (Admin-2) x urban/rural population totals. This takes +#' # ~1 minute +#' system.time(popMatKenya <- makePopIntegrationTab( +#' kmRes=5, pop=pop, domainPoly=kenyaPoly, +#' eastLim=eastLim, northLim=northLim, mapProjection=projKenya, +#' poppa = poppaKenya, poppsub=poppsubKenya, +#' areaMapDat = adm1, subareaMapDat = adm2, +#' areaNameVar = "NAME_1", subareaNameVar="NAME_2")) +#' +#' ## Adjust popMat to be target (neonatal) rather than general population +#' ## density. First create the target population frame +#' ## (these numbers are based on IPUMS microcensus data) +#' mothersPerHouseholdUrb = 0.3497151 +#' childrenPerMotherUrb = 1.295917 +#' mothersPerHouseholdRur = 0.4787696 +#' childrenPerMotherRur = 1.455222 +#' targetPopPerStratumUrban = easpaKenya$HHUrb * mothersPerHouseholdUrb * +#' childrenPerMotherUrb +#' targetPopPerStratumRural = easpaKenya$HHRur * mothersPerHouseholdRur * +#' childrenPerMotherRur +#' easpaKenyaNeonatal = easpaKenya +#' easpaKenyaNeonatal$popUrb = targetPopPerStratumUrban +#' easpaKenyaNeonatal$popRur = targetPopPerStratumRural +#' easpaKenyaNeonatal$popTotal = easpaKenyaNeonatal$popUrb + +#' easpaKenyaNeonatal$popRur +#' easpaKenyaNeonatal$pctUrb = 100 * easpaKenyaNeonatal$popUrb / +#' easpaKenyaNeonatal$popTotal +#' easpaKenyaNeonatal$pctTotal = +#' 100 * easpaKenyaNeonatal$popTotal / sum(easpaKenyaNeonatal$popTotal) +#' +#' # Generate the target population density by scaling the current +#' # population density grid at the Admin1 x urban/rural level +#' popMatKenyaNeonatal = adjustPopMat(popMatKenya, easpaKenyaNeonatal) +#' +#' # Generate neonatal population table from the neonatal population integration +#' # matrix. This is technically not necessary for population simulation purposes, +#' # but is here for illustrative purposes +#' poppsubKenyaNeonatal = poppRegionFromPopMat(popMatKenyaNeonatal, +#' popMatKenyaNeonatal$subarea) +#' poppsubKenyaNeonatal = +#' cbind(subarea=poppsubKenyaNeonatal$region, +#' area=adm2@data$NAME_1[match(poppsubKenyaNeonatal$region, adm2@data$NAME_2)], +#' poppsubKenyaNeonatal[,-1]) +#' print(head(poppsubKenyaNeonatal)) +#' +#' ## Now we're ready to simulate neonatal populations along with neonatal +#' ## mortality risks and prevalences +#' +#' # use the following model to simulate the neonatal population based roughly +#' # on Paige et al. (2020) neonatal mortality modeling for Kenya. +#' beta0=-2.9 # intercept +#' gamma=-1 # urban effect +#' rho=(1/3)^2 # spatial variance +#' effRange = 400 # effective spatial range in km +#' sigmaEpsilon=sqrt(1/2.5) # cluster (nugget) effect standard deviation +#' +#' # Run a simulation! This produces multiple dense nEA x nsim and nPixel x nsim +#' # matrices. In the future sparse matrices and chunk by chunk computations +#' # may be incorporated. +#' simPop = simPopSPDE(nsim=1, easpa=easpaKenyaNeonatal, +#' popMat=popMatKenya, targetPopMat=popMatKenyaNeonatal, +#' poppsub=poppsubKenya, spdeMesh=kenyaMesh, +#' margVar=rho, sigmaEpsilon=sigmaEpsilon, +#' gamma=gamma, effRange=effRange, beta0=beta0, +#' seed=12, inla.seed=12, nHHSampled=25, +#' stratifyByUrban=TRUE, subareaLevel=TRUE, +#' doFineScaleRisk=TRUE, doSmoothRisk=TRUE, +#' min1PerSubarea=TRUE) +#' +#' # get average absolute percent error relative to fine scale prevalence at Admin-2 level +#' tempDat = simPop$subareaPop$aggregationResults[,c("region", "pFineScalePrevalence", +#' "pFineScaleRisk", "pSmoothRisk")] +#' 100*mean(abs(tempDat$pFineScalePrevalence - tempDat$pFineScaleRisk) / +#' tempDat$pFineScalePrevalence) +#' 100*mean(abs(tempDat$pFineScalePrevalence - tempDat$pSmoothRisk) / +#' tempDat$pFineScalePrevalence) +#' 100*mean(abs(tempDat$pFineScaleRisk - tempDat$pSmoothRisk) / +#' tempDat$pFineScalePrevalence) +#' +#' # verify number of EAs per area and subarea +#' cbind(aggregate(simPop$eaPop$eaSamples[,1], by=list(area=popMatKenya$area), FUN=sum), +#' trueNumEAs=easpaKenya$EATotal[order(easpaKenya$area)]) +#' aggregate(simPop$eaPop$eaSamples[,1], by=list(area=popMatKenya$subarea), FUN=sum) +#' +#' ## plot simulated population +#' # directory for plotting +#' # (mapPlot takes longer when it doesn't save to a file) +#' tempDirectory = "~/" +#' +#' # pixel level +#' zlim = c(0, quantile(probs=.995, c(simPop$pixelPop$pFineScalePrevalence, +#' simPop$pixelPop$pFineScaleRisk, +#' simPop$pixelPop$pSmoothRisk), na.rm=TRUE)) +#' pdf(file=paste0(tempDirectory, "simPopSPDEPixel.pdf"), width=8, height=8) +#' par(mfrow=c(2,2)) +#' plot(adm2, asp=1) +#' points(simPop$eaPop$eaDatList[[1]]$lon, simPop$eaPop$eaDatList[[1]]$lat, pch=".", col="blue") +#' plot(adm2, asp=1) +#' quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pFineScalePrevalence, +#' zlim=zlim, add=TRUE, FUN=function(x) {mean(x, na.rm=TRUE)}) +#' plot(adm2, asp=1) +#' quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pFineScaleRisk, +#' zlim=zlim, add=TRUE, FUN=function(x) {mean(x, na.rm=TRUE)}) +#' quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pSmoothRisk, +#' zlim=zlim, FUN=function(x) {mean(x, na.rm=TRUE)}, asp=1) +#' plot(adm2, add=TRUE) +#' dev.off() +#' +#' range(simPop$eaPop$eaDatList[[1]]$N) +#' +#' # areal (Admin-1) level: these results should look essentially identical +#' +#' tempDat = simPop$areaPop$aggregationResults[,c("region", "pFineScalePrevalence", +#' "pFineScaleRisk", "pSmoothRisk")] +#' pdf(file=paste0(tempDirectory, "simPopSPDEAdmin-1.pdf"), width=7, height=7) +#' mapPlot(tempDat, +#' variables=c("pFineScalePrevalence", "pFineScaleRisk", "pSmoothRisk"), +#' geo=adm1, by.geo="NAME_1", by.data="region", is.long=FALSE) +#' dev.off() +#' +#' # subareal (Admin-2) level: these results should look subtley different +#' # depending on the type of prevalence/risk considered +#' tempDat = simPop$subareaPop$aggregationResults[,c("region", "pFineScalePrevalence", +#' "pFineScaleRisk", "pSmoothRisk")] +#' pdf(file=paste0(tempDirectory, "simPopSPDEAdmin-2.pdf"), width=7, height=7) +#' mapPlot(tempDat, +#' variables=c("pFineScalePrevalence", "pFineScaleRisk", "pSmoothRisk"), +#' geo=adm2, by.geo="NAME_2", by.data="region", is.long=FALSE) +#' dev.off() +#' } +#' @name simPop +NULL +#' @importFrom stats cor +#' @describeIn simPop +#' Simulate populations and population prevalences given census frame and population density +#' information. Uses SPDE model for generating spatial risk and can include iid cluster +#' level effect. +#' +#' @export +simPopSPDE = function(nsim=1, easpa, popMat, targetPopMat, poppsub, spdeMesh, + margVar=0.243, sigmaEpsilon=sqrt(0.463), + gamma=0.009, effRange=406.51, beta0=-3.922, + seed=NULL, inla.seed=-1L, nHHSampled=25, + stratifyByUrban=TRUE, subareaLevel=TRUE, + doFineScaleRisk=FALSE, doSmoothRisk=FALSE, + doSmoothRiskLogisticApprox=FALSE, + min1PerSubarea=TRUE) { + if(!is.null(seed)) { + set.seed(seed) + + if(inla.seed < 0) { + stop("seed specified, but not inla.seed. Set inla.seed to a positive integer to ensure reproducibility") + } + } + + if(nsim > 1) { + warning("nsim > 1. eaDat will only be generated for first simulation") + } + + totalEAs = sum(easpa$EATotal) + totalHouseholds = sum(easpa$HHTotal) + + ### generate Binomial probabilities from transformed logit scale GP + # generate SPDE simulations + pixelCoords = cbind(popMat$east, popMat$north) + + if(margVar != 0) { + SPDEArgs = list(coords=pixelCoords, nsim=nsim, margVar=margVar, effRange=effRange, + mesh=spdeMesh, inla.seed=inla.seed) + simVals = do.call("simSPDE", SPDEArgs) + } else { + simVals = matrix(rep(0, nrow(pixelCoords)), ncol=1) + } + + # add in intercept + simVals = simVals + beta0 + + # add in urban effect + simVals = sweep(simVals, 1, gamma*popMat$urban, "+") + + # simulate nugget/cluster effect + epsc = matrix(stats::rnorm(totalEAs*nsim, sd=sigmaEpsilon), ncol=nsim) + + # transform back to original scale for the pixel level probabilities + probsNoNug = expit(simVals) + + # simulate the enumeration areas + logitRiskDraws = simVals + sigmaEpsilonDraws = rep(sigmaEpsilon, nsim) + + print("Using SPDE model to simulate EA and pixel level populations") + outPixelLevel = simPopCustom(logitRiskDraws=logitRiskDraws, sigmaEpsilonDraws=sigmaEpsilonDraws, easpa=easpa, + popMat=popMat, targetPopMat=targetPopMat, + stratifyByUrban=stratifyByUrban, doSmoothRisk=doSmoothRisk, + doSmoothRiskLogisticApprox=doSmoothRiskLogisticApprox, + doFineScaleRisk=doFineScaleRisk, poppsub=poppsub, + subareaLevel=subareaLevel, min1PerSubarea=min1PerSubarea, + returnEAinfo=TRUE, epsc=epsc) + eaPop = list(eaDatList=outPixelLevel$eaDatList, eaSamples=outPixelLevel$eaSamples) + outPixelLevel$eaDatList = NULL + outPixelLevel$eaSamples = NULL + + if(subareaLevel) { + print("aggregating from pixel level to subarea level") + outSubareaLevel = pixelPopToArea(pixelLevelPop=outPixelLevel, eaSamples=eaPop$eaSamples, + areas=popMat$subarea, stratifyByUrban=stratifyByUrban, + targetPopMat=targetPopMat, doFineScaleRisk=doFineScaleRisk, + doSmoothRisk=doSmoothRisk) + + print("aggregating from subarea level to area level") + + # get areas associated with each subarea for aggregation + tempAreasFrom = popMat$subarea + tempAreasTo = popMat$area + areasFrom = sort(unique(tempAreasFrom)) + areasToI = match(areasFrom, tempAreasFrom) + areasTo = tempAreasTo[areasToI] + + # do the aggregation from subareas to areas + outAreaLevel = areaPopToArea(areaLevelPop=outSubareaLevel, areasFrom=areasFrom, areasTo=areasTo, + stratifyByUrban=stratifyByUrban, doFineScaleRisk=doFineScaleRisk, + doSmoothRisk=doSmoothRisk) + } else { + outSubareaLevel = NULL + + print("aggregating from pixel level to area level") + outAreaLevel = pixelPopToArea(pixelLevelPop=outPixelLevel, eaSamples=eaPop$eaSamples, + areas=popMat$area, stratifyByUrban=stratifyByUrban, + doFineScaleRisk=doFineScaleRisk, doSmoothRisk=doSmoothRisk) + } + + list(eaPop=eaPop, pixelPop=outPixelLevel, subareaPop=outSubareaLevel, areaPop=outAreaLevel, logitRiskDraws=logitRiskDraws) +} + +#' Aggregate populations to the specified areal level +#' +#' Takes simulated populations and aggregates +#' them to the specified areal level. Also calculates the aggregated risk and prevalence. +#' +#' @param pixelLevelPop pixel level population information that we want aggregate. In the same format as output from \code{\link{simPopCustom}} +#' @param eaSamples nIntegrationPoint x nsim matrix of the number of enumeration areas per pixel sampled in the input pixel level population +#' @param areas character vector of length nIntegrationPoints of area names over which we +#' want to aggregate. Can also be subareas +#' @param stratifyByUrban whether or not to stratify simulations by urban/rural classification +#' @param targetPopMat pixellated grid data frame with variables `lon`, `lat`, `pop` (target population), `area`, `subareas` (if subareaLevel is TRUE), `urban` (if stratifyByUrban is TRUE), `east`, and `north` +#' @param doFineScaleRisk whether or not to calculate the fine scale risk in addition to the prevalence. See details +#' @param doSmoothRisk Whether or not to calculate the smooth risk in addition to the prevalence. See details +#' @param areaLevelPop output of \code{\link{simPopCustom}} containing pixel level information +#' about the population of interest +#' @param areasFrom character vector of length equal to the number of areas from which +#' we would like to aggregate containing the unique names of the areas. +#' Can also be subareas, but these are smaller than the "to areas", and +#' each "from area" must be entirely contained in a single "to area" +#' @param areasTo character vector of length equal to the number of areas from which +#' we would like to aggregate containing the names of the areas containing +#' with each respective `from' area. Can also be a set of subareas, +#' but these are larger than the "from areas". +#' +#' @author John Paige +#' @references In Preparation +#' @return A list containing elements `fineScalePrevalence` and `fineScaleRisk`. Each +#' of these are in turn lists with aggregated prevalence and risk for the area of +#' interest, containg the following elements, were paranethesis indicate the elements +#' for the fineScaleRisk model rather than fineScalePrevalence: +#' \item{p}{Aggregated prevalence (risk), calculated as aggregate of Z divided by +#' aggregate of N} +#' \item{Z}{Aggregated (expected) population numerator} +#' \item{N}{Aggregated (expected) population denominator} +#' \item{pUrban}{Aggregated prevalence (risk) in urban part of the area, calculated +#' as aggregate of Z divided by aggregate of N} +#' \item{ZUrban}{Aggregated (expected) population numerator in urban part of the area} +#' \item{NUrban}{Aggregated (expected) population denominator in urban part of the area} +#' \item{pRural}{Aggregated prevalence (risk) in rural part of the area, calculated +#' as aggregate of Z divided by aggregate of N} +#' \item{ZRural}{Aggregated (expected) population numerator in rural part of the area} +#' \item{NRural}{Aggregated (expected) population denominator in rural part of the area} +#' \item{A}{Aggregation matrix used to aggregate from pixel level to areal level} +#' \item{AUrban}{Aggregation matrix used to aggregate from pixel level to urban part of the areal level} +#' \item{ARural}{Aggregation matrix used to aggregate from pixel level to rural part of the areal level} +#' @seealso \code{\link{areaPopToArea}} +#' @name aggPop +#' @examples +#' \dontrun{ +#' ##### Now we make a model for the risk. We will use an SPDE model with these +#' ##### parameters for the linear predictor on the logist scale, which are chosen +#' ##### to be of practical interest: +#' beta0=-2.9 # intercept +#' gamma=-1 # urban effect +#' rho=(1/3)^2 # spatial variance +#' effRange = 400 # effective spatial range in km +#' sigmaEpsilon=sqrt(1/2.5) # cluster (nugget) effect standard deviation +#' +#' # simulate the population! Note that this produces multiple dense +#' # nEA x nsim and nIntegrationPoint x nsim matrices. In the future +#' # sparse matrices will and chunk by chunk computations may be incorporated. +#' simPop = simPopSPDE(nsim=1, easpa=easpaKenyaNeonatal, +#' popMat=popMatKenya, targetPopMat=popMatKenyaNeonatal, +#' poppsub=poppsubKenya, spdeMesh=kenyaMesh, +#' margVar=rho, sigmaEpsilonSq=sigmaEpsilon^2, +#' gamma=gamma, effRange=effRange, beta0=beta0, +#' seed=123, inla.seed=12, nHHSampled=25, +#' stratifyByUrban=TRUE, subareaLevel=TRUE, +#' doFineScaleRisk=TRUE, +#' min1PerSubarea=TRUE) +#' +#' pixelPop = simPop$pixelPop +#' subareaPop = pixelPopToArea(pixelLevelPop=pixelPop, eaSamples=pixelPop$eaSamples, +#' areas=popMatKenya$subarea, stratifyByUrban=TRUE, +#' targetPopMat=popMatKenyaNeonatal, doFineScaleRisk=TRUE) +#' +#' # get areas associated with each subarea for aggregation +#' tempAreasFrom = popMatKenya$subarea +#' tempAreasTo = popMatKenya$area +#' areasFrom = sort(unique(tempAreasFrom)) +#' areasToI = match(areasFrom, tempAreasFrom) +#' areasTo = tempAreasTo[areasToI] +#' +#' # do the aggregation from subareas to areas +#' outAreaLevel = areaPopToArea(areaLevelPop=subareaPop, +#' areasFrom=areasFrom, areasTo=areasTo, +#' stratifyByUrban=TRUE, doFineScaleRisk=TRUE) +#' } +NULL + +#' @describeIn aggPop Aggregate from pixel to areal level +#' @export +pixelPopToArea = function(pixelLevelPop, eaSamples, areas, stratifyByUrban=TRUE, targetPopMat=NULL, + doFineScaleRisk=!is.null(pixelLevelPop$fineScaleRisk$p), + doSmoothRisk=!is.null(pixelLevelPop$smoothRisk$p)) { + + # fine scale prevalence aggregation model + nSamples = pixelLevelPop$NFineScalePrevalence + zSamples = pixelLevelPop$ZFineScalePrevalence + zSamples[is.na(zSamples)] = 0 # must set to zero temporarily so matrix multiplication works + out = aggPixelPreds(Zg=zSamples, Ng=nSamples, areas=areas, targetPopMat=targetPopMat, + useDensity=FALSE, stratifyByUrban=stratifyByUrban, normalize=FALSE) + aggregationResults = out$aggregationResults + aggregationMatrices = out$aggregationMatrices + names(aggregationResults)[-1] = paste(names(aggregationResults)[-1], "FineScalePrevalence", sep="") + names(aggregationMatrices) = paste(names(aggregationMatrices), "FineScalePrevalence", sep="") + + # fine scale risk aggregation model + if(doFineScaleRisk) { + nSamplesFineScaleRisk = pixelLevelPop$NFineScaleRisk + zSamplesFineScaleRisk = pixelLevelPop$ZFineScaleRisk + zSamplesFineScaleRisk[is.na(zSamplesFineScaleRisk)] = 0 # must set to zero temporarily so matrix multiplication works out + out = aggPixelPreds(Zg=zSamplesFineScaleRisk, Ng=nSamplesFineScaleRisk, areas=areas, targetPopMat=targetPopMat, + useDensity=FALSE, stratifyByUrban=stratifyByUrban, normalize=FALSE) + aggregationResultsFineScaleRisk = out$aggregationResults + aggregationMatricesFineScaleRisk = out$aggregationMatrices + names(aggregationResultsFineScaleRisk)[-1] = paste(names(aggregationResultsFineScaleRisk)[-1], "FineScaleRisk", sep="") + names(aggregationMatricesFineScaleRisk) = paste(names(aggregationMatricesFineScaleRisk), "FineScaleRisk", sep="") + + # aggregationResults = merge(aggregationResults, aggregationResultsFineScaleRisk, by="region") + aggregationResults = c(aggregationResults, aggregationResultsFineScaleRisk[-1]) + aggregationMatrices = c(aggregationMatrices, aggregationMatricesFineScaleRisk) + } + + if(doSmoothRisk) { + # NOTE: although useDensity is set to FALSE, that's only because the density is already + # directly incorporated into nSamplesSmoothRisk + nSamplesSmoothRisk = pixelLevelPop$NSmoothRisk + zSamplesSmoothRisk = pixelLevelPop$ZSmoothRisk + zSamplesSmoothRisk[is.na(zSamplesSmoothRisk)] = 0 # must set to zero temporarily so matrix multiplication works out + out = aggPixelPreds(Zg=zSamplesSmoothRisk, Ng=nSamplesSmoothRisk, areas=areas, targetPopMat=targetPopMat, + useDensity=FALSE, stratifyByUrban=stratifyByUrban, normalize=FALSE) + aggregationResultsSmoothRisk = out$aggregationResults + aggregationMatricesSmoothRisk = out$aggregationMatrices + names(aggregationResultsSmoothRisk)[-1] = paste(names(aggregationResultsSmoothRisk)[-1], "SmoothRisk", sep="") + names(aggregationMatricesSmoothRisk) = paste(names(aggregationMatricesSmoothRisk), "SmoothRisk", sep="") + + # aggregationResults = merge(aggregationResults, aggregationResultsSmoothRisk, by="region") + aggregationResults = c(aggregationResults, aggregationResultsSmoothRisk[-1]) + aggregationMatrices = c(aggregationMatrices, aggregationMatricesSmoothRisk) + } + + list(aggregationResults=aggregationResults, aggregationMatrices=aggregationMatrices) +} + +#' Helper function of \code{\link{pixelPopToArea}} +#' +#' Aggregates population from the +#' pixel level to the level of the area of interest. +#' +#' @param Zg nIntegrationPoint x nsim matrix of simulated response (population numerators) for each pixel and sample +#' @param Ng nIntegrationPoint x nsim matrix of simulated counts (population denominators) for each pixel and sample +#' @param areas nIntegrationPoint length character vector of areas (or subareas) +#' @param urban nIntegrationPoint length vector of indicators specifying whether or not pixels are urban or rural +#' @param targetPopMat same as in \code{\link{simPopCustom}} +#' @param useDensity whether to use population density as aggregation weights. +#' @param stratifyByUrban whether or not to stratify simulations by urban/rural classification +#' @param normalize if TRUE, pixel level aggregation weights within specified area are normalized to sum to 1. This produces an +#' average of the values in Zg rather than a sum. In general, should only be set to TRUE for smooth integrals of risk. +aggPixelPreds = function(Zg, Ng, areas, urban=targetPopMat$urban, targetPopMat=NULL, useDensity=FALSE, + stratifyByUrban=TRUE, normalize=useDensity) { + + if(useDensity && !normalize) { + stop("if useDensity is set to TRUE, normalize must be set to TRUE as well") + } + predsUrban = urban + predsArea = areas + + # set NAs and pixels without any sample size to 0 + Ng[is.na(Ng)] = 0 + if(!useDensity) { + # is useDensity is true, then Zg is really a set of probabilities, so no need to set to 0 + Zg[Ng == 0] = 0 + } + + # function to aggregate predictions over the + # population density grid. Use the following function to get numerical + # integration matrix for a given level of areal aggregation. returned + # matrices have dimension length(unique(areaNames)) x length(areaNames) + # areaNames: + # urbanProportions: DEPRACATED vector giving proportion of population urban for each unique area in areaNames. + # If specified, ensure that urban and rural parts of the full integration + # matrix have the appropriate relative weights for each area. Used for population + # density based integration + # normalize: whether or not to normalize the rows of the matrices to sum to 1 or to instead + # contain only binary values (or non-binary values based on the binary values if + # urbanProportions is not NULL) + getIntegrationMatrix = function(areaNames, urbanProportions=NULL, normalize=FALSE) { + + if(useDensity) { + popDensities = targetPopMat$pop + densities = popDensities + } else { + equalDensities = rep(1, nrow(Zg)) + densities = equalDensities + } + + uniqueNames = sort(unique(areaNames)) + getMatrixHelper = function(i, thisUrban=NULL, thisUseDensity=useDensity, thisNormalize=normalize) { + areaI = areaNames == uniqueNames[i] + + if(thisUseDensity) { + theseDensities = popDensities + } else { + theseDensities = equalDensities + } + + # make sure we only include pixels in the given area and, if necessary, with the given urbanicity + theseDensities[!areaI] = 0 + if(!is.null(thisUrban)) + theseDensities[predsUrban != thisUrban] = 0 + thisSum = sum(theseDensities) + if(thisSum != 0 && thisNormalize) + theseDensities * (1/thisSum) + else if(thisSum == 0) + rep(0, length(theseDensities)) + else + theseDensities + } + + if(!stratifyByUrban) { + integrationMatrix = t(matrix(sapply(1:length(uniqueNames), getMatrixHelper), ncol=length(uniqueNames))) + + integrationMatrix + } else { + integrationMatrixUrban = t(matrix(sapply(1:length(uniqueNames), getMatrixHelper, thisUrban=TRUE), ncol=length(uniqueNames))) + integrationMatrixRural = t(matrix(sapply(1:length(uniqueNames), getMatrixHelper, thisUrban=FALSE), ncol=length(uniqueNames))) + if(!is.null(urbanProportions)) { + integrationMatrix = sweep(integrationMatrixUrban, 1, urbanProportions, "*") + sweep(integrationMatrixRural, 1, 1-urbanProportions, "*") + } else { + integrationMatrix = t(matrix(sapply(1:length(uniqueNames), getMatrixHelper), ncol=length(uniqueNames))) + } + + rownames(integrationMatrix) = uniqueNames + rownames(integrationMatrixUrban) = uniqueNames + rownames(integrationMatrixRural) = uniqueNames + + list(integrationMatrix=integrationMatrix, + integrationMatrixUrban=integrationMatrixUrban, + integrationMatrixRural=integrationMatrixRural) + } + } + + # Use the following function to perform the + # aggregations + getIntegratedPredictions = function(areaNames) { + # get numerical integration matrix + A = getIntegrationMatrix(areaNames, normalize=normalize) + + # aggregate the prediction and denominator matrices (for whole areas and also urban/rural strata if necessary) + if(!stratifyByUrban) { + ZAggregated = A %*% Zg + NAggregated = A %*% Ng + pAggregated = ZAggregated / NAggregated + pAggregated[NAggregated == 0] = NA + + aggregationResults = list(p=pAggregated, Z=ZAggregated, N=NAggregated) + aggregationMatrices = list(A=A, AUrban=NULL, ARural=NULL) + } else { + AUrban = A$integrationMatrixUrban + ARural = A$integrationMatrixRural + A = A$integrationMatrix + + # first aggregate the numerator. The denominator will depend on the aggregation method + ZAggregated = A %*% Zg + ZAggregatedUrban = AUrban %*% Zg + ZAggregatedRural = ARural %*% Zg + + if(useDensity) { + # for population density aggregation, we integrate probabilities rather than aggregate + # empirical proportions + NAggregated = NULL + pAggregated = ZAggregated + ZAggregated = NULL + + NAggregatedUrban = NULL + pAggregatedUrban = ZAggregatedUrban + ZAggregatedUrban = NULL + + NAggregatedRural = NULL + pAggregatedRural = ZAggregatedRural + ZAggregatedRural = NULL + } else { + # if we do not use density, we must also aggregate the denominator to calculate + # the aggregated empirical proportions + NAggregated = A %*% Ng + pAggregated = ZAggregated / NAggregated + pAggregated[NAggregated == 0] = NA + + NAggregatedUrban = AUrban %*% Ng + pAggregatedUrban = ZAggregatedUrban / NAggregatedUrban + pAggregatedUrban[NAggregatedUrban == 0] = NA + + NAggregatedRural = ARural %*% Ng + pAggregatedRural = ZAggregatedRural / NAggregatedRural + pAggregatedRural[NAggregatedRural == 0] = NA + } + + aggregationResults = list(region=sort(unique(areas)), p=pAggregated, Z=ZAggregated, N=NAggregated, + pUrban=pAggregatedUrban, ZUrban=ZAggregatedUrban, NUrban=NAggregatedUrban, + pRural=pAggregatedRural, ZRural=ZAggregatedRural, NRural=NAggregatedRural) + aggregationMatrices = list(A=A, AUrban=AUrban, ARural=ARural) + } + + list(aggregationResults=aggregationResults, aggregationMatrices=aggregationMatrices) + } + + areaResults = getIntegratedPredictions(predsArea) + + # return results + areaResults +} + +#' @describeIn aggPop Aggregate areal populations to another areal level +#' @export +areaPopToArea = function(areaLevelPop, areasFrom, areasTo, + stratifyByUrban=TRUE, + doFineScaleRisk=!is.null(areaLevelPop$aggregationResults$pFineScaleRisk), + doSmoothRisk=!is.null(areaLevelPop$aggregationResults$pSmoothRisk)) { + + if(length(areasFrom) != length(unique(areasFrom))) { + stop("areasFrom must contain only unique names of areas to which we want to aggregate") + } + + uniqueNames = sort(unique(areasTo)) + + # construct row of the aggregation matrix given toArea index + getMatrixHelper = function(i) { + # get areasTo associated with this fromArea + thisToArea = uniqueNames[i] + thisFromAreas = unique(areasFrom[areasTo == thisToArea]) + areaI = areasFrom %in% thisFromAreas + + areaI + } + + # construct the aggregation matrix from areasFrom to areasTo + A = t(matrix(sapply(1:length(uniqueNames), getMatrixHelper), ncol=length(uniqueNames))) + rownames(A) = uniqueNames + colnames(A) = areasFrom + + ##### aggregate populations + # fine scale prevalence aggregation model + getaggregationResults = function(resultNameRoot="FineScalePrevalence") { + if(! (paste("N", resultNameRoot, sep="") %in% names(areaLevelPop$aggregationResults))) { + stop(paste0(resultNameRoot, " was not computed in input areaLevelPop")) + } + + nSamples = areaLevelPop$aggregationResults[[paste("N", resultNameRoot, sep="")]] + zSamples = areaLevelPop$aggregationResults[[paste("Z", resultNameRoot, sep="")]] + zSamples[is.na(zSamples)] = 0 # must set to zero temporarily so matrix multiplication works out + + ZAggregated = A %*% zSamples + NAggregated = A %*% nSamples + pAggregated = ZAggregated / NAggregated + pAggregated[NAggregated == 0] = NA + thisA=A %*% areaLevelPop$aggregationMatrices[[paste("A", resultNameRoot, sep="")]] + rownames(thisA) = uniqueNames + + if(stratifyByUrban) { + nSamplesUrban = areaLevelPop$aggregationResults[[paste("NUrban", resultNameRoot, sep="")]] + zSamplesUrban = areaLevelPop$aggregationResults[[paste("ZUrban", resultNameRoot, sep="")]] + zSamplesUrban[is.na(zSamplesUrban)] = 0 # must set to zero temporarily so matrix multiplication works out + + nSamplesRural = areaLevelPop$aggregationResults[[paste("NRural", resultNameRoot, sep="")]] + zSamplesRural = areaLevelPop$aggregationResults[[paste("ZRural", resultNameRoot, sep="")]] + zSamplesRural[is.na(zSamplesRural)] = 0 # must set to zero temporarily so matrix multiplication works out + + ZAggregatedUrban = A %*% zSamplesUrban + NAggregatedUrban = A %*% nSamplesUrban + pAggregatedUrban = ZAggregatedUrban / NAggregatedUrban + pAggregatedUrban[NAggregatedUrban == 0] = NA + thisAUrban=A %*% areaLevelPop$aggregationMatrices[[paste("AUrban", resultNameRoot, sep="")]] + rownames(thisAUrban) = uniqueNames + + ZAggregatedRural = A %*% zSamplesRural + NAggregatedRural = A %*% nSamplesRural + pAggregatedRural = ZAggregatedRural / NAggregatedRural + pAggregatedRural[NAggregatedRural == 0] = NA + thisARural=A %*% areaLevelPop$aggregationMatrices[[paste("ARural", resultNameRoot, sep="")]] + rownames(thisARural) = uniqueNames + } else { + ZAggregatedUrban = NULL + NAggregatedUrban = NULL + pAggregatedUrban = NULL + thisAUrban=NULL + + ZAggregatedRural = NULL + NAggregatedRural = NULL + pAggregatedRural = NULL + thisARural=NULL + } + + aggregationResults = list(region=uniqueNames, p=pAggregated, Z=ZAggregated, N=NAggregated, + pUrban=pAggregatedUrban, ZUrban=ZAggregatedUrban, NUrban=NAggregatedUrban, + pRural=pAggregatedRural, ZRural=ZAggregatedRural, NRural=NAggregatedRural) + aggregationMatrices = list(A=thisA, AUrban=thisAUrban, ARural=thisARural) + + capitalResultNameRoot = resultNameRoot + # capitalResultNameRoot = paste(toupper(substr(capitalResultNameRoot, 1, 1)), substr(capitalResultNameRoot, 2, nchar(capitalResultNameRoot)), sep="") + names(aggregationResults)[-1] = paste(names(aggregationResults)[-1], capitalResultNameRoot, sep="") + names(aggregationMatrices) = paste(names(aggregationMatrices)[-1], capitalResultNameRoot, sep="") + + list(aggregationResults=aggregationResults, aggregationMatrices=aggregationMatrices) + } + + # fine scale prevalence model + out = getaggregationResults("FineScalePrevalence") + aggregationResults = out$aggregationResults + aggregationMatrices = out$aggregationMatrices + + # fine scale risk aggregation model + if(doFineScaleRisk) { + out = getaggregationResults("FineScaleRisk") + resFineScaleRisk = out$aggregationResults + resAggregationMatrices = out$aggregationMatrices + + # aggregationResults = merge(aggregationResults, resFineScaleRisk, by="region") + aggregationResults = c(aggregationResults, resFineScaleRisk) + aggregationMatrices = c(aggregationMatrices, resAggregationMatrices) + } + + if(doSmoothRisk) { + out = getaggregationResults("SmoothRisk") + resSmoothRisk = out$aggregationResults + resAggregationMatrices = out$aggregationMatrices + + # aggregationResults = merge(aggregationResults, resSmoothRisk, by="region") + aggregationResults = c(aggregationResults, resSmoothRisk) + aggregationMatrices = c(aggregationMatrices, resAggregationMatrices) + } + + list(aggregationResults=aggregationResults, aggregationMatrices=aggregationMatrices) +} + +#' @describeIn simPop +#' Simulate populations and population prevalences given census frame and population density +#' information. Uses custom spatial logit risk function and can include iid cluster +#' level effect. +#' +#' @export +simPopCustom = function(logitRiskDraws, sigmaEpsilonDraws, easpa, popMat, targetPopMat, + stratifyByUrban=TRUE, validationPixelI=NULL, validationClusterI=NULL, + clustersPerPixel=NULL, + doFineScaleRisk=FALSE, doSmoothRisk=FALSE, + doSmoothRiskLogisticApprox=FALSE, + poppsub=NULL, subareaLevel=FALSE, + min1PerSubarea=TRUE, + returnEAinfo=FALSE, epsc=NULL) { + + if(is.null(poppsub) && subareaLevel) { + stop("if subareaLevel is TRUE, user must specify poppsub") + } + + if(!is.null(validationPixelI) || !is.null(validationClusterI) || !is.null(clustersPerPixel)) { + stop("validationPixelI, validationClusterI, and clustersPerPixel not yet fully implemented") + } + + nDraws = ncol(logitRiskDraws) + + # set default inputs + totalEAs = sum(easpa$EATotal) + if(!is.null(clustersPerPixel)) { + emptyPixels = clustersPerPixel == 0 + if(totalEAs != sum(clustersPerPixel)) + stop("sum(easpa$EATotal) != sum(clustersPerPixel)") + } + + # get area names + areas = sort(unique(popMat$area)) + if(any(areas != easpa$area)) + stop("area names and easpa do not match popMat or are not in the correct order") + + # determine if we care about subareas (smallest areas we care about. No info of EAs per subarea) + subareaLevel = !is.null(popMat$subarea) + + ##### Line 1 (of the algorithm): take draws from the binomial process for each stratum (each row of easpa) + # get probabilities for each pixel (or at least something proportional within each stratum) + print("drawing EAs") + pixelProbs = popMat$pop + + # take draws from the stratified binomial process for each posterior sample + if(is.null(clustersPerPixel)) { + if(subareaLevel) { + eaSamples = rStratifiedMultnomialBySubarea(nDraws, popMat, easpa, stratifyByUrban, poppsub=poppsub, + min1PerSubarea=min1PerSubarea) + } else { + eaSamples = rStratifiedMultnomial(nDraws, popMat, easpa, stratifyByUrban) + } + } + + if(!is.null(clustersPerPixel) && !exists("eaSamples")) { + eaSamples = matrix(rep(clustersPerPixel, nDraws), ncol=nDraws) + } + + # make matrix (or list) of pixel indices mapping matrices of EA values to matrices of pixel values + if(!is.null(clustersPerPixel)) { + pixelIndices = rep(1:nrow(popMat), times=clustersPerPixel) # this contains repetitions and has length == nEAs + uniquePixelIndices = sort(unique(pixelIndices)) + } else { + pixelIndexMat = matrix(rep(rep(1:nrow(popMat), nDraws), times=eaSamples), ncol=nDraws) + } + + # determine which EAs are urban if necessary + if(stratifyByUrban) { + # urbanMat = matrix(rep(rep(popMat$urban, nDraws), times=c(eaSamples)), ncol=nDraws) + if(!is.null(clustersPerPixel)) { + urbanVals = popMat$urban[pixelIndices] + uniqueUrbanVals = popMat$urban[uniquePixelIndices] + } else { + # urbanMat = matrix(popMat$urban[pixelIndexMat], ncol=nDraws) + urbanMat = NULL # don't calculate here for memory's sake + } + } else { + urbanMat = NULL + } + + # determine which EAs are from which area + if(!is.null(clustersPerPixel)) { + areaVals = popMat$area[pixelIndices] + uniqueAreaVals = popMat$area[uniquePixelIndices] + } else { + # areaMat = matrix(popMat$area[pixelIndexMat], ncol=nDraws) + areaMat = NULL # don't calculate here for memory's sake + } + + ##### Line 2: draw cluster effects, epsilon + # NOTE1: we assume there are many more EAs then sampled clusters, so that + # the cluster effects for each EA, including those sampled, are iid + print("simulating EA level risks, numerators, and denominators") + if(is.null(epsc)) { + epsc = matrix(stats::rnorm(totalEAs*nDraws, sd=rep(sigmaEpsilonDraws, each=totalEAs)), ncol=nDraws) + } + + ##### Line 3: draw EA population denominators, N + + if(!is.null(clustersPerPixel)) { + if(is.null(validationPixelI)) + stop("clustersPerPixel must only be set for validation, but validationPixelI is NULL") + + # in this case, every left out cluster has exactly 25 households. Simply sample target population + # with equal probability from each cluster/faux EA + Ncs = sampleNMultilevelMultinomialFixed(clustersPerPixel, nDraws=nDraws, pixelIndices=pixelIndices, + urbanVals=urbanVals, areaVals=areaVals, easpa=easpa, popMat=popMat, stratifyByUrban=stratifyByUrban, + verbose=TRUE) + } else { + if(returnEAinfo) { + out = sampleNMultilevelMultinomial(pixelIndexMat=pixelIndexMat, easpaList=list(easpa), + popMat=popMat, stratifyByUrban=stratifyByUrban, verbose=TRUE, returnEAinfo=returnEAinfo) + householdDraws = out$householdDraws + Ncs = out$targetPopDraws + } else { + Ncs <- sampleNMultilevelMultinomial(pixelIndexMat=pixelIndexMat, easpaList=list(easpa), + popMat=popMat, stratifyByUrban=stratifyByUrban, verbose=TRUE, returnEAinfo=returnEAinfo) + householdDraws = NULL + } + } + + ##### do part of Line 7 in advance + # calculate mu_{ic} for each EA in each pixel + if(!is.null(clustersPerPixel)) { + uc = logitRiskDraws[pixelIndices,] + muc = expit(uc + epsc) + } else { + uc = matrix(logitRiskDraws[cbind(rep(rep(1:nrow(logitRiskDraws), nDraws), times=c(eaSamples)), rep(1:nDraws, each=totalEAs))], ncol=nDraws) + muc = expit(uc + epsc) + } + + # calculate Z_{ic} for each EA in each pixel + Zc = matrix(stats::rbinom(n=totalEAs * nDraws, size=Ncs, prob=as.matrix(muc)), ncol=nDraws) + + ##### Line 4: Aggregate appropriate values from EAs to the grid cell level + + # function for aggregating values for each grid cell + getPixelColumnFromEAs = function(i, vals, applyFun=sum, popWeightMatrix=NULL) { + # calculate levels over which to aggregate + if(!is.null(clustersPerPixel)) { + indices = pixelIndices + } else { + indices = factor(as.character(pixelIndexMat[,i])) + } + + # in this case (the LCPb model), we calculate weighted means within factor levels using popWeightMatrix + if(!is.null(popWeightMatrix)) { + stop("using popWeightMatrix is no longer support, since this is much slower than calculating normalized + weights separately and multiplying values by them outside this function") + # applyFun = function(x) {stats::weighted.mean(x, popWeightMatrix[,i], na.rm=TRUE)} + + Data = data.frame(v=vals[,i], w=popWeightMatrix[,i]) + out = sapply(split(Data, indices), function(x) stats::weighted.mean(x$v,x$w)) + } else { + if(!is.null(clustersPerPixel)) { + # out = tapply(vals[,i], factor(as.character(pixelIndices)), FUN=applyFun) + out = tapply(vals[,i], indices, FUN=applyFun) + } else { + out = tapply(vals[,i], indices, FUN=applyFun) + } + } + + if(!is.null(clustersPerPixel)) { + returnValues = out + } else { + indices = as.numeric(names(out)) + + returnValues = rep(NA, nrow(logitRiskDraws)) + returnValues[indices] = out + } + + returnValues + } + + ##### Line 5: We already did this, resulting in logitRiskDraws input + + ##### Line 6: aggregate population denominators for each grid cell to get N_{ig} + print("Aggregating from EA level to the pixel level") + Ng <- sapply(1:ncol(Ncs), getPixelColumnFromEAs, vals=Ncs) + Ng[is.na(Ng)] = 0 + + ##### Line 7: aggregate response for each grid cell to get Z_{ig} + Zg <- sapply(1:ncol(Zc), getPixelColumnFromEAs, vals=Zc) + + ##### Line 8: Calculate empirical mortality proportions for each grid cell, p_{ig}. + ##### Whenever N_{ig} is 0, set p_{ig} to NA as well + pg = Zg / Ng + pg[Ng == 0] = NA + + ##### calculate results for the other models if necessary + if(doFineScaleRisk || doSmoothRisk) { + nPerEA = getExpectedNperEA(easpa, targetPopMat) + } + + if(doFineScaleRisk) { + fineScaleRisk = sapply(1:ncol(muc), getPixelColumnFromEAs, vals=muc, applyFun=function(x) {mean(x, na.rm=TRUE)}) + fineScaleRisk[!is.finite(fineScaleRisk)] = NA + + # in order to get valid count estimates, we also need the expected denominator per EA in each stratum: + nSamplesFineScaleRisk = sweep(eaSamples, 1, nPerEA, "*") + zSamplesFineScaleRisk = fineScaleRisk * nSamplesFineScaleRisk + zSamplesFineScaleRisk[is.na(zSamplesFineScaleRisk)] = 0 + } else { + fineScaleRisk = NULL + zSamplesFineScaleRisk = NULL + nSamplesFineScaleRisk = NULL + } + + if(doSmoothRisk) { + # integrate out the cluster effect in the risk + smoothRisk = matrix(SUMMER::logitNormMean(cbind(c(as.matrix(logitRiskDraws)), rep(sigmaEpsilonDraws, each=nrow(logitRiskDraws))), logisticApprox=doSmoothRiskLogisticApprox), nrow=nrow(logitRiskDraws)) + + # get expected numerator and denominators + nSamplesSmoothRisk = matrix(rep(targetPopMat$pop, nDraws), ncol=nDraws) + zSamplesSmoothRisk = sweep(smoothRisk, 1, targetPopMat$pop, "*") + } else { + smoothRisk = NULL + zSamplesSmoothRisk = NULL + nSamplesSmoothRisk = NULL + } + + ##### Extra steps: collect draws at each level and generate: + ##### areas, preds, + pixelLevelPop = list(pFineScalePrevalence=pg, ZFineScalePrevalence=Zg, NFineScalePrevalence=Ng, + pFineScaleRisk=fineScaleRisk, ZFineScaleRisk=zSamplesFineScaleRisk, NFineScaleRisk=nSamplesFineScaleRisk, + pSmoothRisk=smoothRisk, ZSmoothRisk=zSamplesSmoothRisk, NSmoothRisk=nSamplesSmoothRisk) + + if(!returnEAinfo) { + pixelLevelPop + } else { + + + # return list of eaDat objects + getEADat = function(i) { + theseI = pixelIndexMat[,i] + + eaDat = data.frame(lon=popMat$lon[theseI], lat=popMat$lat[theseI], + area=popMat$area[theseI], subarea=rep("temp", length(theseI)), + urban=popMat$urban[theseI], east=popMat$east[theseI], north=popMat$north[theseI], + popDensity=popMat$pop[theseI], popDensityTarget=targetPopMat$pop[theseI], pixelIs=theseI, + nHH=householdDraws[,i], N=Ncs[,i], Z=Zc[,i], + pFineScalePrevalence=Zc[,i]/Ncs[,i]) + if(doFineScaleRisk) { + eaDat$pFineScaleRisk=muc[,i] + } + if(doSmoothRisk) { + eaDat$pSmoothRisk=smoothRisk[theseI,i] + } + + if(subareaLevel) { + eaDat$subarea = popMat$subarea[theseI] + } else { + eaDat$subarea = NULL + } + + eaDat$pFineScalePrevalence[eaDat$N == 0] = NA + + eaDat + } + + print("Constructing list of simulated EA data.frames") + eaDatList = lapply(1:nDraws, getEADat) + + c(pixelLevelPop, list(eaDatList=eaDatList, eaSamples=eaSamples)) + } +} + + +#' Internal functions for population simulation +#' +#' Functions for calculating valuable quantities and for drawing from important +#' distributions for population simulation. +#' +#' @param easpa Census frame. See \code{\link{simPopCustom}} for details +#' @param popMat data.frame of pixellated grid of population densities. See \code{\link{simPopCustom}} for details +#' @param i Index +#' @param urban If TRUE, calculate only for urban part of the area. If FALSE, for only rural part +#' @param stratifyByUrban whether or not to stratify calculations by urban/rural classification +#' @param validationPixelI CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of pixels for which we want to simulate populations (used for pixel level validation) +#' @param n Number of samples +#' @param poppsub Population per subarea. See \code{\link{simPopCustom}} for details +#' @param min1PerSubarea Whether or not to ensure there is at least 1 EA per subarea. See \code{\link{simPopCustom}} for details +#' @param method If min1PerSubarea is TRUE, the sampling method for the truncated multinomial to use with rmulitnom1. rmultinom1 automatically +#' switches between them depending on the number of expected samples. The methods are: +#' \describe{ +#' \item{mult1}{rejection sampling from multinomial plus 1 in each category} +#' \item{mult}{rejection sampling from multinomial if any category has zero count} +#' \item{indepMH}{independent Metropolis-Hastings using multinomial plus 1 distribution as proposal} +#' } +#' @param minSample The minimum number of samples per `chunk` of samples for truncated multinomial sampling. Defaults to 1 +#' @param easpsub This could either be total EAs per subarea, or subarea crossed with urban or +#' rural if stratifyByUrban is TRUE +#' @param size Multinomial size parameter. See \code{\link[stats]{rmultinom}} +#' @param prob Multinomial probability vector parameter. See \code{\link[stats]{rmultinom}} +#' @param maxSize The maximum number of elements in a matrix drawn from the proposal distribution per sample chunk. +#' @param maxExpectedSizeBeforeSwitch Max expected number of samples / k, the number of categories, before switching method +#' @param init Initial sample if method is `indepMH` +#' @param burnIn Number of initial samples before samples are collected if method is `indepMH` +#' @param filterEvery Store only every filterEvery samples if method is i`indepMH` +#' @param zeroProbZeroSamples If TRUE, set samples for parts of prob vector that are zero to zero. Otherwise they are set to one. +#' @param allowSizeLessThanK If TRUE, then if size < the number of categories (k), returns matrix where each +#' column is vector of size ones and k - size zeros. If FALSE, throws an error if size < k +#' @param clustersPerPixel CURRENTLY FOR TESTING PURPOSES ONLY a vector of length nIntegrationPoints specifying the number of clusters per pixel if they are fixed +#' @param pixelIndices A nEA x n matrix of pixel indices associated with each EA per simulation/draw +#' @param urbanVals A nEA x n matrix of urbanicities associated with each EA per simulation/draw +#' @param areaVals A nEA x n matrix of area names associated with each EA per simulation/draw +#' @param easpaList A list of length n with each element being of the format of easpa +#' giving the number of households and EAs +#' per stratum. It is assumed that the number of EAs per stratum is +#' the same in each list element. If easpaList is a data frame, +#' number of households per stratum is assumed constant +#' @param nDraws Number of draws +#' @param pixelIndexMat Matrix of pixel indices associated with each EA and draw. Not +#' required by getExpectedNperEA unless level == "EA" +#' @param urbanMat Matrix of urbanicities associated with each EA and draw +#' @param areaMat Matrix of areas associated with each EA and draw +#' @param verbose Whether to print progress as the function proceeds +#' @param returnEAinfo Whether a data frame at the EA level is desired +#' @param minHHPerEA The minimum number of households per EA (defaults to 25, since +#' that is the number of households sampled per DHS cluster) +#' @param fixHHPerEA If not NULL, the fixed number of households per EA +#' @param fixPopPerHH If not NULL, the fixed target population per household +#' @param level Whether to calculate results at the integration grid or EA level +#' @name simPopInternal +NULL + +#' @describeIn simPopInternal Calculates expected denominator per enumeration area. +getExpectedNperEA = function(easpa, popMat, level=c("grid", "EA"), pixelIndexMat=NULL) { + level = match.arg(level) + + # calculate the expected denominator per enumeration area in each stratum. + nPerEAUrban = easpa$popUrb / easpa$EAUrb + nPerEARural = easpa$popRur / easpa$EARur + + # expanded the expected denominator values vector to be of length equal + # to the number of grid cells + uniqueAreas = sort(unique(popMat$area)) + outUrban = numeric(nrow(popMat)) + outRural = numeric(nrow(popMat)) + for(i in 1:length(uniqueAreas)) { + urbanI = getSortIndices(i, urban=TRUE, popMat=popMat, stratifyByUrban=TRUE) + ruralI = getSortIndices(i, urban=FALSE, popMat=popMat, stratifyByUrban=TRUE) + outUrban[urbanI] = nPerEAUrban[i] + outRural[ruralI] = nPerEARural[i] + } + + out = outUrban + outRural + + if(level == "EA") { + if(is.null(pixelIndexMat)) { + stop("if level == EA, must specify pixelIndexMat") + } + out = matrix(out[pixelIndexMat], ncol=ncol(pixelIndexMat)) + } + + out +} + +#' @describeIn simPopInternal For recombining separate multinomials into the draws over all grid points +getSortIndices = function(i, urban=TRUE, popMat, stratifyByUrban=TRUE, validationPixelI=NULL) { + + # get area names + areas = sort(unique(popMat$area)) + + # determine which pixels and how many EAs are in this stratum + if(stratifyByUrban) { + includeI = (popMat$area == areas[i]) & (popMat$urban == urban) + } + else { + includeI = popMat$area == areas[i] + } + + # include only indices included within validation if necessary + if(!is.null(validationPixelI)) { + + # convert validationPixelI into a logical + temp = rep(FALSE, length(includeI)) + temp[validationPixelI] = TRUE + + # include only indices we are interested in for the validation + includeI = includeI & temp + } + + which(includeI) +} + +#' @describeIn simPopInternal Gives nIntegrationPoints x n matrix of draws from the stratified multinomial with values +#' corresponding to the value of |C^g| for each pixel, g (the number of EAs/pixel) +rStratifiedMultnomial = function(n, popMat, easpa, stratifyByUrban=TRUE) { + + # get area names + areas = sort(unique(popMat$area)) + if(any(areas != easpa$area)) + stop("area names and easpa do not match popMat or are not in the correct order") + + # we will need to draw separate multinomial for each stratum. Start by + # creating matrix of all draws of |C^g| + eaSamples = matrix(NA, nrow=nrow(popMat), ncol=n) + + # now draw multinomials + if(stratifyByUrban) { + # draw for each area crossed with urban/rural + urbanSamples = do.call("rbind", lapply(1:length(areas), rMyMultinomial, n=n, urban=TRUE, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpa=easpa)) + ruralSamples = do.call("rbind", lapply(1:length(areas), rMyMultinomial, n=n, urban=FALSE, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpa=easpa)) + + # get the indices used to recombine into the full set of draws + urbanIndices = unlist(sapply(1:length(areas), getSortIndices, urban=TRUE, popMat=popMat, stratifyByUrban=stratifyByUrban)) + ruralIndices = unlist(sapply(1:length(areas), getSortIndices, urban=FALSE, popMat=popMat, stratifyByUrban=stratifyByUrban)) + + # recombine into eaSamples + eaSamples[urbanIndices,] = urbanSamples + eaSamples[ruralIndices,] = ruralSamples + } else { + # draw for each area + stratumSamples = rbind(sapply(1:length(areas), n=n, rMyMultinomial, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpa=easpa)) + + # get the indices used to recombine into the full set of draws + stratumIndices = c(sapply(1:length(areas), getSortIndices, popMat=popMat, stratifyByUrban=stratifyByUrban)) + + # recombine into eaSamples + eaSamples[stratumIndices,] = stratumSamples + } + + # return results + eaSamples +} + +#' @describeIn simPopInternal Gives nIntegrationPoints x n matrix of draws from the stratified multinomial with values +# corresponding to the number of EAs in each pixel +rStratifiedMultnomialBySubarea = function(n, popMat, easpa, stratifyByUrban=TRUE, poppsub=NULL, + min1PerSubarea=TRUE, minSample=1) { + if(is.null(poppsub)) { + # use popMat to calculate poppsub + stop("Calculating poppsub with popMat not yet implemented") + + # poppsub: a table with the following variables: + # subarea + # area + # popUrb + # popRur + # popTotal + } + + # get area names + areas = sort(unique(popMat$area)) + subareas = sort(unique(popMat$subarea)) + if(any(areas != easpa$area)) + stop("area names and easpa do not match popMat or are not in the correct order") + + # we will need to draw separate multinomial for each stratum + + # create temporary popMat, except with one row for each constituency + popSubareaMat = popMat[1:length(subareas),] + popSubareaMat$area = poppsub$area + popSubareaMat$subarea = poppsub$subarea + if(stratifyByUrban) { + popSubareaMat$urban = FALSE + popSubareaMat = rbind(popSubareaMat, popSubareaMat) + popSubareaMat$urban[1:length(subareas)] = TRUE + popSubareaMat$pop[1:length(subareas)] = poppsub$popUrb + popSubareaMat$pop[(length(subareas) + 1):(2 * length(subareas))] = poppsub$popRur + } else { + popSubareaMat$pop = poppsub$popTotal + } + + # now draw multinomials + if(stratifyByUrban) { + # draw for each constituency in each area crossed with urban/rural + urbanSamplesCon = do.call("rbind", lapply(1:length(areas), rMyMultinomial, n=n, urban=TRUE, + stratifyByUrban=stratifyByUrban, popMat=popSubareaMat, easpa=easpa, + min1PerSubarea=min1PerSubarea, method="mult", + minSample=minSample)) + ruralSamplesCon = do.call("rbind", lapply(1:length(areas), rMyMultinomial, n=n, urban=FALSE, + stratifyByUrban=stratifyByUrban, popMat=popSubareaMat, easpa=easpa, + min1PerSubarea=min1PerSubarea, method="mult", + minSample=minSample)) + + # get the indices used to recombine into the full set of draws for the subareas + urbanIndicesCon = unlist(sapply(1:length(areas), getSortIndices, urban=TRUE, popMat=popSubareaMat, stratifyByUrban=stratifyByUrban)) + ruralIndicesCon = unlist(sapply(1:length(areas), getSortIndices, urban=FALSE, popMat=popSubareaMat, stratifyByUrban=stratifyByUrban)) - length(urbanIndicesCon) + + # recombine into eaSamples for the subareas + urbanSamplesCon[urbanIndicesCon,] = urbanSamplesCon + ruralSamplesCon[ruralIndicesCon,] = ruralSamplesCon + + # draw for each pixel crossed with urban/rural + urbanSamples = do.call("rbind", lapply(1:length(subareas), rMyMultinomialSubarea, n=n, urban=TRUE, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpsub=urbanSamplesCon)) + ruralSamples = do.call("rbind", lapply(1:length(subareas), rMyMultinomialSubarea, n=n, urban=FALSE, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpsub=ruralSamplesCon)) + + # get the indices used to recombine into the full set of draws + tempPopMat = popMat + tempPopMat$area = tempPopMat$subarea + urbanIndices = unlist(sapply(1:length(subareas), getSortIndices, urban=TRUE, popMat=tempPopMat, stratifyByUrban=stratifyByUrban)) + ruralIndices = unlist(sapply(1:length(subareas), getSortIndices, urban=FALSE, popMat=tempPopMat, stratifyByUrban=stratifyByUrban)) + + # create matrix of all draws of |C^g| and recombine urban/rural draws into eaSamples + eaSamples = matrix(NA, nrow=nrow(popMat), ncol=n) + eaSamples[urbanIndices,] = urbanSamples + eaSamples[ruralIndices,] = ruralSamples + } else { + # draw for each constituency in each area crossed with urban/rural + samplesCon = do.call("rbind", lapply(1:length(areas), rMyMultinomial, n=n, urban=TRUE, + stratifyByUrban=stratifyByUrban, popMat=popSubareaMat, easpa=easpa, + min1PerSubarea=min1PerSubarea, method="mult", + minSample=minSample)) + + # get the indices used to recombine into the full set of draws for the subareas + indicesCon = unlist(sapply(1:length(areas), getSortIndices, popMat=popSubareaMat, stratifyByUrban=stratifyByUrban)) + + # recombine into eaSamples for the subareas + samplesCon[indicesCon,] = samplesCon + + # draw for each pixel in each constituency + stratumSamples = rbind(sapply(1:length(subareas), n=n, rMyMultinomialSubarea, + stratifyByUrban=stratifyByUrban, popMat=popMat, easpsub=samplesCon)) + + # get the indices used to recombine into the full set of draws + tempPopMat = popMat + tempPopMat$area = tempPopMat$subarea + stratumIndices = c(sapply(1:length(subareas), getSortIndices, popMat=tempPopMat, stratifyByUrban=stratifyByUrban)) + + # create matrix of all draws of |C^g| + eaSamples = matrix(NA, nrow=nrow(popMat), ncol=n) + eaSamples[stratumIndices,] = stratumSamples + } + + # return results + eaSamples +} + +#' @describeIn simPopInternal +rMyMultinomial = function(n, i, stratifyByUrban=TRUE, urban=TRUE, popMat=NULL, easpa=NULL, min1PerSubarea=FALSE, + method=c("mult1", "mult", "indepMH"), minSample=1) { + method = match.arg(method) + + # get area names + areas = sort(unique(popMat$area)) + if(any(areas != easpa$area)) + stop("area names and easpa do not match popMat or are not in the correct order") + + # determine which pixels and how many EAs are in this stratum + if(stratifyByUrban) { + includeI = (popMat$area == areas[i]) & (popMat$urban == urban) + nEA = ifelse(urban, easpa$EAUrb[i], easpa$EARur[i]) + } + else { + includeI = popMat$area == areas[i] + nEA = ifelse(urban, easpa$EAUrb[i], easpa$EATotal[i]) + } + + # sample from the pixels if this stratum exists + if(sum(includeI) == 0) + return(matrix(nrow=0, ncol=n)) + thesePixelProbs = popMat$pop[includeI] + if(any(thesePixelProbs > 0)) { + if(!min1PerSubarea) { + stats::rmultinom(n, nEA, prob=thesePixelProbs) + } else { + rmultinom1(n, nEA, prob=thesePixelProbs, method=method, allowSizeLessThanK=TRUE, minSample=minSample) + } + } else { + matrix(0, nrow=length(thesePixelProbs), ncol=n) + } +} + +#' @describeIn simPopInternal +rMyMultinomialSubarea = function(n, i, easpsub, stratifyByUrban=TRUE, urban=TRUE, popMat=NULL) { + + # get constituency names + subareas = sort(unique(popMat$subarea)) + + # determine which pixels and how many EAs are in this stratum + if(stratifyByUrban) { + includeI = (popMat$subarea == subareas[i]) & (popMat$urban == urban) + } + else { + includeI = popMat$subarea == subareas[i] + } + nEA = easpsub[i,] + + # sample from the pixels if this stratum exists + if(sum(includeI) == 0){ + if(any(nEA != 0)) + stop(paste0("no valid pixels to put EAs in for subarea ", as.character(subareas[i]), " and urban level ", urban)) + return(matrix(nrow=0, ncol=n)) + } + + thesePixelProbs = popMat$pop[includeI] + nonZeroEAs = nEA != 0 + out = matrix(0, nrow=sum(includeI), ncol=n) + if(any(nonZeroEAs)) { + out[,nonZeroEAs] = sapply(nEA[nonZeroEAs], stats::rmultinom, n=1, prob=thesePixelProbs) + } + out +} + +#' @describeIn simPopInternal Random (truncated) multinomial draws conditional on the number of each type being at least one +rmultinom1 = function(n=1, size, prob, maxSize=8000*8000, method=c("mult1", "mult", "indepMH"), verbose=FALSE, minSample=100, + maxExpectedSizeBeforeSwitch=1000*1e7, init=NULL, burnIn=floor(n/4), filterEvery=10, zeroProbZeroSamples=TRUE, + allowSizeLessThanK=FALSE) { + method = match.arg(method) + prob = prob*(1/sum(prob)) + + if(zeroProbZeroSamples && any(prob == 0)) { + zero = prob == 0 + out = matrix(0, nrow=length(prob), ncol=n) + + if(sum(!zero) > 0) { + out[!zero,] = rmultinom1(n, size, prob[!zero], maxSize, method, verbose, minSample, maxExpectedSizeBeforeSwitch, init, burnIn, + filterEvery, zeroProbZeroSamples, allowSizeLessThanK) + } + + return(out) + } + + k = length(prob) + if(allowSizeLessThanK && (size <= k)) { + return(replicate(n, as.numeric(1:k %in% sample(1:k, size, replace=FALSE)))) + } else if(size < k) { + stop("size < k but rmultinom1 requires at least 1 sample per multinomial type") + } + + maxSamples = floor(maxSize / k) + averageProbMult = prod((size/k)*prob) + + if(method != "indepMH") + samples = matrix(NA, nrow=k, ncol=n) + else + samples = matrix(NA, nrow=k, ncol=round(n*filterEvery)) + if(method == "mult1") { + averagex = 1 + (size-k)*prob + averageProb = (size-k) / (prod(averagex)) + + while(any(is.na(samples))) { + # calculate the number of remaining samples + samplesLeft = sum(apply(samples, 2, function(x) {any(is.na(x))})) + + # approximate expected number of samples so that, after some are rejected, we will + # have the right number of samples. Kappa is approximated for when p_1=p_2=...=p_k + expectedSamples = ceiling(samplesLeft/averageProb) + # logMStar = lfactorial(size) - lfactorial(size-k) + sum(log(prob)) - log(size-k+1) + # avgAcceptProb = exp(-logMStar) + # logKappaApprox = lchoose(size + k - 1, k - 1) - lchoose(size - 1, k - 1) + # logM = logMStar -logKappaApprox + # avgAcceptProb = exp(-logM) + # expectedSamples = ceiling(samplesLeft/avgAcceptProb) + + if(expectedSamples*k > maxExpectedSizeBeforeSwitch) { + warning("too many samples expected with method=='mult1'. Switching to method=='indepMH'") + return(rmultinom1(n, size, prob, maxSize, method="indepMH", verbose, minSample, + maxExpectedSizeBeforeSwitch, init, burnIn, filterEvery, + zeroProbZeroSamples, allowSizeLessThanK)) + } + + # sample expectedSamples times a fudge factor, but make sure we don't get past memory limit + thisNumberOfSamples = max(minSample, min(maxSamples, expectedSamples * 1.1)) + if(verbose) + print(paste0("Sampling ", thisNumberOfSamples, ". Sampled ", n-samplesLeft, "/", n, ". Expected remaining samples: ", expectedSamples)) + thisSamples = matrix(1 + stats::rmultinom(thisNumberOfSamples, size-k, prob=prob), nrow=length(prob)) + + # calculate accept probabilities + thisProbs = exp(log(size-k) - apply(log(thisSamples), 2, sum)) + # thisProbs = (size-k) / apply(thisSamples, 2, prod) + if(verbose) { + print(paste0("Max sampled accept prob: ", max(thisProbs), ". Mean sampled accept prob: ", mean(thisProbs))) + print(paste0("Expected number of samples based on avg sampled acceptance prob: ", ceiling(samplesLeft/mean(thisProbs)))) + } + + # reject relevant samples + u = stats::runif(thisNumberOfSamples) + thisSamples = matrix(thisSamples[,u n) { + thisSamples = matrix(thisSamples[,1:samplesLeft], nrow=length(prob)) + } + + # add in accepted samples, if any + if(ncol(thisSamples) > 0) { + samples[,(n-samplesLeft+1):(n-samplesLeft+ncol(thisSamples))] = thisSamples + } else { + warning(paste0("no samples accepted this round out of ", thisNumberOfSamples, " total. Redrawing...")) + } + } + } else if(method == "mult") { + + while(any(is.na(samples))) { + # calculate the number of remaining samples + samplesLeft = sum(apply(samples, 2, function(x) {any(is.na(x))})) + + # approximate expected number of samples so that, after some are rejected, we will + # have the right number of samples + expectedSamples = ceiling(samplesLeft/averageProbMult) + + if(expectedSamples*k > maxExpectedSizeBeforeSwitch) { + warning("too many samples expected with method=='mult'. Switching to method=='mult1'") + return(rmultinom1(n, size, prob, maxSize, method="mult1", verbose, minSample, maxExpectedSizeBeforeSwitch, + init, burnIn, filterEvery, + zeroProbZeroSamples, allowSizeLessThanK)) + } + + # sample expectedSamples times a fudge factor, but make sure we don't get past memory limit + thisNumberOfSamples = max(minSample, min(maxSamples, expectedSamples * 1.1)) + if(verbose) + print(paste0("Sampling ", thisNumberOfSamples, ". Sampled ", n-samplesLeft, "/", n, ". Expected remaining samples: ", expectedSamples)) + thisSamples = matrix(stats::rmultinom(thisNumberOfSamples, size, prob=prob), ncol=thisNumberOfSamples) + + # reject relevant samples + accept = apply(thisSamples, 2, function(x) {all(x>0)}) + thisSamples = matrix(thisSamples[,accept], nrow=length(prob)) + + # remove excess samples if necessary + totalSamples = ncol(thisSamples) + n - samplesLeft + if(totalSamples > n) { + thisSamples = matrix(thisSamples[,1:samplesLeft], nrow=length(prob)) + } + + # add in accepted samples, if any + if(ncol(thisSamples) > 0) { + samples[,(n-samplesLeft+1):(n-samplesLeft+ncol(thisSamples))] = thisSamples + } else { + warning(paste0("no samples accepted this round out of ", thisNumberOfSamples, " total. Redrawing...")) + } + } + } else if(method == "indepMH") { + # we use the mult1 method for independent proposals with independent Metropolis-Hastings + # (https://www.statlect.com/fundamentals-of-statistics/Metropolis-Hastings-algorithm#hid9) + + # initialize at something reasonable, if not set by user + if(is.null(init)) { + init = 1 + floor(size*prob) + while(sum(init) > size) { + tooMuchBy = sum(init) - size + numberReducible = sum(init > 1) + reduceNumber = min(tooMuchBy, numberReducible) + init[init > 1][1:reduceNumber] = init[init > 1][1:reduceNumber] - 1 + } + } + if(sum(init) != size) + stop("sum(init) != size") + + # approximate target log-density + lp <- log(prob) + lf <- function(x) { + if(any(x < 1) || sum(x) != size) + return(-Inf) + sum(lp*x - lfactorial(x)) + } + + # true proposal log-density + lq = function(x) { + if(sum(x) != size) + return(-Inf) + sum(lp*(x-1) - lfactorial(x-1)) + lfactorial(size-k) + } + + # proposal function + q <- function(x) { + 1 + stats::rmultinom(1, size-k, prob) + } + + # do the sampling + tmp <- init + ar <- 0 + for (i in 1:burnIn) { + proposal <- q(tmp) + p <- exp((lf(proposal) - lq(proposal)) - (lf(tmp) - lq(tmp))) + if (stats::runif(1) < p) { + tmp <- proposal + } + } + for (i in 1:ncol(samples)) { + proposal <- q(tmp) + p <- exp((lf(proposal) - lq(proposal)) - (lf(tmp) - lq(tmp))) + if (stats::runif(1) < p) { + tmp <- proposal + ar <- ar + 1 + } + samples[,i] <- tmp + } + + # calculated acceptance percentage + if(verbose) { + print(paste0("acceptance percentage: ", ar/ncol(samples))) + } + + # estimate autocorrelation in samples (if it is na then all samples have the same value there) + calcCor = apply(samples, 1, function(x) {cor(x[-1], x[-length(x)])}) + calcCor[is.na(calcCor)] = 1 + print(paste0("estimated mean (max) lag 1 correlation after filtering: ", mean(calcCor^filterEvery), " (", max(calcCor^filterEvery), ")")) + + # filter out samples to reduce autocorrelation + samples = samples[,seq(from=1, to=ncol(samples), by=filterEvery)] + } + + samples +} + +#' @describeIn simPopInternal Take multilevel multinomial draws first from joint distribution of +#' number of households per EA given the total per stratum, and then from the joint +#' distribution of the total target population per household given +#' the total per stratum +sampleNMultilevelMultinomial = function(nDraws = ncol(pixelIndexMat), pixelIndexMat=NULL, urbanMat=NULL, areaMat=NULL, easpaList, + popMat, stratifyByUrban=TRUE, verbose=TRUE, returnEAinfo=FALSE, + minHHPerEA=25, fixHHPerEA=NULL, fixPopPerHH=NULL) { + + if(length(easpaList) == 1) { + easpaList = replicate(nDraws, easpaList[[1]], simplify=FALSE) + } + areas = easpaList[[1]]$area + + if((is.null(areaMat) || is.null(urbanMat)) && is.null(pixelIndexMat)) { + stop("user must either supply pixelIndexMat or both areaMat and urbanMat") + } + if(is.null(areaMat)) { + areaIs = match(popMat$area, sort(unique(areas))) # convert from character to indices to save memory + areaMat = matrix(areaIs[pixelIndexMat], ncol=nDraws) + } + if(is.null(urbanMat)) { + urbanMat = matrix(popMat$urban[pixelIndexMat], ncol=nDraws) + } + + # start by drawing the totals, then divide households amongst EAs, then divide target population amongst households. + # Make sure there are at least 25 households per EA (only divide the rest randomly) + + ##### Draw the totals + + # get the total number of enumeration areas per stratum (this does not change between draws) + areasIs = match(areas, sort(unique(areas))) # convert from character to indices + totalEAsUrban = easpaList[[1]]$EAUrb + totalEAsRural = easpaList[[1]]$EARur + totalEAs = easpaList[[1]]$EATotal + nEAs = sum(totalEAs) + + ##### draw the total target population per enumeration area + + targetPopDraws = matrix(nrow=nEAs, ncol=nDraws) + if(returnEAinfo) { + householdDraws = matrix(nrow=nEAs, ncol=nDraws) + } + + # Draw the number of households per stratum area that will be randomly distributed (total minus the minimum minHHPerEA) + if(stratifyByUrban) { + totalHouseholdsUrban = sweep(matrix(sapply(easpaList, function(x) {x$HHUrb}), ncol=length(easpaList)), 1, -minHHPerEA*totalEAsUrban, "+") + totalHouseholdsRural = sweep(matrix(sapply(easpaList, function(x) {x$HHRur}), ncol=length(easpaList)), 1, -minHHPerEA*totalEAsRural, "+") + totalChildrenUrban = matrix(sapply(easpaList, function(x) {x$popUrb}), ncol=length(easpaList)) + totalChildrenRural = matrix(sapply(easpaList, function(x) {x$popRur}), ncol=length(easpaList)) + } else { + totalHouseholds = sweep(sapply(matrix(easpaList, function(x) {x$HHTotal}), ncol=length(easpaList)), 1, -minHHPerEA*totalEAs, "+") + totalChildren = matrix(sapply(easpaList, function(x) {x$popHHTotal}), ncol=length(easpaList)) + } + + # distribute the households throughout the enumeration areas with multinomial distribution, then + # distribute the target population amongst the households, also with a multinomial distribution + for(i in 1:length(areasIs)) { + thisArea = areasIs[i] + thisAreaL = areaMat==thisArea + + # print progress if in verbose mode + if(verbose) { + print(paste0("drawing Ns for each EA for area ", thisArea, " (", i, "/", length(areas), ")")) + } + + # draw households per EA (make sure there are any rural EAs) + if(stratifyByUrban) { + if(totalEAsUrban[i] != 0) { + householdDrawsUrban <- matrix(sapply(totalHouseholdsUrban[i,], stats::rmultinom, n=1, prob=rep(1/totalEAsUrban[i], totalEAsUrban[i])), nrow=totalEAsUrban[i], ncol=nDraws) + minHHPerEA + } + if(totalEAsRural[i] != 0) { + householdDrawsRural <- matrix(sapply(totalHouseholdsRural[i,], stats::rmultinom, n=1, prob=rep(1/totalEAsRural[i], totalEAsRural[i])), nrow=totalEAsRural[i], ncol=nDraws) + minHHPerEA + } + + # if we must return EA info, we must return the household draws for each EA: + if(returnEAinfo) { + if(totalEAsUrban[i] != 0) { + householdDraws[thisAreaL & urbanMat] = householdDrawsUrban + } + if(totalEAsRural[i] != 0) { + householdDraws[thisAreaL & !urbanMat] = householdDrawsRural + } + } + } else { + householdDraws = matrix(sapply(totalHouseholds[i,], stats::rmultinom, n=1, prob=rep(1/totalEAs[i], totalEAs[i])), nrow=totalEAs[i], ncol=nDraws) + minHHPerEA + } + + # drawing target population per EA + if(is.null(fixPopPerHH)) { + # draw target pop per EA with probability proportional to the number of households + if(stratifyByUrban) { + if(totalEAsUrban[i] != 0) { + probsUrban = sweep(householdDrawsUrban, 2, 1 / colSums(householdDrawsUrban), "*") + targetPopDraws[thisAreaL & urbanMat] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildrenUrban[i,j], probsUrban[,j])}) + } + + if(totalEAsRural[i] != 0) { + probsRural = sweep(householdDrawsRural, 2, 1 / colSums(householdDrawsRural), "*") + targetPopDraws[thisAreaL & !urbanMat] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildrenRural[i,j], probsRural[,j])}) + } + } else { + probs = sweep(householdDraws, 2, 1 / colSums(householdDraws), "*") + targetPopDraws[thisAreaL] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildren[i,j], probs[,j])}) + } + } else { + # set target pop per EA based on fixed number per household + if(stratifyByUrban) { + if(totalEAsUrban[i] != 0) { + targetPopDraws[thisAreaL & urbanMat] = fixPopPerHH*householdDrawsUrban + } + if(totalEAsRural[i] != 0) { + targetPopDraws[thisAreaL & !urbanMat] = fixPopPerHH*householdDrawsRural + } + } else { + targetPopDraws[thisAreaL] = fixPopPerHH*householdDraws + } + } + } + + ##### Return results + if(!returnEAinfo) { + targetPopDraws + } else { + list(householdDraws=householdDraws, targetPopDraws=targetPopDraws) + } +} + +#' @describeIn simPopInternal Same as sampleNMultilevelMultinomial, except the number of EAs per pixel is fixed +sampleNMultilevelMultinomialFixed = function(clustersPerPixel, nDraws=ncol(pixelIndices), pixelIndices=NULL, + urbanVals=NULL, areaVals=NULL, easpa, popMat, stratifyByUrban=TRUE, + verbose=TRUE) { + + # set default inputs + if((is.null(areaVals) || is.null(urbanVals)) && is.null(pixelIndices)) { + stop("user must either supply pixelIndices or both areaVals and urbanVals") + } + if(is.null(areaVals)) { + areaVals = matrix(popMat$area[pixelIndices], ncol=nDraws) + } + if(is.null(urbanVals)) { + urbanVals = matrix(popMat$urban[pixelIndices], ncol=nDraws) + } + + # start by drawing the totals, then divide households amongst EAs, then divide target population amongst households. + # Make sure there are at least 25 households per EA (only divide the rest randomly) + + ##### Draw the totals + + # get the total number of enumeration areas per stratum (this does not change between draws) + areas = easpa$area + totalEAsUrban = easpa$EAUrb + totalEAsRural = easpa$EARur + totalEAs = easpa$EATotal + nEAs = sum(totalEAs) + + if(nEAs != sum(clustersPerPixel)) { + stop("sum(easpa$EATotal) != sum(clustersPerPixel)") + } + + ##### draw the total target population per EA + targetPopDraws = matrix(nrow=nEAs, ncol=nDraws) + + # Draw the number of households per stratum area that will be randomly distributed (total minus the minimum 25) + if(stratifyByUrban) { + totalHouseholdsUrban = easpa$HHUrb -25*totalEAsUrban + totalHouseholdsRural = easpa$HHRur -25*totalEAsRural + totalChildrenUrban = easpa$popUrb + totalChildrenRural = easpa$popRur + } else { + totalHouseholds = easpa$HHTotal - 25*totalEAs + totalChildren = easpa$popHHTotal + } + + # distribute the households throughout the enumeration areas with multinomial distribution, then + # distribute the target population amongst the households, also with a multinomial distribution + for(i in 1:length(areas)) { + thisArea = areas[i] + + # print progress if in verbose mode + if(verbose) { + print(paste0("drawing Ns for each EA for area ", thisArea, " (", i, "/", length(areas), ")")) + } + + # draw households per EA (make sure there are any rural EAs) + if(stratifyByUrban) { + if(totalEAsUrban[i] != 0) { + if(any(totalHouseholdsUrban != 0)) { + householdDrawsUrban = stats::rmultinom(n=nDraws, size=totalHouseholdsUrban[i], prob=rep(1/totalEAsUrban[i], totalEAsUrban[i])) + 25 + } else { + householdDrawsUrban = matrix(rep(25, totalEAsUrban[i]*nDraws), ncol=nDraws) + } + } + + if(totalEAsRural[i] != 0) { + if(any(totalHouseholdsRural != 0)) { + householdDrawsRural = stats::rmultinom(n=nDraws, size=totalHouseholdsRural[i], prob=rep(1/totalEAsRural[i], totalEAsRural[i])) + 25 + } else { + householdDrawsRural = matrix(rep(25, totalEAsRural[i]*nDraws), ncol=nDraws) + } + } + } else { + if(any(totalHouseholdsRural != 0)) { + householdDraws = stats::rmultinom(n=nDraws, size=totalHouseholds[i], prob=rep(1/totalEAs[i], totalEAs[i])) + 25 + } else { + householdDraws = matrix(rep(25, totalEAs[i]*nDraws), ncol=nDraws) + } + } + + # drawing target population per EA, with probability proportional to the number of households + if(stratifyByUrban) { + + if(totalEAsUrban[i] != 0) { + probsUrban = sweep(householdDrawsUrban, 2, 1 / colSums(householdDrawsUrban), "*") + targetPopDraws[areaVals==thisArea & urbanVals] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildrenUrban[i], probsUrban[,j])}) + } + + if(totalEAsRural[i] != 0) { + probsRural = sweep(householdDrawsRural, 2, 1 / colSums(householdDrawsRural), "*") + targetPopDraws[areaVals==thisArea & !urbanVals] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildrenRural[i], probsRural[,j])}) + } + + } else { + probs = sweep(householdDraws, 2, 1 / colSums(householdDraws), "*") + targetPopDraws[areaVals==thisArea] = sapply(1:nDraws, function(j) {stats::rmultinom(1, totalChildren[i], probs[,j])}) + } + } + + ##### Return results + targetPopDraws +} \ No newline at end of file diff --git a/R/simSPDE.R b/R/simSPDE.R new file mode 100755 index 0000000..79ae073 --- /dev/null +++ b/R/simSPDE.R @@ -0,0 +1,59 @@ +#' Simulate from the SPDE spatial model +#' +#' Generates nCoords x nsim matrix of simulated +#' values of the SPDE spatial process +#' +#' @param coords 2 column matrix of spatial coordinates at which to simulate the spatial process +#' @param nsim number of draws from the SPDE model +#' @param mesh SPDE mesh +#' @param effRange effective spatial range +#' @param margVar marginal variance of the spatial process +#' @param inla.seed seed input to inla.qsample. 0L sets seed intelligently, +#' > 0 sets a specific seed, < 0 keeps existing RNG +#' +#' @author John Paige +#' +#' @references Lindgren, F., Rue, H., Lindström, J., 2011. An explicit link between Gaussian fields and Gaussian Markov random fields: the stochastic differential equation approach (with discussion). Journal of the Royal Statistical Society, Series B 73, 423–498. +#' +#' @examples +#' \dontrun{ +#' set.seed(123) +#' require(INLA) +#' coords = matrix(runif(10*2), ncol=2) +#' mesh = inla.mesh.2d(loc.domain=cbind(c(0, 0, 1, 1), c(0, 1, 0, 1)), +#' n=3000, max.n=5000, max.edge=c(.01, .05), offset=-.1) +#' simVals = simSPDE(coords, nsim=1, mesh, effRange=.2, inla.seed=1L) +#' } +#' +#' @export +simSPDE = function(coords, nsim=1, mesh, effRange=(max(coords[,1])-min(coords[,1]))/3, margVar=1, inla.seed=0L) { + + # calculate SPDE model parameters based on Lindgren Rue (2015) "Bayesian Spatial Modelling with R-INLA" + meshSize <- min(c(diff(range(mesh$loc[, 1])), diff(range(mesh$loc[, 2])))) + # it is easier to use theta and set sigma0 to 1 then to set sigma0 and the effective range directly + # kappa0 <- sqrt(8)/effRange * meshSize # since nu = 1 + # kappa0 <- sqrt(8)/effRange # since nu = 1 + # kappa0 = sqrt(8) / 5 + # logKappa = log(kappa0) + sigma0 = 1 + # tau0 <- 1/(sqrt(4 * pi) * kappa0 * sigma0) + # logTau = log(tau0) + + # from page 5 of the paper listed above: + logKappa = 0.5 * log(8) + logTau = 0.5 * (lgamma(1) - (lgamma(2) + log(4*pi))) - logKappa + theta = c(log(sqrt(margVar)), log(effRange)) + spde <- INLA::inla.spde2.matern(mesh, B.tau = cbind(logTau, -1, +1), + B.kappa = cbind(logKappa, 0, -1), theta.prior.mean = theta, + theta.prior.prec = c(0.1, 1)) + + # generate A and Q precision matrix + Q = INLA::inla.spde2.precision(spde, theta = theta) + A = INLA::inla.spde.make.A(mesh, coords) + + # generate simulations + simField = INLA::inla.qsample(nsim, Q, seed=inla.seed) + simDat = as.matrix(A %*% simField) + + simDat +} \ No newline at end of file diff --git a/README.md b/README.md index 12067d8..5de2eb0 100755 --- a/README.md +++ b/README.md @@ -4,10 +4,25 @@ SAE Unit/area Models and Methods for Estimation in R + ## Next major update (version 1.4.0) -+ New major functions for simulation and using SPDE spatial model and aggregation over grids. -+ New datasets on Kenya population map. -+ New methods for benchmarking. ++ New major functions for population and prevalence simulation based on population frame and population density information, along with methods for aggregating pixel level prevalences and populations to the areal level. + ++ New major functions. + * ``simSPDE``: Simulates from the SPDE spatial model. + * ``simPopSPDE``: Simulates population in a country based on SPDE spatial model and population frame information. + * ``simPopPixel``: Simulates population in a country based on custom spatial model and population frame information. ++ New minor functions. + * ``aggPixelPreds`` and ``areaPopToArea``: Functions for aggregating populations and prevalences to the areal level. + * Functions for setting pixellated grid and population density grid. + * Functions for adjusting population density grid based on population frame information. + * Functions for projections to use with Kenya maps and for getting Admin areas given spatial locations. ++ New internal functions for helping with population simulation. ++ New datasets + * ``kenyaMaps``: Kenya administrative area map shapefiles, and a triangular mesh for the SPDE model for Kenya. + * ``kenyaPopulationData``: General and neonatal population frames for Kenya, and general and neonatal population density information for Kenya. + * ``popMatKenya`` and ``popMatKenyaNeonatal``: General and neonatal population density grids at 5km resolution. + ## Major update (version 1.3.0) + New SAE functions and vignette. @@ -17,6 +32,12 @@ SAE Unit/area Models and Methods for Estimation in R + Allows `smoothDirect` and `smoothCluster` to fit space-only models. + S3 class output. +## Major update (version 1.1.0) ++ Major expansion for `smoothSurvey` to implement popular SAE methods. Syntax change to the function. ++ Allows `smoothDirect` and `smoothCluster` to fit space-only models. ++ `smoothSurvey`, `smoothDirect`, and `smoothCluster` now returns S3 classed objects. + + ## Major update (version 1.1.0) + Major expansion for `smoothSurvey` to implement popular SAE methods. Syntax change to the function. + Allows `smoothDirect` and `smoothCluster` to fit space-only models. diff --git a/data/kenyaPopulationData.rda b/data/kenyaPopulationData.rda new file mode 100755 index 0000000..6706e2d Binary files /dev/null and b/data/kenyaPopulationData.rda differ diff --git a/inst/.DS_Store b/inst/.DS_Store new file mode 100755 index 0000000..ec1eba7 Binary files /dev/null and b/inst/.DS_Store differ diff --git a/man/malawiMap.Rd b/man/MalawiMap.Rd similarity index 100% rename from man/malawiMap.Rd rename to man/MalawiMap.Rd diff --git a/man/aggPixelPreds.Rd b/man/aggPixelPreds.Rd new file mode 100755 index 0000000..afc3f1f --- /dev/null +++ b/man/aggPixelPreds.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simPop.R +\name{aggPixelPreds} +\alias{aggPixelPreds} +\title{Helper function of \code{\link{pixelPopToArea}}} +\usage{ +aggPixelPreds( + Zg, + Ng, + areas, + urban = targetPopMat$urban, + targetPopMat = NULL, + useDensity = FALSE, + stratifyByUrban = TRUE, + normalize = useDensity +) +} +\arguments{ +\item{Zg}{nIntegrationPoint x nsim matrix of simulated response (population numerators) for each pixel and sample} + +\item{Ng}{nIntegrationPoint x nsim matrix of simulated counts (population denominators) for each pixel and sample} + +\item{areas}{nIntegrationPoint length character vector of areas (or subareas)} + +\item{urban}{nIntegrationPoint length vector of indicators specifying whether or not pixels are urban or rural} + +\item{targetPopMat}{same as in \code{\link{simPopCustom}}} + +\item{useDensity}{whether to use population density as aggregation weights.} + +\item{stratifyByUrban}{whether or not to stratify simulations by urban/rural classification} + +\item{normalize}{if TRUE, pixel level aggregation weights within specified area are normalized to sum to 1. This produces an +average of the values in Zg rather than a sum. In general, should only be set to TRUE for smooth integrals of risk.} +} +\description{ +Aggregates population from the +pixel level to the level of the area of interest. +} diff --git a/man/aggPop.Rd b/man/aggPop.Rd new file mode 100755 index 0000000..5ef77f5 --- /dev/null +++ b/man/aggPop.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simPop.R +\name{aggPop} +\alias{aggPop} +\alias{pixelPopToArea} +\alias{areaPopToArea} +\title{Aggregate populations to the specified areal level} +\usage{ +pixelPopToArea( + pixelLevelPop, + eaSamples, + areas, + stratifyByUrban = TRUE, + targetPopMat = NULL, + doFineScaleRisk = !is.null(pixelLevelPop$fineScaleRisk$p), + doSmoothRisk = !is.null(pixelLevelPop$smoothRisk$p) +) + +areaPopToArea( + areaLevelPop, + areasFrom, + areasTo, + stratifyByUrban = TRUE, + doFineScaleRisk = !is.null(areaLevelPop$aggregationResults$pFineScaleRisk), + doSmoothRisk = !is.null(areaLevelPop$aggregationResults$pSmoothRisk) +) +} +\arguments{ +\item{pixelLevelPop}{pixel level population information that we want aggregate. In the same format as output from \code{\link{simPopCustom}}} + +\item{eaSamples}{nIntegrationPoint x nsim matrix of the number of enumeration areas per pixel sampled in the input pixel level population} + +\item{areas}{character vector of length nIntegrationPoints of area names over which we +want to aggregate. Can also be subareas} + +\item{stratifyByUrban}{whether or not to stratify simulations by urban/rural classification} + +\item{targetPopMat}{pixellated grid data frame with variables `lon`, `lat`, `pop` (target population), `area`, `subareas` (if subareaLevel is TRUE), `urban` (if stratifyByUrban is TRUE), `east`, and `north`} + +\item{doFineScaleRisk}{whether or not to calculate the fine scale risk in addition to the prevalence. See details} + +\item{doSmoothRisk}{Whether or not to calculate the smooth risk in addition to the prevalence. See details} + +\item{areaLevelPop}{output of \code{\link{simPopCustom}} containing pixel level information +about the population of interest} + +\item{areasFrom}{character vector of length equal to the number of areas from which +we would like to aggregate containing the unique names of the areas. +Can also be subareas, but these are smaller than the "to areas", and +each "from area" must be entirely contained in a single "to area"} + +\item{areasTo}{character vector of length equal to the number of areas from which +we would like to aggregate containing the names of the areas containing +with each respective `from' area. Can also be a set of subareas, +but these are larger than the "from areas".} +} +\value{ +A list containing elements `fineScalePrevalence` and `fineScaleRisk`. Each +of these are in turn lists with aggregated prevalence and risk for the area of +interest, containg the following elements, were paranethesis indicate the elements +for the fineScaleRisk model rather than fineScalePrevalence: +\item{p}{Aggregated prevalence (risk), calculated as aggregate of Z divided by +aggregate of N} +\item{Z}{Aggregated (expected) population numerator} +\item{N}{Aggregated (expected) population denominator} +\item{pUrban}{Aggregated prevalence (risk) in urban part of the area, calculated +as aggregate of Z divided by aggregate of N} +\item{ZUrban}{Aggregated (expected) population numerator in urban part of the area} +\item{NUrban}{Aggregated (expected) population denominator in urban part of the area} +\item{pRural}{Aggregated prevalence (risk) in rural part of the area, calculated +as aggregate of Z divided by aggregate of N} +\item{ZRural}{Aggregated (expected) population numerator in rural part of the area} +\item{NRural}{Aggregated (expected) population denominator in rural part of the area} +\item{A}{Aggregation matrix used to aggregate from pixel level to areal level} +\item{AUrban}{Aggregation matrix used to aggregate from pixel level to urban part of the areal level} +\item{ARural}{Aggregation matrix used to aggregate from pixel level to rural part of the areal level} +} +\description{ +Takes simulated populations and aggregates +them to the specified areal level. Also calculates the aggregated risk and prevalence. +} +\section{Functions}{ +\itemize{ +\item \code{pixelPopToArea()}: Aggregate from pixel to areal level + +\item \code{areaPopToArea()}: Aggregate areal populations to another areal level + +}} +\examples{ +\dontrun{ +##### Now we make a model for the risk. We will use an SPDE model with these +##### parameters for the linear predictor on the logist scale, which are chosen +##### to be of practical interest: +beta0=-2.9 # intercept +gamma=-1 # urban effect +rho=(1/3)^2 # spatial variance +effRange = 400 # effective spatial range in km +sigmaEpsilon=sqrt(1/2.5) # cluster (nugget) effect standard deviation + +# simulate the population! Note that this produces multiple dense +# nEA x nsim and nIntegrationPoint x nsim matrices. In the future +# sparse matrices will and chunk by chunk computations may be incorporated. +simPop = simPopSPDE(nsim=1, easpa=easpaKenyaNeonatal, + popMat=popMatKenya, targetPopMat=popMatKenyaNeonatal, + poppsub=poppsubKenya, spdeMesh=kenyaMesh, + margVar=rho, sigmaEpsilonSq=sigmaEpsilon^2, + gamma=gamma, effRange=effRange, beta0=beta0, + seed=123, inla.seed=12, nHHSampled=25, + stratifyByUrban=TRUE, subareaLevel=TRUE, + doFineScaleRisk=TRUE, + min1PerSubarea=TRUE) + +pixelPop = simPop$pixelPop +subareaPop = pixelPopToArea(pixelLevelPop=pixelPop, eaSamples=pixelPop$eaSamples, + areas=popMatKenya$subarea, stratifyByUrban=TRUE, + targetPopMat=popMatKenyaNeonatal, doFineScaleRisk=TRUE) + +# get areas associated with each subarea for aggregation +tempAreasFrom = popMatKenya$subarea +tempAreasTo = popMatKenya$area +areasFrom = sort(unique(tempAreasFrom)) +areasToI = match(areasFrom, tempAreasFrom) +areasTo = tempAreasTo[areasToI] + +# do the aggregation from subareas to areas +outAreaLevel = areaPopToArea(areaLevelPop=subareaPop, + areasFrom=areasFrom, areasTo=areasTo, + stratifyByUrban=TRUE, doFineScaleRisk=TRUE) +} +} +\references{ +In Preparation +} +\seealso{ +\code{\link{areaPopToArea}} +} +\author{ +John Paige +} diff --git a/man/calibrateByRegion.Rd b/man/calibrateByRegion.Rd new file mode 100755 index 0000000..a9b4382 --- /dev/null +++ b/man/calibrateByRegion.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/popGrid.R +\name{calibrateByRegion} +\alias{calibrateByRegion} +\title{Calibrate the point level totals so their sum matches the regional totals} +\usage{ +calibrateByRegion(pointTotals, pointRegions, regions, regionTotals) +} +\arguments{ +\item{pointTotals}{Vector of point level totals that will be calibrated/normalized} + +\item{pointRegions}{Vector of regions associated with each point} + +\item{regions}{Vector of region names} + +\item{regionTotals}{Vector of desired region level totals associated with `regions`} +} +\value{ +A vector of same length as pointTotals and pointRegions containing +the calibrated/normalized point totals that sum to the correct regional totals + +Vector of updated point level totals, calibrated to match region totals +} +\description{ +Calibrate/normalize the point level totals so their sum matches the +regional totals. Technically, the totals can be at any level smaller +than the region level specified. +} +\examples{ +pointTotals = c(1, 1, 1, 2) +pointRegions = c("a", "a", "b", "b") +regionTotals = c(10, 20) +regions = c("a", "b") +calibrateByRegion(pointTotals, pointRegions, regions, regionTotals) + +} +\author{ +John Paige +} diff --git a/man/getAreaName.Rd b/man/getAreaName.Rd new file mode 100755 index 0000000..2a24066 --- /dev/null +++ b/man/getAreaName.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getAreaName.R +\name{getAreaName} +\alias{getAreaName} +\title{Determines which administrative areas contain the given points} +\usage{ +getAreaName( + pts, + shapefile, + areaNameVar = "NAME_1", + delta = 0.05, + mean.neighbor = 50, + maxBytes = 3 * 2^30 +) +} +\arguments{ +\item{pts}{2 column matrix of lon/lat coordinates} + +\item{shapefile}{A SpatialPolygonsDataFrame object} + +\item{areaNameVar}{The column name in \code{slot(shapefile, "data")} +corresponding to the area level of interest} + +\item{delta}{Argument passed to fields::fields.rdist.near in fields package} + +\item{mean.neighbor}{Argument passed to fields::fields.rdist.near in fields +package} + +\item{maxBytes}{Maximum allowed memory in bytes (default is 3Gb). Determines +whether to call fields::fields.rdist.near which saves memory bute requires +delta and mean.neighbor inputs to be specified or fields::rdist.} +} +\value{ +A list of area IDs, area names, whether or not +points are in multiple areas, and whether or not points +are in no areas and assigned to the nearest one. +} +\description{ +For any points not in an area, they are assigned the nearest area using +fields::fields.rdist.near or fields::rdist depending on the number of points +and the maximum memory in bytes with a warning. +} +\details{ +delta and mean.neighbor arguments only used when some points +are not in areas, perhaps due to inconsistencies in shapefiles. +} +\examples{ +\dontrun{ +# download Kenya GADM shapefiles from SUMMERdata github repository +githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +download.file(githubURL,"kenyaMaps.rda") + +# load it in +load("kenyaMaps.rda") + +# use the shapefile data to see what Admin1 and 2 areas the +# points (0, 37) and (0.5, 38) are in +# (these are longitude/latitude coordinates) +pts = cbind(c(37, 38), c(0, .5)) +head(slot(adm1, "data")) +admin1Areas = getAreaName(pts, adm1, "NAME_1") +admin2Areas = getAreaName(pts, adm2, "NAME_2") +} + +} +\seealso{ +\code{\link{projKenya}}, \code{\link[fields]{fields.rdist.near}} +} +\author{ +John Paige +} diff --git a/man/kenyaPopulationData.Rd b/man/kenyaPopulationData.Rd new file mode 100755 index 0000000..f3cb615 --- /dev/null +++ b/man/kenyaPopulationData.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kenyaPopulationData.R +\docType{data} +\name{kenyaPopulationData} +\alias{kenyaPopulationData} +\alias{easpaKenya} +\alias{easpaKenyaNeonatal} +\alias{poppaKenya} +\alias{poppsubKenya} +\title{Kenya 2009 Census Frame and Related Datasets} +\format{ +A number of data.frames with information about the 2009 Kenya Population and Housing Census and the population in Kenya at the time of the 2014 Demographic Health Survey. Some of the +data.frames have been adjusted to contain information about neonatals born from 2010-2014 rather than general population in 2014. +The dataset names are: easpaKenya, easpaKenyaNeonatal, +poppaKenya, and poppsubKenya. + +An object of class \code{data.frame} with 47 rows and 12 columns. + +An object of class \code{data.frame} with 47 rows and 6 columns. + +An object of class \code{data.frame} with 300 rows and 7 columns. +} +\source{ + +} +\usage{ +data(kenyaPopulationData) + +easpaKenyaNeonatal + +poppaKenya + +poppsubKenya +} +\description{ +Datasets related to the 2009 census frame for Kenya based on the 2009 Kenya Population and Housing +Census. General population totals are estimated for 2014. Based on 2014 population density +estimates interpolated with exponential growth rate between 2010 and 2015 from WorldPop data. +} +\references{ +Kenya National Bureau of Statistics, Ministry of Health/Kenya, National AIDS Control Council/Kenya, Kenya Medical Research Institute, and National Council For Population And Development/Kenya, 2015. Kenya Demographic and Health Survey 2014. Rockville, Maryland, USA. URL: http://dhsprogram.com/pubs/pdf/FR308/FR308.pdf. + +Stevens, F.R., Gaughan, A.E., Linard, C., Tatem, A.J., 2015. Disaggregat- ing census data for population mapping using random forests with remotely-sensed and ancillary data. PloS One 10, e0107042. + +Tatem, A.J., 2017. WorldPop, open data for spatial demography. Scientific Data 4. +} +\keyword{datasets} diff --git a/man/logitNormMean.Rd b/man/logitNormMean.Rd new file mode 100755 index 0000000..a596339 --- /dev/null +++ b/man/logitNormMean.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/logitNormMean.R +\name{logitNormMean} +\alias{logitNormMean} +\title{Calculate the mean of a distribution whose +logit is Gaussian} +\usage{ +logitNormMean(muSigmaMat, logisticApprox = FALSE, ...) +} +\arguments{ +\item{muSigmaMat}{An n x 2 matrix where each row is \eqn{\mu}{mu} and \eqn{\sigma}{sigma} +on the logit scale for an independent random variable.} + +\item{logisticApprox}{Whether or not to use logistic approximation to speed +up computation. See details for more information.} + +\item{...}{More arguments, passed to \code{integrate} function} +} +\value{ +A vector of expectations of the specified random variables +} +\description{ +Adapted from logitnorm package. Calculates the mean of a distribution whose +logit is Gaussian. Each row of muSigmaMat is a mean and standard deviation +on the logit scale. +} +\details{ +If \eqn{\mbox{logit}(Y) \sim N(\mu, \sigma^2)}{logit(Y) ~ N(mu, sigma^2)}, +This function calculates \eqn{E[Y]}{E[Y]} via either numerical integration or by +assuming that Y follows a logistic distribution. Under this approximation, setting +\eqn{k = 16 \sqrt(3) / (15 \pi)}{k = 16 * sqrt(3) / (15 * pi)}, we approximate +the expectation as: +\deqn{E[Y] = expit(\mu / \sqrt(1 + k^2 \sigma^2))}{E[Y] = expit(mu / sqrt(1 + k^2 * sigma^2))}. +The above logistic approximation speeds up the computation, but also sacrifices +some accuracy. +} +\examples{ +mus = c(-5, 0, 5) +sigmas = rep(1, 3) +logitNormMean(cbind(mus, sigmas)) +logitNormMean(cbind(mus, sigmas), TRUE) + +} +\author{ +John Paige +} diff --git a/man/makePopIntegrationTab.Rd b/man/makePopIntegrationTab.Rd new file mode 100755 index 0000000..3fd6086 --- /dev/null +++ b/man/makePopIntegrationTab.Rd @@ -0,0 +1,368 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/popGrid.R +\name{makePopIntegrationTab} +\alias{makePopIntegrationTab} +\alias{getPoppsub} +\alias{adjustPopMat} +\title{Generating pixellated populations, and population frames} +\usage{ +makePopIntegrationTab( + kmRes = 5, + pop, + domainMapDat, + eastLim, + northLim, + mapProjection, + areaMapDat, + subareaMapDat, + areaNameVar = "NAME_1", + subareaNameVar = "NAME_2", + poppa = NULL, + poppsub = NULL, + stratifyByUrban = TRUE, + areapa = NULL, + areapsub = NULL, + customSubsetPolygons = NULL, + areaPolygonSubsetI = NULL, + subareaPolygonSubsetI = NULL, + mean.neighbor = 50, + delta = 0.1, + returnPoppTables = FALSE, + setNAsToZero = TRUE, + fixZeroPopDensitySubareas = FALSE, + extractMethod = "bilinear" +) + +getPoppsub( + kmRes = 1, + pop, + domainMapDat, + eastLim, + northLim, + mapProjection, + poppa, + areapa = NULL, + areapsub, + subareaMapDat, + subareaNameVar = "NAME_2", + stratifyByUrban = TRUE, + areaMapDat = NULL, + areaNameVar = "NAME_1", + areaPolygonSubsetI = NULL, + subareaPolygonSubsetI = NULL, + customSubsetPolygons = NULL, + mean.neighbor = 50, + delta = 0.1, + setNAsToZero = TRUE, + fixZeroPopDensitySubareas = FALSE +) + +adjustPopMat( + popMat, + poppaTarget = NULL, + adjustBy = c("area", "subarea"), + stratifyByUrban = TRUE +) +} +\arguments{ +\item{kmRes}{The resolution of the pixelated grid in km} + +\item{pop}{Population density raster} + +\item{domainMapDat}{A shapefile representing the full spatial domain (e.g. country)} + +\item{eastLim}{Range in km easting over the spatial domain under the input projection} + +\item{northLim}{Range in km northing over the spatial domain under the input projection} + +\item{mapProjection}{A projection function taking longitude and latitude and returning easting and +northing in km. Or the inverse if inverse is set to TRUE. For example, +\code{\link{projKenya}}. Check https://epsg.io/ for example for best projection EPSG codes +for specific countries} + +\item{areaMapDat}{SpatialPolygonsDataFrame object with area level map information} + +\item{subareaMapDat}{SpatialPolygonsDataFrame object with subarea level map information} + +\item{areaNameVar}{The name of the area variable associated with \code{areaMapDat@data} +and \code{subareaMapDat@data}} + +\item{subareaNameVar}{The name of the subarea variable associated with \code{subareaMapDat@data}} + +\item{poppa}{data.frame of population per area separated by urban/rural. If `poppsub` + is not included, this is used for normalization of populations associated with + population integration points. Contains variables: +\describe{ + \item{area}{name of area} + \item{popUrb}{total urban (general) population of area} + \item{popRur}{total rural (general) population of area} + \item{popTotal}{total (general) population of area} + \item{pctUrb}{percentage of population in the area that is urban (between 0 and 100)} +}} + +\item{poppsub}{data.frame of population per subarea separated by +urban/rural using for population normalization or urbanicity +classification. Often based on extra fine scale population density grid. +Has variables: +\describe{ + \item{subarea}{name of subarea} + \item{area}{name of area} + \item{popUrb}{total urban (general) population of subarea} + \item{popRur}{total rural (general) population of subarea} + \item{popTotal}{total (general) population of subarea} + \item{pctUrb}{percentage of population in the subarea that is urban (between 0 and 100)} +}} + +\item{stratifyByUrban}{Whether to stratify the pixellated grid by urban/rural. If TRUE, +renormalizes population densities within areas or subareas crossed with urban/rural} + +\item{areapa}{A list with variables: +\describe{ + \item{area}{name of area} + \item{spatialArea}{spatial area of the subarea (e.g. in km^2)} +}} + +\item{areapsub}{A list with variables: +\describe{ + \item{subarea}{name of subarea} + \item{spatialArea}{spatial area of the subarea (e.g. in km^2)} +}} + +\item{customSubsetPolygons}{'SpatialPolygonsDataFrame' or 'SpatialPolygons' object to subset +the grid over. This option can help reduce computation time relative to +constructing the whole grid and subsetting afterwards. `areaPolygonSubsetI` or +`subareaPolygonSubsetI` can be used when subsetting by areas or subareas in +`areaMapDat` or `subareaMapDat`. Must be in latitude/longitude projection "EPSG:4326"} + +\item{areaPolygonSubsetI}{Index in areaMapDat for a specific area to subset the grid over. This +option can help reduce computation time relative to constructing the whole grid +and subsetting afterwards} + +\item{subareaPolygonSubsetI}{FOR EXPERIMENTAL PURPOSES ONLY. Index in subareaMapDat for a +specific area to subset the grid over. This +option can help reduce computation time relative to constructing the whole grid +and subsetting afterwards} + +\item{mean.neighbor}{For determining what area or subarea points are nearest to if they do not +directly fall into an area. See \code{\link[fields]{fields.rdist.near}} for details.} + +\item{delta}{For determining what area or subarea points are nearest to if they do not +directly fall into an area. See \code{\link[fields]{fields.rdist.near}} for details.} + +\item{returnPoppTables}{If TRUE, poppa and poppsub will be calculated based on the generated +population integration matrix and input area/subarea map data} + +\item{setNAsToZero}{If TRUE, sets NA populations to 0.} + +\item{fixZeroPopDensitySubareas}{If TRUE, if population density in a subarea is estimated to be +zero, but the total population in the subarea is nonzero, population is filled into the +area uniformly} + +\item{extractMethod}{Either 'bilinear' or 'simple'. see `method` from +\code{\link[terra]{extract}}} + +\item{popMat}{Pixellated grid data frame with variables `area` and `pop` such as that +generated by \code{\link{makePopIntegrationTab}}} + +\item{poppaTarget}{Target population per area stratified by urban rural. Same format as poppa} + +\item{adjustBy}{Whether to adjust population density by the `area` or `subarea` level} +} +\description{ +Functions for generating pixellated population information and +population frames at the `area` and `subarea` levels. +The `area` and `subarea` levels can be thought of as big +regions and little regions, where areas can be partitioned into +unique sets of subareas. For example, Admin-1 and Admin-2 +areas might be areas and subareas respectively. The population totals are either +tabulated at the area x urban/rural level, the subarea x urban/rural +level, or at the pixel level of a specified resolution. Totals +are calculated using population density information, shapefiles, +and, possibly, preexisting population frames at different +areal levels. Note that area names should each be unique, and similarly for +subarea names. +} +\section{Functions}{ +\itemize{ +\item \code{makePopIntegrationTab()}: Generate pixellated `grid` of coordinates (both longitude/latitude and east/north) +over spatial domain of the given resolution with associated population totals, areas, subareas, +and urban/rural levels. For very small areas that might not +otherwise have a grid point in them, a custom integration point is added at their +centroid. Sets urbanicity classifications by thresholding input population density raster +using area and subarea population tables, and generates area and subarea population +tables from population density information if not already given. Can be used for integrating +predictions from the given coordinates to area and subarea levels using population weights. + +\item \code{getPoppsub()}: Generate table of estimates of population +totals per subarea x urban/rural combination based on population density +raster at `kmres` resolution "grid", including custom integration points +for any subarea too small to include grid points at their centroids. + +\item \code{adjustPopMat()}: Adjust population densities in grid based on a population frame. + +}} +\examples{ +\dontrun{ +# download Kenya GADM shapefiles from SUMMERdata github repository +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "kenyaMaps.rda?raw=true") +tempDirectory = "~/" +mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +if(!file.exists(mapsFilename)) { + download.file(githubURL,mapsFilename) +} + +# load it in +out = load(mapsFilename) +out +adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" + +# some Admin-2 areas have the same name +adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" + +# The spatial area of unknown 8 is so small, it causes problems unless its removed or +# unioned with another subarea. Union it with neighboring Kakeguria: +newadm2 = adm2 +unknown8I = which(newadm2$NAME_2 == "unknown 8") +newadm2$NAME_2[newadm2$NAME_2 \%in\% c("unknown 8", "Kapenguria")] <- + "Kapenguria + unknown 8" +admin2.IDs <- newadm2$NAME_2 + +library(maptools) +temp <- unionSpatialPolygons(newadm2, admin2.IDs) +tempData = newadm2@data[-unknown8I,] +tempData = tempData[order(tempData$NAME_2),] +newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +adm2 = newadm2 + +# download 2014 Kenya population density and associated TIF file +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "Kenya2014Pop/pop.rda?raw=true") +popFilename = paste0(tempDirectory, "/pop.rda") +if(!file.exists(popFilename)) { + download.file(githubURL,popFilename) +} + +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "Kenya2014Pop/worldpop_total_1y_2014_00_00.tif?raw=true") +popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +if(!file.exists(popTIFFilename)) { + download.file(githubURL,popTIFFilename) +} + +# load it in +require(terra) +out = load(popFilename) +out + +# make sure this is correct for re-projections +pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") + +eastLim = c(-110.6405, 832.4544) +northLim = c(-555.1739, 608.7130) + +## Construct poppsubKenya, a table of urban/rural general population totals +## in each subarea. Technically, this is not necessary since we can load in +## poppsubKenya via data(kenyaPopulationData). First, we will need to calculate +## the areas in km^2 of the areas and subareas + +library(rgdal) +library(sp) + +# use Lambert equal area projection of areas (Admin-1) and subareas (Admin-2) +midLon = mean(adm1@bbox[1,]) +midLat = mean(adm1@bbox[2,]) +p4s = paste0("+proj=laea +x_0=0 +y_0=0 +lon_0=", midLon, + " +lat_0=", midLat, " +units=km") + +library(rgdal) + +adm1proj <- spTransform(adm1, CRS(p4s)) +adm2proj <- spTransform(adm2, CRS(p4s)) + +# now calculate spatial area in km^2 +library(rgeos) +admin1Areas = gArea(adm1proj, TRUE) +admin2Areas = gArea(adm2proj, TRUE) +areapaKenya = data.frame(area=adm1proj@data$NAME_1, spatialArea=admin1Areas) +areapsubKenya = data.frame(area=adm2proj@data$NAME_1, subarea=adm2proj@data$NAME_2, + spatialArea=admin2Areas) + +# Calculate general population totals at the subarea (Admin-2) x urban/rural +# level and using 1km resolution population grid. Assign urbanicity by +# thresholding population density based on estimated proportion population +# urban/rural, making sure total area (Admin-1) urban/rural populations in +# each area matches poppaKenya. +require(fields) +# NOTE: the following function will typically take ~20 minutes. Can speed up +# by setting kmRes to be higher, but we recommend fine resolution for +# this step, since it only needs to be done once. Instead of running this, +# you can simply run data(kenyaPopulationData) +system.time(poppsubKenya <- getPoppsub( + kmRes=1, pop=pop, domainMapDat=adm0, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, areapa=areapaKenya, areapsub=areapsubKenya, + areaMapDat=adm1, subareaMapDat=adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +# Now generate a general population integration table at 5km resolution, +# based on subarea (Admin-2) x urban/rural population totals. This takes +# ~1 minute +system.time(popMatKenya <- makePopIntegrationTab( + kmRes=5, pop=pop, domainMapDat=adm0, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, poppsub=poppsubKenya, + areaMapDat = adm1, subareaMapDat = adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +## Adjust popMat to be target (neonatal) rather than general population density. First +## creat the target population frame +## (these numbers are based on IPUMS microcensus data) +mothersPerHouseholdUrb = 0.3497151 +childrenPerMotherUrb = 1.295917 +mothersPerHouseholdRur = 0.4787696 +childrenPerMotherRur = 1.455222 +targetPopPerStratumUrban = easpaKenya$HHUrb * mothersPerHouseholdUrb * childrenPerMotherUrb +targetPopPerStratumRural = easpaKenya$HHRur * mothersPerHouseholdRur * childrenPerMotherRur +easpaKenyaNeonatal = easpaKenya +easpaKenyaNeonatal$popUrb = targetPopPerStratumUrban +easpaKenyaNeonatal$popRur = targetPopPerStratumRural +easpaKenyaNeonatal$popTotal = easpaKenyaNeonatal$popUrb + easpaKenyaNeonatal$popRur +easpaKenyaNeonatal$pctUrb = 100 * easpaKenyaNeonatal$popUrb / easpaKenyaNeonatal$popTotal +easpaKenyaNeonatal$pctTotal = + 100 * easpaKenyaNeonatal$popTotal / sum(easpaKenyaNeonatal$popTotal) + +# Generate the target population density by scaling the current population density grid +# at the Admin1 x urban/rural level +popMatKenyaNeonatal = adjustPopMat(popMatKenya, easpaKenyaNeonatal) + +# Generate neonatal population table from the neonatal population integration matrix. +# This is technically not necessary for population simulation purposes, but is here +# for illustrative purposes +poppsubKenyaNeonatal = poppRegionFromPopMat(popMatKenyaNeonatal, popMatKenyaNeonatal$subarea) +poppsubKenyaNeonatal = cbind(subarea=poppsubKenyaNeonatal$region, + area=adm2@data$NAME_1[match(poppsubKenyaNeonatal$region, + adm2@data$NAME_2)], + poppsubKenyaNeonatal[,-1]) +print(head(poppsubKenyaNeonatal)) +} +} +\seealso{ +\code{\link{setThresholdsByRegion}}, \code{\link{poppRegionFromPopMat}}, \code{\link{simPopSPDE}}, \code{\link{simPopCustom}} +} +\author{ +John Paige +} diff --git a/man/poppRegionFromPopMat.Rd b/man/poppRegionFromPopMat.Rd new file mode 100755 index 0000000..12280b0 --- /dev/null +++ b/man/poppRegionFromPopMat.Rd @@ -0,0 +1,134 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/popGrid.R +\name{poppRegionFromPopMat} +\alias{poppRegionFromPopMat} +\title{Generate a population frame of a similar format to poppa argument of \code{\link{simPopCustom}} with a custom set of regions} +\usage{ +poppRegionFromPopMat(popMat, regions) +} +\arguments{ +\item{popMat}{Pixellated grid data frame with variables `area` and `pop`. Assumed to be stratified by urban/rural} + +\item{regions}{character vector of length nPixels giving a custom set of regions for which to generate +a population frame using population density} +} +\value{ +A table of population totals by region +} +\description{ +Generate a population frame of a similar format to poppa argument of \code{\link{simPopCustom}} with a custom set of regions +} +\details{ +Urbanicity thresholds are set based on that region's percent population +urban. Intended as a helper function of \code{\link{getPoppsub}}, but +can also be used for custom sets of regions (i.e. more than just 2 +areal levels: area and subarea). +} +\examples{ +\dontrun{ +data(kenyaPopulationData) + +#' # download Kenya GADM shapefiles from SUMMERdata github repository +githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +tempDirectory = "~/" +mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +if(!file.exists(mapsFilename)) { + download.file(githubURL,mapsFilename) +} + +# load it in +out = load(mapsFilename) +out +adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" + +# some Admin-2 areas have the same name +adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" + +# The spatial area of unknown 8 is so small, it causes problems unless +# its removed or unioned with another subarea. Union it with neighboring +# Kakeguria: +newadm2 = adm2 +unknown8I = which(newadm2$NAME_2 == "unknown 8") +newadm2$NAME_2[newadm2$NAME_2 \%in\% c("unknown 8", "Kapenguria")] <- "Kapenguria + unknown 8" +admin2.IDs <- newadm2$NAME_2 + +library(maptools) +temp <- unionSpatialPolygons(newadm2, admin2.IDs) +tempData = newadm2@data[-unknown8I,] +tempData = tempData[order(tempData$NAME_2),] +newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +adm2 = newadm2 + +# download 2014 Kenya population density and associated TIF file +githubURL <- + "https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/pop.rda?raw=true" +popFilename = paste0(tempDirectory, "/pop.rda") +if(!file.exists(popFilename)) { + download.file(githubURL,popFilename) +} + +githubURL <- + paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/", + "worldpop_total_1y_2014_00_00.tif?raw=true") +popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +if(!file.exists(popTIFFilename)) { + download.file(githubURL,popTIFFilename) +} + +# load it in +require(terra) +out = load(popFilename) +out + +# make sure this is correct for re-projections +pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") + +eastLim = c(-110.6405, 832.4544) +northLim = c(-555.1739, 608.7130) + +require(fields) + +# Now generate a general population integration table at 5km resolution, +# based on subarea (Admin-2) x urban/rural population totals. This takes +# ~1 minute +system.time(popMatKenya <- makePopIntegrationTab( + kmRes=5, pop=pop, domainMapDat=adm0, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, poppsub=poppsubKenya, + areaMapDat = adm1, subareaMapDat = adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +out = poppRegionFromPopMat(popMatKenya, popMatKenya$area) +out +poppaKenya + +out = poppRegionFromPopMat(popMatKenya, popMatKenya$subarea) +out +poppsubKenya + +popMatKenyaUnstratified = popMatKenya +popMatKenyaUnstratified$urban = NULL +out = poppRegionFromPopMat(popMatKenyaUnstratified, popMatKenyaUnstratified$area) +out +poppaKenya +} +} +\seealso{ +\code{\link{getPoppsub}} +} +\author{ +John Paige +} diff --git a/man/projKenya.Rd b/man/projKenya.Rd new file mode 100755 index 0000000..ca8a99a --- /dev/null +++ b/man/projKenya.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/projKenya.R +\name{projKenya} +\alias{projKenya} +\title{Map projection for Kenya} +\usage{ +projKenya(lon, lat = NULL, inverse = FALSE) +} +\arguments{ +\item{lon}{either longitude or, if inverse == TRUE, easting in km} + +\item{lat}{either latitude or, if inverse == TRUE, northing in km} + +\item{inverse}{if FALSE, projects from lon/lat to easting/northing. Else from easting/northing to lon/lat} +} +\value{ +A 2 column matrix of easting/northing coordinates in km if inverse == FALSE. Otherwise, a 2 column matrix of longitude/latitude coordinates. +} +\description{ +Projection specifically chosen for Kenya. Project from lat/lon to northing/easting +in kilometers. Uses epsg=21097 with km units. May not work on all systems due to +differences in the behavior between different PROJ and GDAL versions. +} +\examples{ +eastLim = c(-110.6405, 832.4544) +northLim = c(-555.1739, 608.7130) +coordMatrixEN = cbind(eastLim, northLim) +coordMatrixLL = projKenya(coordMatrixEN, inverse=TRUE) + +coordMatrixLL +# if the coordMatrixLL isn't the following, projKenya may not support +# your installation of GDAL and/or PROJ: +# east north +# [1,] 33.5 -5.0 +# [2,] 42.0 5.5 + +projKenya(coordMatrixLL, inverse=FALSE) +# regardless of your PROJ/GDAL installations, the result of the +# above line of could should be: +# lon lat +# [1,] -110.6405 -555.1739 +# [2,] 832.4544 608.7130 + +} +\author{ +John Paige +} diff --git a/man/setThresholdsByRegion.Rd b/man/setThresholdsByRegion.Rd new file mode 100755 index 0000000..61b197f --- /dev/null +++ b/man/setThresholdsByRegion.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/popGrid.R +\name{setThresholdsByRegion} +\alias{setThresholdsByRegion} +\title{Set thresholds of population density for urbanicity classifications +within each region of the given type} +\usage{ +setThresholdsByRegion(popMat, poppr, regionType = "area") +} +\arguments{ +\item{popMat}{pixellated population density data frame with variables +regionType and `pop`} + +\item{poppr}{A table with population totals by region of the given type +(e.g. poppa or poppsub from \code{\link{makePopIntegrationTab}})} + +\item{regionType}{The variable name from poppr giving the region names. +Defaults to "area"} +} +\value{ +A list of region names and their urbanicity thresholds in population density +} +\description{ +Set thresholds of population density for urbanicity classifications +within each region of the given type +} +\details{ +Thresholds are set based on that region's percent population +urban. Intended as a helper function of \code{\link{makePopIntegrationTab}}. +} +\examples{ +\dontrun{ +data(kenyaPopulationData) + +#' # download Kenya GADM shapefiles from SUMMERdata github repository +githubURL <- "https://github.com/paigejo/SUMMERdata/blob/main/data/kenyaMaps.rda?raw=true" +tempDirectory = "~/" +mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +if(!file.exists(mapsFilename)) { + download.file(githubURL,mapsFilename) +} + +# load it in +out = load(mapsFilename) +out +adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" + +# some Admin-2 areas have the same name +adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" + +# The spatial area of unknown 8 is so small, it causes problems unless +# its removed or unioned with another subarea. Union it with neighboring +# Kakeguria: +newadm2 = adm2 +unknown8I = which(newadm2$NAME_2 == "unknown 8") +newadm2$NAME_2[newadm2$NAME_2 \%in\% c("unknown 8", "Kapenguria")] <- "Kapenguria + unknown 8" +admin2.IDs <- newadm2$NAME_2 + +library(maptools) +temp <- unionSpatialPolygons(newadm2, admin2.IDs) +tempData = newadm2@data[-unknown8I,] +tempData = tempData[order(tempData$NAME_2),] +newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +adm2 = newadm2 + +# download 2014 Kenya population density and associated TIF file +githubURL <- + "https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/pop.rda?raw=true" +popFilename = paste0(tempDirectory, "/pop.rda") +if(!file.exists(popFilename)) { + download.file(githubURL,popFilename) +} + +githubURL <- + paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/Kenya2014Pop/", + "worldpop_total_1y_2014_00_00.tif?raw=true") +popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +if(!file.exists(popTIFFilename)) { + download.file(githubURL,popTIFFilename) +} + +# load it in +require(terra) +out = load(popFilename) +out + +# make sure this is correct for re-projections +pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") + +eastLim = c(-110.6405, 832.4544) +northLim = c(-555.1739, 608.7130) + +require(fields) + +# Now generate a general population integration table at 5km resolution, +# based on subarea (Admin-2) x urban/rural population totals. This takes +# ~1 minute +system.time(popMatKenya <- makePopIntegrationTab( + kmRes=5, pop=pop, domainMapDat=adm0, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, poppsub=poppsubKenya, + areaMapDat = adm1, subareaMapDat = adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +out = setThresholdsByRegion(popMatKenya, poppaKenya) +out + +out = setThresholdsByRegion(popMatKenya, poppsubKenya, regionType="subarea") +out +} + +} +\seealso{ +\code{\link{makePopIntegrationTab}} +} +\author{ +John Paige +} diff --git a/man/simPop.Rd b/man/simPop.Rd new file mode 100755 index 0000000..249f25e --- /dev/null +++ b/man/simPop.Rd @@ -0,0 +1,447 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simPop.R +\name{simPop} +\alias{simPop} +\alias{simPopSPDE} +\alias{simPopCustom} +\title{Simulate populations and areal prevalences} +\usage{ +simPopSPDE( + nsim = 1, + easpa, + popMat, + targetPopMat, + poppsub, + spdeMesh, + margVar = 0.243, + sigmaEpsilon = sqrt(0.463), + gamma = 0.009, + effRange = 406.51, + beta0 = -3.922, + seed = NULL, + inla.seed = -1L, + nHHSampled = 25, + stratifyByUrban = TRUE, + subareaLevel = TRUE, + doFineScaleRisk = FALSE, + doSmoothRisk = FALSE, + doSmoothRiskLogisticApprox = FALSE, + min1PerSubarea = TRUE +) + +simPopCustom( + logitRiskDraws, + sigmaEpsilonDraws, + easpa, + popMat, + targetPopMat, + stratifyByUrban = TRUE, + validationPixelI = NULL, + validationClusterI = NULL, + clustersPerPixel = NULL, + doFineScaleRisk = FALSE, + doSmoothRisk = FALSE, + doSmoothRiskLogisticApprox = FALSE, + poppsub = NULL, + subareaLevel = FALSE, + min1PerSubarea = TRUE, + returnEAinfo = FALSE, + epsc = NULL +) +} +\arguments{ +\item{nsim}{Number of simulations} + +\item{easpa}{data.frame of enumeration area, households, and target population per area stratified by urban/rural with variables: +\describe{ + \item{area}{name of area} + \item{EAUrb}{number of urban enumeration areas in the area} + \item{EARur}{number of rural enumeration areas in the area} + \item{EATotal}{total number of enumeration areas in the area} + \item{HHUrb}{number of urban households in the area} + \item{HHRur}{number of rural households in the area} + \item{HHTotal}{total number of households in the area} + \item{popUrb}{total urban (target) population of area} + \item{popRur}{total rural (target) population of area} + \item{popTotal}{total (general) population of area} +}} + +\item{popMat}{Pixellated grid data frame with variables `lon`, `lat`, `pop`, `area`, `subareas` (if subareaLevel is TRUE), `urban` (if stratifyByUrban is TRUE), `east`, and `north`} + +\item{targetPopMat}{Same as popMat, but `pop` variable gives target rather than general population} + +\item{poppsub}{data.frame of population per subarea separated by +urban/rural using for population density grid normalization or urbanicity +classification. Often based on extra fine scale population density grid. Has variables:} + +\item{spdeMesh}{Triangular mesh for the SPDE} + +\item{margVar}{Marginal variance of the spatial process, excluding cluster effects. +If 0, no spatial component is included} + +\item{sigmaEpsilon}{Standard deviation on the logit scale for iid Gaussian EA level random effects in the risk model} + +\item{gamma}{Effect of urban on logit scale for logit model for risk} + +\item{effRange}{Effective spatial range for the SPDE model} + +\item{beta0}{Intercept of logit model for risk} + +\item{seed}{Random number generator seed} + +\item{inla.seed}{Seed input to inla.qsample. 0L sets seed intelligently, +> 0 sets a specific seed, < 0 keeps existing RNG} + +\item{nHHSampled}{Number of households sampled per enumeration area. Default is 25 to match DHS surveys} + +\item{stratifyByUrban}{Whether or not to stratify simulations by urban/rural classification} + +\item{subareaLevel}{Whether or not to aggregate the population by subarea} + +\item{doFineScaleRisk}{Whether or not to calculate the fine scale risk at each aggregation level in addition to the prevalence} + +\item{doSmoothRisk}{Whether or not to calculate the smooth risk at each aggregation level in addition to the prevalence} + +\item{doSmoothRiskLogisticApprox}{Whether to use logistic approximation when calculating smooth risk. See +\code{\link{logitNormMean}} for details.} + +\item{min1PerSubarea}{If TRUE, ensures there is at least 1 EA per subarea. If subareas are particularly unlikely to +have enumeration areas since they have a very low proportion of the population in an area, then setting this to TRUE may be +computationally intensive.} + +\item{logitRiskDraws}{nIntegrationPoints x nsim dimension matrix of draws from the pixel leve risk field on logit scale, leaving out +potential nugget/cluster/EA level effects.} + +\item{sigmaEpsilonDraws}{nsim length vector of draws of cluster effect logit scale SD (joint draws with logitRiskDraws)} + +\item{validationPixelI}{CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of pixels for which we want to simulate populations (used for pixel level validation)} + +\item{validationClusterI}{CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of cluster for which we want to simulate populations (used for cluster level validation)} + +\item{clustersPerPixel}{CURRENTLY FOR TESTING PURPOSES ONLY Used for pixel level validation. Fixes the number of EAs per pixel.} + +\item{returnEAinfo}{If TRUE, returns information on every individual EA (BAU) for each simulated population} + +\item{epsc}{nEAs x nsim matrix of simulated EA (BAU) level iid effects representing fine scale variation in + risk. If NULL, they are simulated as iid Gaussian on a logit scale with + SD given by sigmaEpsilonDraws +list(pixelPop=outPixelLevel, subareaPop=outSubareaLevel, areaPop=outAreaLevel, logitRiskDraws=logitRiskDraws)} +} +\value{ +The simulated population aggregated to the enumeration area, +pixel, subarea (generally Admin2), and area (generally Admin1) levels. Output includes: +\item{pixelPop}{A list of pixel level population aggregates} +\item{subareaPop}{A list of `subarea` level population aggregates} +\item{areaPop}{A list of `area` level population aggregates} +Each of these contains population numerator and denominator as well as prevalence and risk +information aggregated to the appropriate level. +} +\description{ +Given a spatial risk model, simulate populations and population prevalences at the +enumeration area level (represented as points), and aggregate to the pixel and +administrative areal level. +} +\details{ +For population simulation and aggregation, we consider three models: smooth +risk, fine scale risk, and the fine scale prevalence. All will be described in detail +in a paper in preparation. In the smooth risk model, pixel level risks are integrated +with respect to target population density when producing areal estimates on a prespecified +set of integration points. The target population may be, for example, neonatals rather +than the general population. In the fine scale models, enumeration areas (EAs) are simulated as +point locations and iid random effects in the EA level risk are allowed. EAs and populations are dispersed conditional on the (possibly +approximately) known number of EAs, households, and target population at a particular +areal level (these we call `areas`) using multilevel multinomial sampling, first +sampling the EAs, then distributing households among the EAs, then the target population +among the households. Any areal level below the `areas` we call `subareas`. For instance, +the `areas` might be Admin-1 if that is the smallest level at which the number of EAs, +households, and people is known, and the `subareas` might be Admin-2. The multilevel +multinomial sampling may be stratified by urban/rural within the areas if the number of +EAs, households, and people is also approximately known at that level. + +Within each EA we assume a fixed probability of an event occurring, which is the fine scale `risk`. +The fine scale `prevalence` is the empirical proportion of events within that EA. We assume EA +level logit scale iid N(0, sigmaEpsilon^2) random effects in the risk model. When averaged +with equal weights over all EAs in an areal unit, this forms the fine scale risk. When +instead the population numerators and denominators are aggregated, and are used to +calculate the empirical proportion of events occurring in an areal unit, the resulting +quantity is the fine scale prevalence in that areal unit. + +Note that these functions can be used for either simulating populations for simulation +studies, or for generating predictions accounting for uncertainty in EA locations +and fine scale variation occuring at the EA level due to EA level iid random effects. +Required, however, is a seperately fit EA level spatial risk model +and information on the spatial population density and the population frame. +} +\section{Functions}{ +\itemize{ +\item \code{simPopSPDE()}: Simulate populations and population prevalences given census frame and population density +information. Uses SPDE model for generating spatial risk and can include iid cluster +level effect. + +\item \code{simPopCustom()}: Simulate populations and population prevalences given census frame and population density +information. Uses custom spatial logit risk function and can include iid cluster +level effect. + +}} +\examples{ +\dontrun{ +## In this script we will create 5km resolution pixellated grid over Kenya, +## and generate tables of estimated (both target and general) population +## totals at the area (e.g. Admin-1) and subarea (e.g. Admin-2) levels. Then +## we will use that to simulate populations of + +# download Kenya GADM shapefiles from SUMMERdata github repository +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "kenyaMaps.rda?raw=true") +tempDirectory = "~/" +mapsFilename = paste0(tempDirectory, "/kenyaMaps.rda") +if(!file.exists(mapsFilename)) { + download.file(githubURL,mapsFilename) +} + +# load it in +out = load(mapsFilename) +out +adm1@data$NAME_1 = as.character(adm1@data$NAME_1) +adm1@data$NAME_1[adm1@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm1@data$NAME_1[adm1@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" +adm2@data$NAME_1 = as.character(adm2@data$NAME_1) +adm2@data$NAME_1[adm2@data$NAME_1 == "Trans Nzoia"] = "Trans-Nzoia" +adm2@data$NAME_1[adm2@data$NAME_1 == "Elgeyo-Marakwet"] = "Elgeyo Marakwet" + +# some Admin-2 areas have the same name +adm2@data$NAME_2 = as.character(adm2@data$NAME_2) +adm2@data$NAME_2[(adm2@data$NAME_1 == "Bungoma") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Bungoma" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Kakamega") & + (adm2@data$NAME_2 == "Lugari")] = "Lugari, Kakamega" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Meru") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Meru" +adm2@data$NAME_2[(adm2@data$NAME_1 == "Tharaka-Nithi") & + (adm2@data$NAME_2 == "Igembe South")] = "Igembe South, Tharaka-Nithi" + +# The spatial area of unknown 8 is so small, it causes problems unless its removed or +# unioned with another subarea. Union it with neighboring Kakeguria: +newadm2 = adm2 +unknown8I = which(newadm2$NAME_2 == "unknown 8") +newadm2$NAME_2[newadm2$NAME_2 \%in\% c("unknown 8", "Kapenguria")] <- + "Kapenguria + unknown 8" +admin2.IDs <- newadm2$NAME_2 + +library(maptools) +temp <- unionSpatialPolygons(newadm2, admin2.IDs) +tempData = newadm2@data[-unknown8I,] +tempData = tempData[order(tempData$NAME_2),] +newadm2 <- SpatialPolygonsDataFrame(temp, tempData, match.ID = F) +adm2 = newadm2 + +# download 2014 Kenya population density and associated TIF file +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "Kenya2014Pop/pop.rda?raw=true") +popFilename = paste0(tempDirectory, "/pop.rda") +if(!file.exists(popFilename)) { + download.file(githubURL,popFilename) +} + +githubURL <- paste0("https://github.com/paigejo/SUMMERdata/blob/main/data/", + "Kenya2014Pop/worldpop_total_1y_2014_00_00.tif?raw=true") +popTIFFilename = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") +if(!file.exists(popTIFFilename)) { + download.file(githubURL,popTIFFilename) +} + +# load it in +require(raster) +out = load(popFilename) +out + +# make sure this is correct for re-projections +pop@file@name = paste0(tempDirectory, "/worldpop_total_1y_2014_00_00.tif") + +eastLim = c(-110.6405, 832.4544) +northLim = c(-555.1739, 608.7130) + +## Construct poppsubKenya, a table of urban/rural general population totals +## in each subarea. Technically, this is not necessary since we can load in +## poppsubKenya via data(kenyaPopulationData). First, we will need to calculate +## the areas in km^2 of the areas and subareas + +library(rgdal) +library(sp) + +# use Lambert equal area projection of areas (Admin-1) and subareas (Admin-2) +midLon = mean(adm1@bbox[1,]) +midLat = mean(adm1@bbox[2,]) +p4s = paste0("+proj=laea +x_0=0 +y_0=0 +lon_0=", midLon, + " +lat_0=", midLat, " +units=km") + +library(rgdal) + +adm1proj <- spTransform(adm1, CRS(p4s)) +adm2proj <- spTransform(adm2, CRS(p4s)) + +# now calculate spatial area in km^2 +library(rgeos) +admin1Areas = gArea(adm1proj, TRUE) +admin2Areas = gArea(adm2proj, TRUE) +areapaKenya = data.frame(area=adm1proj@data$NAME_1, spatialArea=admin1Areas) +areapsubKenya = data.frame(area=adm2proj@data$NAME_1, subarea=adm2proj@data$NAME_2, + spatialArea=admin2Areas) + +# Calculate general population totals at the subarea (Admin-2) x urban/rural +# level and using 1km resolution population grid. Assign urbanicity by +# thresholding population density based on estimated proportion population +# urban/rural, making sure total area (Admin-1) urban/rural populations in +# each area matches poppaKenya. +require(fields) +# NOTE: the following function will typically take ~20 minutes. Can speed up +# by setting kmRes to be higher, but we recommend fine resolution for +# this step, since it only needs to be done once. +system.time(poppsubKenya <- getPoppsub( + kmRes=1, pop=pop, domainPoly=kenyaPoly, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, areapa=areapaKenya, areapsub=areapsubKenya, + areaMapDat=adm1, subareaMapDat=adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +# Now generate a general population integration table at 5km resolution, +# based on subarea (Admin-2) x urban/rural population totals. This takes +# ~1 minute +system.time(popMatKenya <- makePopIntegrationTab( + kmRes=5, pop=pop, domainPoly=kenyaPoly, + eastLim=eastLim, northLim=northLim, mapProjection=projKenya, + poppa = poppaKenya, poppsub=poppsubKenya, + areaMapDat = adm1, subareaMapDat = adm2, + areaNameVar = "NAME_1", subareaNameVar="NAME_2")) + +## Adjust popMat to be target (neonatal) rather than general population +## density. First create the target population frame +## (these numbers are based on IPUMS microcensus data) +mothersPerHouseholdUrb = 0.3497151 +childrenPerMotherUrb = 1.295917 +mothersPerHouseholdRur = 0.4787696 +childrenPerMotherRur = 1.455222 +targetPopPerStratumUrban = easpaKenya$HHUrb * mothersPerHouseholdUrb * + childrenPerMotherUrb +targetPopPerStratumRural = easpaKenya$HHRur * mothersPerHouseholdRur * + childrenPerMotherRur +easpaKenyaNeonatal = easpaKenya +easpaKenyaNeonatal$popUrb = targetPopPerStratumUrban +easpaKenyaNeonatal$popRur = targetPopPerStratumRural +easpaKenyaNeonatal$popTotal = easpaKenyaNeonatal$popUrb + + easpaKenyaNeonatal$popRur +easpaKenyaNeonatal$pctUrb = 100 * easpaKenyaNeonatal$popUrb / + easpaKenyaNeonatal$popTotal +easpaKenyaNeonatal$pctTotal = + 100 * easpaKenyaNeonatal$popTotal / sum(easpaKenyaNeonatal$popTotal) + +# Generate the target population density by scaling the current +# population density grid at the Admin1 x urban/rural level +popMatKenyaNeonatal = adjustPopMat(popMatKenya, easpaKenyaNeonatal) + +# Generate neonatal population table from the neonatal population integration +# matrix. This is technically not necessary for population simulation purposes, +# but is here for illustrative purposes +poppsubKenyaNeonatal = poppRegionFromPopMat(popMatKenyaNeonatal, + popMatKenyaNeonatal$subarea) +poppsubKenyaNeonatal = + cbind(subarea=poppsubKenyaNeonatal$region, + area=adm2@data$NAME_1[match(poppsubKenyaNeonatal$region, adm2@data$NAME_2)], + poppsubKenyaNeonatal[,-1]) +print(head(poppsubKenyaNeonatal)) + +## Now we're ready to simulate neonatal populations along with neonatal +## mortality risks and prevalences + +# use the following model to simulate the neonatal population based roughly +# on Paige et al. (2020) neonatal mortality modeling for Kenya. +beta0=-2.9 # intercept +gamma=-1 # urban effect +rho=(1/3)^2 # spatial variance +effRange = 400 # effective spatial range in km +sigmaEpsilon=sqrt(1/2.5) # cluster (nugget) effect standard deviation + +# Run a simulation! This produces multiple dense nEA x nsim and nPixel x nsim +# matrices. In the future sparse matrices and chunk by chunk computations +# may be incorporated. +simPop = simPopSPDE(nsim=1, easpa=easpaKenyaNeonatal, + popMat=popMatKenya, targetPopMat=popMatKenyaNeonatal, + poppsub=poppsubKenya, spdeMesh=kenyaMesh, + margVar=rho, sigmaEpsilon=sigmaEpsilon, + gamma=gamma, effRange=effRange, beta0=beta0, + seed=12, inla.seed=12, nHHSampled=25, + stratifyByUrban=TRUE, subareaLevel=TRUE, + doFineScaleRisk=TRUE, doSmoothRisk=TRUE, + min1PerSubarea=TRUE) + +# get average absolute percent error relative to fine scale prevalence at Admin-2 level +tempDat = simPop$subareaPop$aggregationResults[,c("region", "pFineScalePrevalence", + "pFineScaleRisk", "pSmoothRisk")] +100*mean(abs(tempDat$pFineScalePrevalence - tempDat$pFineScaleRisk) / + tempDat$pFineScalePrevalence) +100*mean(abs(tempDat$pFineScalePrevalence - tempDat$pSmoothRisk) / + tempDat$pFineScalePrevalence) +100*mean(abs(tempDat$pFineScaleRisk - tempDat$pSmoothRisk) / + tempDat$pFineScalePrevalence) + +# verify number of EAs per area and subarea +cbind(aggregate(simPop$eaPop$eaSamples[,1], by=list(area=popMatKenya$area), FUN=sum), + trueNumEAs=easpaKenya$EATotal[order(easpaKenya$area)]) +aggregate(simPop$eaPop$eaSamples[,1], by=list(area=popMatKenya$subarea), FUN=sum) + +## plot simulated population +# directory for plotting +# (mapPlot takes longer when it doesn't save to a file) +tempDirectory = "~/" + +# pixel level +zlim = c(0, quantile(probs=.995, c(simPop$pixelPop$pFineScalePrevalence, + simPop$pixelPop$pFineScaleRisk, + simPop$pixelPop$pSmoothRisk), na.rm=TRUE)) +pdf(file=paste0(tempDirectory, "simPopSPDEPixel.pdf"), width=8, height=8) +par(mfrow=c(2,2)) +plot(adm2, asp=1) +points(simPop$eaPop$eaDatList[[1]]$lon, simPop$eaPop$eaDatList[[1]]$lat, pch=".", col="blue") +plot(adm2, asp=1) +quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pFineScalePrevalence, + zlim=zlim, add=TRUE, FUN=function(x) {mean(x, na.rm=TRUE)}) +plot(adm2, asp=1) +quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pFineScaleRisk, + zlim=zlim, add=TRUE, FUN=function(x) {mean(x, na.rm=TRUE)}) +quilt.plot(popMatKenya$lon, popMatKenya$lat, simPop$pixelPop$pSmoothRisk, + zlim=zlim, FUN=function(x) {mean(x, na.rm=TRUE)}, asp=1) +plot(adm2, add=TRUE) +dev.off() + +range(simPop$eaPop$eaDatList[[1]]$N) + +# areal (Admin-1) level: these results should look essentially identical + +tempDat = simPop$areaPop$aggregationResults[,c("region", "pFineScalePrevalence", + "pFineScaleRisk", "pSmoothRisk")] +pdf(file=paste0(tempDirectory, "simPopSPDEAdmin-1.pdf"), width=7, height=7) +mapPlot(tempDat, + variables=c("pFineScalePrevalence", "pFineScaleRisk", "pSmoothRisk"), + geo=adm1, by.geo="NAME_1", by.data="region", is.long=FALSE) +dev.off() + +# subareal (Admin-2) level: these results should look subtley different +# depending on the type of prevalence/risk considered +tempDat = simPop$subareaPop$aggregationResults[,c("region", "pFineScalePrevalence", + "pFineScaleRisk", "pSmoothRisk")] +pdf(file=paste0(tempDirectory, "simPopSPDEAdmin-2.pdf"), width=7, height=7) +mapPlot(tempDat, + variables=c("pFineScalePrevalence", "pFineScaleRisk", "pSmoothRisk"), + geo=adm2, by.geo="NAME_2", by.data="region", is.long=FALSE) +dev.off() +} +} +\references{ +In preparation +} +\seealso{ +\code{\link{simPopCustom}}, \code{\link{makePopIntegrationTab}}, \code{\link{adjustPopMat}}, \code{\link{simSPDE}}. +} +\author{ +John Paige +} diff --git a/man/simPopInternal.Rd b/man/simPopInternal.Rd new file mode 100755 index 0000000..4932623 --- /dev/null +++ b/man/simPopInternal.Rd @@ -0,0 +1,217 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simPop.R +\name{simPopInternal} +\alias{simPopInternal} +\alias{getExpectedNperEA} +\alias{getSortIndices} +\alias{rStratifiedMultnomial} +\alias{rStratifiedMultnomialBySubarea} +\alias{rMyMultinomial} +\alias{rMyMultinomialSubarea} +\alias{rmultinom1} +\alias{sampleNMultilevelMultinomial} +\alias{sampleNMultilevelMultinomialFixed} +\title{Internal functions for population simulation} +\usage{ +getExpectedNperEA(easpa, popMat, level = c("grid", "EA"), pixelIndexMat = NULL) + +getSortIndices( + i, + urban = TRUE, + popMat, + stratifyByUrban = TRUE, + validationPixelI = NULL +) + +rStratifiedMultnomial(n, popMat, easpa, stratifyByUrban = TRUE) + +rStratifiedMultnomialBySubarea( + n, + popMat, + easpa, + stratifyByUrban = TRUE, + poppsub = NULL, + min1PerSubarea = TRUE, + minSample = 1 +) + +rMyMultinomial( + n, + i, + stratifyByUrban = TRUE, + urban = TRUE, + popMat = NULL, + easpa = NULL, + min1PerSubarea = FALSE, + method = c("mult1", "mult", "indepMH"), + minSample = 1 +) + +rMyMultinomialSubarea( + n, + i, + easpsub, + stratifyByUrban = TRUE, + urban = TRUE, + popMat = NULL +) + +rmultinom1( + n = 1, + size, + prob, + maxSize = 8000 * 8000, + method = c("mult1", "mult", "indepMH"), + verbose = FALSE, + minSample = 100, + maxExpectedSizeBeforeSwitch = 1000 * 1e+07, + init = NULL, + burnIn = floor(n/4), + filterEvery = 10, + zeroProbZeroSamples = TRUE, + allowSizeLessThanK = FALSE +) + +sampleNMultilevelMultinomial( + nDraws = ncol(pixelIndexMat), + pixelIndexMat = NULL, + urbanMat = NULL, + areaMat = NULL, + easpaList, + popMat, + stratifyByUrban = TRUE, + verbose = TRUE, + returnEAinfo = FALSE, + minHHPerEA = 25, + fixHHPerEA = NULL, + fixPopPerHH = NULL +) + +sampleNMultilevelMultinomialFixed( + clustersPerPixel, + nDraws = ncol(pixelIndices), + pixelIndices = NULL, + urbanVals = NULL, + areaVals = NULL, + easpa, + popMat, + stratifyByUrban = TRUE, + verbose = TRUE +) +} +\arguments{ +\item{easpa}{Census frame. See \code{\link{simPopCustom}} for details} + +\item{popMat}{data.frame of pixellated grid of population densities. See \code{\link{simPopCustom}} for details} + +\item{level}{Whether to calculate results at the integration grid or EA level} + +\item{pixelIndexMat}{Matrix of pixel indices associated with each EA and draw. Not +required by getExpectedNperEA unless level == "EA"} + +\item{i}{Index} + +\item{urban}{If TRUE, calculate only for urban part of the area. If FALSE, for only rural part} + +\item{stratifyByUrban}{whether or not to stratify calculations by urban/rural classification} + +\item{validationPixelI}{CURRENTLY FOR TESTING PURPOSES ONLY a set of indices of pixels for which we want to simulate populations (used for pixel level validation)} + +\item{n}{Number of samples} + +\item{poppsub}{Population per subarea. See \code{\link{simPopCustom}} for details} + +\item{min1PerSubarea}{Whether or not to ensure there is at least 1 EA per subarea. See \code{\link{simPopCustom}} for details} + +\item{minSample}{The minimum number of samples per `chunk` of samples for truncated multinomial sampling. Defaults to 1} + +\item{method}{If min1PerSubarea is TRUE, the sampling method for the truncated multinomial to use with rmulitnom1. rmultinom1 automatically + switches between them depending on the number of expected samples. The methods are: +\describe{ + \item{mult1}{rejection sampling from multinomial plus 1 in each category} + \item{mult}{rejection sampling from multinomial if any category has zero count} + \item{indepMH}{independent Metropolis-Hastings using multinomial plus 1 distribution as proposal} +}} + +\item{easpsub}{This could either be total EAs per subarea, or subarea crossed with urban or +rural if stratifyByUrban is TRUE} + +\item{size}{Multinomial size parameter. See \code{\link[stats]{rmultinom}}} + +\item{prob}{Multinomial probability vector parameter. See \code{\link[stats]{rmultinom}}} + +\item{maxSize}{The maximum number of elements in a matrix drawn from the proposal distribution per sample chunk.} + +\item{verbose}{Whether to print progress as the function proceeds} + +\item{maxExpectedSizeBeforeSwitch}{Max expected number of samples / k, the number of categories, before switching method} + +\item{init}{Initial sample if method is `indepMH`} + +\item{burnIn}{Number of initial samples before samples are collected if method is `indepMH`} + +\item{filterEvery}{Store only every filterEvery samples if method is i`indepMH`} + +\item{zeroProbZeroSamples}{If TRUE, set samples for parts of prob vector that are zero to zero. Otherwise they are set to one.} + +\item{allowSizeLessThanK}{If TRUE, then if size < the number of categories (k), returns matrix where each +column is vector of size ones and k - size zeros. If FALSE, throws an error if size < k} + +\item{nDraws}{Number of draws} + +\item{urbanMat}{Matrix of urbanicities associated with each EA and draw} + +\item{areaMat}{Matrix of areas associated with each EA and draw} + +\item{easpaList}{A list of length n with each element being of the format of easpa +giving the number of households and EAs +per stratum. It is assumed that the number of EAs per stratum is +the same in each list element. If easpaList is a data frame, +number of households per stratum is assumed constant} + +\item{returnEAinfo}{Whether a data frame at the EA level is desired} + +\item{minHHPerEA}{The minimum number of households per EA (defaults to 25, since +that is the number of households sampled per DHS cluster)} + +\item{fixHHPerEA}{If not NULL, the fixed number of households per EA} + +\item{fixPopPerHH}{If not NULL, the fixed target population per household} + +\item{clustersPerPixel}{CURRENTLY FOR TESTING PURPOSES ONLY a vector of length nIntegrationPoints specifying the number of clusters per pixel if they are fixed} + +\item{pixelIndices}{A nEA x n matrix of pixel indices associated with each EA per simulation/draw} + +\item{urbanVals}{A nEA x n matrix of urbanicities associated with each EA per simulation/draw} + +\item{areaVals}{A nEA x n matrix of area names associated with each EA per simulation/draw} +} +\description{ +Functions for calculating valuable quantities and for drawing from important +distributions for population simulation. +} +\section{Functions}{ +\itemize{ +\item \code{getExpectedNperEA()}: Calculates expected denominator per enumeration area. + +\item \code{getSortIndices()}: For recombining separate multinomials into the draws over all grid points + +\item \code{rStratifiedMultnomial()}: Gives nIntegrationPoints x n matrix of draws from the stratified multinomial with values +corresponding to the value of |C^g| for each pixel, g (the number of EAs/pixel) + +\item \code{rStratifiedMultnomialBySubarea()}: Gives nIntegrationPoints x n matrix of draws from the stratified multinomial with values + +\item \code{rMyMultinomial()}: + +\item \code{rMyMultinomialSubarea()}: + +\item \code{rmultinom1()}: Random (truncated) multinomial draws conditional on the number of each type being at least one + +\item \code{sampleNMultilevelMultinomial()}: Take multilevel multinomial draws first from joint distribution of +number of households per EA given the total per stratum, and then from the joint +distribution of the total target population per household given +the total per stratum + +\item \code{sampleNMultilevelMultinomialFixed()}: Same as sampleNMultilevelMultinomial, except the number of EAs per pixel is fixed + +}} diff --git a/man/simSPDE.Rd b/man/simSPDE.Rd new file mode 100755 index 0000000..4d836d2 --- /dev/null +++ b/man/simSPDE.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simSPDE.R +\name{simSPDE} +\alias{simSPDE} +\title{Simulate from the SPDE spatial model} +\usage{ +simSPDE( + coords, + nsim = 1, + mesh, + effRange = (max(coords[, 1]) - min(coords[, 1]))/3, + margVar = 1, + inla.seed = 0L +) +} +\arguments{ +\item{coords}{2 column matrix of spatial coordinates at which to simulate the spatial process} + +\item{nsim}{number of draws from the SPDE model} + +\item{mesh}{SPDE mesh} + +\item{effRange}{effective spatial range} + +\item{margVar}{marginal variance of the spatial process} + +\item{inla.seed}{seed input to inla.qsample. 0L sets seed intelligently, +> 0 sets a specific seed, < 0 keeps existing RNG} +} +\description{ +Generates nCoords x nsim matrix of simulated +values of the SPDE spatial process +} +\examples{ +\dontrun{ +set.seed(123) +require(INLA) +coords = matrix(runif(10*2), ncol=2) +mesh = inla.mesh.2d(loc.domain=cbind(c(0, 0, 1, 1), c(0, 1, 0, 1)), + n=3000, max.n=5000, max.edge=c(.01, .05), offset=-.1) +simVals = simSPDE(coords, nsim=1, mesh, effRange=.2, inla.seed=1L) +} + +} +\references{ +Lindgren, F., Rue, H., Lindström, J., 2011. An explicit link between Gaussian fields and Gaussian Markov random fields: the stochastic differential equation approach (with discussion). Journal of the Royal Statistical Society, Series B 73, 423–498. +} +\author{ +John Paige +}