Merge pull request #320 from USGS-R/develop

as for full 2016 data pull
DOI-USGS · Feb 5, 2017 · 0d1c963 · 0d1c963
2 parents dd5110a + d87ca83
commit 0d1c963
Show file tree

Hide file tree

Showing 146 changed files with 10,041 additions and 286 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: streamMetabolizer
 Type: Package
 Title: Models for Estimating Aquatic Photosynthesis and Respiration
-Version: 0.9.33
-Date: 2017-01-17
+Version: 0.9.38
+Date: 2017-02-03
 Author: Alison P. Appling, Robert O. Hall, Maite Arroita, and Charles B.
     Yackulic
 Authors@R: c(

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,23 @@
+# 0.9.36
+
+* Models should now be able to accept `tbl_df`s (dplyr/tibble format) for the
+`data` and `data_daily` arguments to `metab()`
+
+# 0.9.35
+
+* Bayesian models now distinguish between compilation time and fitting time
+
+* Updates to `plot_distribs` for recent changes to Bayesian models
+
+# 0.9.34
+
+* Bayesian models with `pool_K600 != 'none'` can how have their 
+`K600_daily_sigma` (or `K600_daily_sdlog`) be a fitted value, a fixed value, or 
+a value fixed at 0
+
 # 0.9.33
 
-* new function: `calc_light_merged`, which merges modeled and observed light
+* new function: `calc_light_merged`, which merges modeled and observed light 
 into a smooth curve
 
 # 0.9.29
@@ -10,7 +27,7 @@ model (and therefore also its `info` or `data_daily` slots)
 
 # 0.9.28
 
-* switched from `rlnorm` to `rnorm` for distribution of `K600_daily` around
+* switched from `rlnorm` to `rnorm` for distribution of `K600_daily` around 
 `K600_daily_pred` in linear and binned models
 
 # 0.9.27

diff --git a/R/calc_light_merged.R b/R/calc_light_merged.R
@@ -23,21 +23,19 @@
 #' @export
 #' @examples 
 #' \dontrun{
-#' library(mda.streams)
 #' library(dplyr)
-#' library(lubridate)
 #' library(ggplot2)
 #' library(unitted)
-#' coords <- get_site_coords('nwis_08062500')
-#' PAR.obs <- get_ts('par_calcSw', 'nwis_08062500') %>%
-#'   mutate(solar.time = convert_UTC_to_solartime(DateTime, coords$lon)) %>%
-#'     select(solar.time, light=par) %>%
-#'     subset(solar.time %within% interval(ymd("2008-03-10"), ymd("2008-03-14"), tz=tz(solar.time)) &
-#'           !(solar.time %within% interval(as.POSIXct("2008-03-12 10:00", tz=tz(solar.time)), 
-#'                                          as.POSIXct("2008-03-12 14:00", tz=tz(solar.time)))))
-#' PAR.mod <- u(data.frame(solar.time=seq(as.POSIXct('2008-03-11', tz=tz(PAR.obs$solar.time)),
-#'     as.POSIXct('2008-03-15', tz=tz(PAR.obs$solar.time)), by=as.difftime(10, units='mins'))) %>%
-#'   mutate(light=calc_light(u(solar.time), latitude=coords$lat, longitude=coords$lon)))
+#' timebounds <- as.POSIXct(c('2008-03-12 00:00', '2008-03-12 23:59'), tz='UTC')
+#' coords <- list(lat=32.4, lon=-96.5)
+#' PAR.obs <- data_frame(
+#'   solar.time=seq(timebounds[1], timebounds[2], by=as.difftime(3, units='hours')),
+#'   light=c(0, 0, 85.9, 1160.5, 1539.0, 933.9, 0, 0)
+#' ) %>% as.data.frame()
+#' PAR.mod <- data_frame(
+#'   solar.time=seq(timebounds[1], timebounds[2], by=as.difftime(0.25, units='hours')),
+#'   light=calc_light(solar.time, latitude=coords$lat, longitude=coords$lon)
+#' ) %>% as.data.frame()
 #' PAR.merged <- calc_light_merged(PAR.obs, PAR.mod$solar.time, 
 #'   latitude=coords$lat, longitude=coords$lon, max.gap=as.difftime(20, units='hours'))
 #' ggplot(bind_rows(mutate(v(PAR.obs), type='obs'), mutate(v(PAR.mod), type='mod'), 
@@ -51,6 +49,22 @@ calc_light_merged <- function(
   solar.time, latitude, longitude, max.PAR=NA,
   max.gap=as.difftime(3, units="hours"), attach.units=is.unitted(PAR.obs)) {
 
+  # ensure units are correct and present within this function
+  arg_units <- list(
+    PAR.obs=get_units(mm_data(solar.time, light)),
+    solar.time='',
+    latitude='degN',
+    longitude='degE')
+  for(argname in names(arg_units)) {
+    arg <- get(argname)
+    unit <- arg_units[[argname]]
+    if(is.unitted(arg)) {
+      stopifnot(all(get_units(arg) == unit))
+    } else {
+      assign(argname, u(arg, unit))
+    }
+  }
+
   . <- is.mod <- obs <- mod <- resid.abs.int <- resid.prop.int <- merged <- '.dplyr.var'
 
   # set smart default for max.PAR to make the ts pretty
@@ -94,27 +108,34 @@ calc_light_merged <- function(
   }
 
   # create a 'merged' time series where modeled values are adjusted to flow
-  # through observed values
+  # through observed values.
+
+  # compute the modeled values and then the residuals as both differences and 
+  # proportions, dealing with 0 specially
   PAR.merged <- PAR.merged %>%
     mutate(
-      # model light
       mod = calc_light(solar.time, latitude, longitude, max.PAR),
-      # compute the residuals as both differences and proportions, dealing with NA and 0 specially
-      resid.abs = ifelse(is.na(obs), 0, obs-mod),
-      resid.prop = ifelse(mod==u(0, 'umol m^-2 s^-1') | (is.na(obs) & mod==u(0, 'umol m^-2 s^-1')), 1, obs/mod))
-  # interpolate the residuals to match up with every modeled light value. pipes fail with this approx call, so use boring notation
-  PAR.merged$resid.abs.int <- approx(x=PAR.merged$solar.time, y=PAR.merged$resid.abs, xout=PAR.merged$solar.time, rule=2)$y
-  if(is.unitted(PAR.merged)) PAR.merged$resid.abs.int <- u(PAR.merged$resid.abs.int, get_units(PAR.merged$obs))
-  PAR.merged$resid.prop.int <- approx(x=PAR.merged$solar.time, y=PAR.merged$resid.prop, xout=PAR.merged$solar.time, rule=2)$y
+      resid.abs = obs - mod,
+      resid.prop = ifelse(mod==u(0, 'umol m^-2 s^-1'), NA, obs / mod))
+
+  # interpolate the residuals to match up with every modeled light value. pipes 
+  # fail with this approx call, so use boring notation
+  PAR.obsonly <- PAR.merged[!is.na(PAR.merged$obs), ]
+  PAR.merged$resid.abs.int <- approx(x=PAR.obsonly$solar.time, y=v(PAR.obsonly$resid.abs), xout=v(PAR.merged$solar.time), rule=2)$y
+  PAR.merged$resid.prop.int <- approx(x=PAR.obsonly$solar.time, y=PAR.obsonly$resid.prop, xout=PAR.merged$solar.time, rule=2)$y
+
+  # do the correction from mod scale to obs scale
   PAR.merged <- PAR.merged %>%
-    # do the correction from mod scale to obs scale. purely absolute or purely proportional can give some funky values, so use the 
-    mutate(merged = u(ifelse(resid.prop.int <= 1, mod * resid.prop.int, mod + resid.abs.int), get_units(mod)))
+    mutate(
+      merged = u(ifelse(resid.prop.int <= 1, mod * resid.prop.int, pmax(0, v(mod) + resid.abs.int)), get_units(mod)))
 
+  # collect just the rows and cols we want
   PAR.merged <- PAR.merged %>%
-    # collect just the rows and cols we want
     subset(is.mod) %>%
     select(solar.time, light=merged)
 
+  if(!attach.units) PAR.merged <- v(PAR.merged)
+
   # return
   PAR.merged
 }

diff --git a/R/calc_velocity.R b/R/calc_velocity.R
@@ -9,15 +9,12 @@
 #' @param m exponent in velocity-discharge relation (unitless; f in Raymond et al.)
 #' @return v (= V = U), stream flow velcoity, in the same units as k
 #' @examples
-#' \dontrun{
-#' # Warning: this function is deprecated.
 #' Qs <- seq(1,9,2)
 #' calc_velocity(Q=Qs)
 #' calc_velocity(Q=Qs, k=0.4)
 #' library(unitted)
 #' calc_velocity(Q=u(Qs, "m^3 s^-1"), m=u(40))
 #' calc_velocity(Q=u(Qs, "m^3 s^-1"), k=u(0.36, "m s^-1"))
-#' }
 #' @references Raymond, Peter A., Christopher J. Zappa, David Butman, Thomas L. 
 #'   Bott, Jody Potter, Patrick Mulholland, Andrew E. Laursen, William H. 
 #'   McDowell, and Denis Newbold. \emph{Scaling the gas transfer velocity and 
@@ -33,9 +30,6 @@
 #' @export
 calc_velocity <- function(Q, k=u(0.194,"m s^-1"), m=u(0.285,"")) {
 
-  .Deprecated()
-  warning("submit a GitHub issue if you want calc_velocity() to stick around")
-
   with.units <- is.unitted(Q) || (if(!missing(k)) is.unitted(k) else FALSE) || (if(!missing(m)) is.unitted(m) else FALSE)
 
   if(with.units) {

diff --git a/R/metab_Kmodel.R b/R/metab_Kmodel.R
@@ -398,13 +398,14 @@ get_params.metab_Kmodel <- function(
           K600.daily.sd = fit[['se']])
       },
       lm = {
-        preds <- predict(fit, newdata=data_daily, interval='confidence', level=0.95)
+        preds <- predict(fit, newdata=data_daily, interval='confidence', level=0.95) %>% 
+          as.data.frame()
         params %<>% mutate(
-          K600.daily = preds[,'fit'], # only the approx mean if ktrans=='log'
+          K600.daily = preds[['fit']], # only the approx mean if ktrans=='log'
           K600.daily.sd = NA, # this shouldn't be necessary after resolving #238
-          K600_daily_50pct = preds[,'fit'],
-          K600_daily_2.5pct = preds[,'lwr'],
-          K600_daily_97.5pct = preds[,'upr'])
+          K600_daily_50pct = preds[['fit']],
+          K600_daily_2.5pct = preds[['lwr']],
+          K600_daily_97.5pct = preds[['upr']])
       },
       loess = {
         preds <- predict(fit, newdata=data_daily, se=TRUE)

diff --git a/R/metab_bayes.R b/R/metab_bayes.R
@@ -63,16 +63,16 @@ metab_bayes <- function(
     # Check data for correct column names & units
     dat_list <- mm_validate_data(if(missing(data)) NULL else data, if(missing(data_daily)) NULL else data_daily, "metab_bayes")
     num_discharge_cols <- length(grep('discharge', c(names(dat_list$data), names(dat_list$data_daily))))
-    pool_K600 <- mm_parse_name(specs$model_name)$pool_K600
-    if(xor(num_discharge_cols > 0, pool_K600 %in% c('linear','binned'))) 
-      stop('discharge data should be included if & only if pool_K600 indicates hierarchy')
+    pool_K600_type <- mm_parse_name(specs$model_name, expand = TRUE)$pool_K600_type
+    if(xor(num_discharge_cols > 0, pool_K600_type %in% c('linear','binned'))) 
+      stop('discharge data should be included if & only if pool_K600_type indicates hierarchy')
     if(num_discharge_cols > 1)
       stop('either discharge or discharge.daily may be specified, but not both')
 
     # Handle discharge. If K600 is a hierarchical function of discharge and 
     # data$discharge was given, compute daily discharge and store in data.daily 
     # where it'll be accessible for the user to inspect it after model fitting
-    if((pool_K600 %in% c('linear','binned')) && ('discharge' %in% names(dat_list$data))) {
+    if((pool_K600_type %in% c('linear','binned')) && ('discharge' %in% names(dat_list$data))) {
       # calculate daily discharge
       dailymean <- function(data_ply, data_daily_ply, day_start, day_end, ply_date, ply_validity, timestep_days, ...) {
         data.frame(discharge.daily = if(isTRUE(ply_validity[1])) mean(data_ply$discharge) else NA)
@@ -111,7 +111,7 @@ metab_bayes <- function(
       dat_list$data_daily$lnQ.daily <- log(v(dat_list$data_daily$discharge.daily))
     }
     # If we need discharge bins, compute & store those now, as well
-    if(pool_K600 %in% c('binned')) {
+    if(pool_K600_type %in% c('binned')) {
       # linear interpolation from node to node, horizontal at the edges
       bounds <- c(-Inf, specs$K600_lnQ_nodes_centers, Inf)
       cuts <- cut(dat_list$data_daily$lnQ.daily, breaks=bounds, ordered_result=TRUE)
@@ -176,6 +176,7 @@ metab_bayes <- function(
       {if(!is.null(.)) setNames(., 'Compilation') else .}
       log <- extract_object_list('log') %>% { setNames(., paste0('MCMC_', names(.))) }
       bayes_log <- c(compile_log, log)
+      bayes_compile_time <- bayes_all_list[['compile_time']]
       bayes_mcmc <- extract_object_list('mcmcfit')
       bayes_mcmc_data <- extract_object_list('mcmc_data')
       bayes_all <- list(daily=bayes_daily)
@@ -195,10 +196,11 @@ metab_bayes <- function(
       . <- '.dplyr.var'
       bayes_log <- bayes_all_list[c('compile_log', 'log')] %>% 
         setNames(c('Compilation','MCMC_All_Days')) %>% { .[!sapply(., is.null)] }
+      bayes_compile_time <- bayes_all_list[['compile_time']]
       bayes_mcmc <- bayes_all_list$mcmcfit
       bayes_mcmc_data <- bayes_all_list$mcmc_data
       # now a list of dfs, log, warnings, and errors
-      bayes_all <- bayes_all_list[!(names(bayes_all_list) %in% c('compile_log','log','mcmcfit','mcmc_data'))]
+      bayes_all <- bayes_all_list[!(names(bayes_all_list) %in% c('compile_log','compile_time','log','mcmcfit','mcmc_data'))]
     }
   })
 
@@ -210,7 +212,8 @@ metab_bayes <- function(
     log=bayes_log,
     mcmc=bayes_mcmc,
     mcmc_data=bayes_mcmc_data,
-    fitting_time=fitting_time,
+    fitting_time=fitting_time - bayes_compile_time,
+    compile_time=bayes_compile_time,
     specs=specs,
     data=dat_list$data, # keep the units if given
     data_daily=dat_list$data_daily)
@@ -483,7 +486,7 @@ prepdata_bayes <- function(
   if(n24 > num_daily_obs) stop("day_end - day_start < 24 hours; aborting because daily metabolism could be wrong")
 
   # parse model name into features for deciding what data to include
-  features <- mm_parse_name(model_name)
+  features <- mm_parse_name(model_name, expand=TRUE)
 
   # Format the data for Stan. Stan disallows period-separated names, so
   # change all the input data to underscore-separated. parameters given in
@@ -501,7 +504,7 @@ prepdata_bayes <- function(
     ),
 
     switch(
-      features$pool_K600,
+      features$pool_K600_type,
       linear = list(lnQ_daily = data_daily$lnQ.daily),
       binned = list(
           b = length(specs$K600_lnQ_nodes_centers),
@@ -533,7 +536,7 @@ prepdata_bayes <- function(
 
     specs[specs$params_in]
   )
-  if(features$pool_K600 == 'binned') {
+  if(features$pool_K600_type == 'binned') {
     data_list$K600_lnQ_nodes_meanlog <- array(data_list$K600_lnQ_nodes_meanlog, dim=data_list$b)
     data_list$K600_lnQ_nodes_sdlog <- array(data_list$K600_lnQ_nodes_sdlog, dim=data_list$b)
   }
@@ -604,12 +607,15 @@ runstan_bayes <- function(data_list, model_path, params_out, split_dates, keep_m
 
   # use auto_write=TRUE to recompile if needed, or load from existing .rds file
   # without recompiling if possible
+  compile_time <- system.time({})
   mobj_path <- gsub('.stan$', '.stanrds', model_path)
   if(!file.exists(mobj_path) || file.info(mobj_path)$mtime < file.info(model_path)$mtime) {
     if(verbose) message("compiling Stan model")
-    compile_log <- capture.output({
-      stan_mobj <- rstan::stan_model(file=model_path, auto_write=TRUE)
-    }, type=c('output'), split=verbose)
+    compile_time <- system.time({
+      compile_log <- capture.output({
+        stan_mobj <- rstan::stan_model(file=model_path, auto_write=TRUE)
+      }, type=c('output'), split=verbose)
+    })
     rm(stan_mobj)
     gc() # this humble line saves us from many horrible R crashes
     autowrite_path <- gsub('.stan$', '.rds', model_path)
@@ -681,7 +687,10 @@ runstan_bayes <- function(data_list, model_path, params_out, split_dates, keep_m
   newlogfiles <- normalizePath(file.path(tempdir(), grep("_StanProgress.txt", dir(tempdir()), value=TRUE)))
   logfile <- setdiff(newlogfiles, oldlogfiles)
   log <- if(length(logfile) > 0) readLines(logfile) else consolelog
-  stan_out <- c(stan_out, c(list(log=log), if(exists('compile_log')) list(compile_log=compile_log)))
+  stan_out <- c(stan_out, c(
+    list(log=log), 
+    if(exists('compile_log')) list(compile_log=compile_log), 
+    list(compile_time=compile_time)))
 
   return(stan_out)
 }
@@ -770,7 +779,7 @@ format_mcmc_mat_nosplit <- function(mcmc_mat, data_list_d, data_list_n, keep_mcm
 setClass(
   "metab_bayes", 
   contains="metab_model",
-  slots=c(log="ANY", mcmc="ANY", mcmc_data="ANY")
+  slots=c(log="ANY", mcmc="ANY", mcmc_data="ANY", compile_time="ANY")
 )
 
 #' Extract any MCMC model objects that were stored with the model
@@ -957,6 +966,7 @@ get_params.metab_bayes <- function(metab_model, date_start=NA, date_end=NA, unce
   }
   names(metab_model@fit$daily) <- gsub('_mean$', '', names(metab_model@fit$daily))
   names(metab_model@fit$daily) <- gsub('_sd$', '.sd', names(metab_model@fit$daily))
+  names(metab_model@fit$daily) <- gsub('_50pct$', '.median', names(metab_model@fit$daily))
   names(metab_model@fit$daily) <- gsub('_2.5pct$', '.lower', names(metab_model@fit$daily))
   names(metab_model@fit$daily) <- gsub('_97.5pct$', '.upper', names(metab_model@fit$daily))
   # code duplicated in get_params.metab_Kmodel:
@@ -970,6 +980,6 @@ get_params.metab_bayes <- function(metab_model, date_start=NA, date_end=NA, unce
     dmsg <- metab_model@fit$daily$errors
     metab_model@fit$daily$errors <- ifelse(dmsg == '', omsg, paste(omsg, dmsg, sep=';'))
   }
-  metab_model@fit <- metab_model@fit$daily # SUPER-TEMPORARY we're still converting fit$daily to fit until #247, #229
+  metab_model@fit <- metab_model@fit$daily # TEMPORARY we're still converting fit$daily to fit until #247, #229
   NextMethod()
 }
diff --git a/R/metab_model.get_params.R b/R/metab_model.get_params.R
@@ -72,7 +72,8 @@ get_params.metab_model <- function(
   # add uncertainty columns if requested
   if(uncertainty != 'none') {
     metab.vars <- metab.out
-    metab.uncert <- matrix(paste0(rep(metab.out, each=3), rep(c('.sd','.lower','.upper'), times=length(metab.out))), nrow=3, byrow=FALSE)
+    suffixes <- c('.sd','.median','.lower','.upper')
+    metab.uncert <- matrix(paste0(rep(metab.out, each=length(suffixes)), rep(suffixes, times=length(metab.out))), nrow=length(suffixes), byrow=FALSE)
     metab.out <- c(rbind(metab.out, metab.uncert)) %>% { .[. %in% names(pars)]}
   }
 
@@ -91,16 +92,24 @@ get_params.metab_model <- function(
 
   # convert sds to CIs if requested
   if(uncertainty == 'sd') {
-    extra.cols <- grep('\\.lower$|\\.upper$', names(params))
+    extra.cols <- grep('\\.median$|\\.lower$|\\.upper$', names(params))
     if(length(extra.cols) > 0) params <- params[-extra.cols]
   } else if(uncertainty == 'ci') {
     # use existing .lower and .upper cols if available
     for(mv in metab.vars) {
       if(all(paste0(mv,c('.lower','.upper')) %in% names(params))) {
         extra.cols <- grep(paste0(mv,'\\.sd$'), names(params))
         if(length(extra.cols) > 0) params <- params[-extra.cols]
+        # if we're using existing .lower and .upper cols, also try to use existing .median col
+        if(paste0(mv,'.median') %in% names(params)) {
+          params[[mv]] <- params[[paste0(mv,'.median')]] # copy .median over/into un-suffixed name
+        }
       }
     }
+    # remove any '.median' columns; by now we've either used them or have no
+    # use for them
+    extra.cols <- grep(paste0('\\.median$'), names(params))
+    if(length(extra.cols) > 0) params <- params[-extra.cols]
     # convert any remaining .sd cols to .lower and .upper parametrically
     params <- mm_sd_to_ci(params)
   }

diff --git a/R/metab_model.show.R b/R/metab_model.show.R
@@ -18,7 +18,7 @@ setMethod(
     }
     cat("streamMetabolizer version", object@pkg_version, "\n")
     print(get_specs(object), header="Specifications:\n", prefix="  ")
-    cat("Fitting time: ", get_fitting_time(object)[['elapsed']], " secs elapsed\n", sep="")
+    cat("Fitting time: ", tryCatch(get_fitting_time(object)[['elapsed']], error=function(e) "NA"), " secs elapsed\n", sep="")
 
     # set the sim.seed for a sec so the params and metab preds look the same.
     # setting the seed makes a copy of object, so we won't need to reset the

diff --git a/R/metab_model_interface.R b/R/metab_model_interface.R
@@ -163,7 +163,10 @@ get_version <- function(metab_model) {
 #' @param uncertainty character. Should columns for the uncertainty of parameter
 #'   estimates be excluded ('none'), reported as standard deviations ('sd'), or 
 #'   reported as lower and upper bounds of a 95 percent confidence interval 
-#'   ('ci')?
+#'   ('ci')? When available (e.g., for Bayesian models), if 'ci' then the 
+#'   central value will be the median (50th quantile) and the ranges will be the
+#'   2.5th and 97.5th quantiles. If 'sd' then the central value will always be 
+#'   the mean.
 #' @param messages logical. Should warning and error messages from the fitting 
 #'   procedure be included in the output?
 #' @param fixed character. Should values pulled from data_daily (i.e., fixed