diff --git a/DESCRIPTION b/DESCRIPTION index 8ed7ded3..ec8fde3c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: LDATS Title: Latent Dirichlet Allocation Coupled with Time Series Analyses -Version: 0.2.6 +Version: 0.2.7 Authors@R: c( person(c("Juniper", "L."), "Simonis", email = "juniper.simonis@weecology.org", role = c("aut", "cre"), @@ -51,17 +51,11 @@ Imports: topicmodels, viridis Suggests: - dplyr, - here, knitr, pkgdown, rmarkdown, testthat, - vdiffr, - clue, - reshape, - RCurl, - tidyr + vdiffr VignetteBuilder: knitr RoxygenNote: 6.1.1 diff --git a/NEWS.md b/NEWS.md index 959bfb72..c7769356 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,15 @@ Version numbers follow [Semantic Versioning](https://semver.org/). + +# LDATS 0.2.7(https://github.com/weecology/ldats/releases/tag/v0.2.7) +*2020-03-18* + +## Patching CRAN issues with vignette building +* Dependencies are being managed different now +* For the paper comparison vignette, all of the code is pre-run and saved in the LDATS-replications repository +* Allows removal of otherwise unused packages from this package's dependency list + # LDATS 0.2.6(https://github.com/weecology/ldats/releases/tag/v0.2.6) *2020-03-02* diff --git a/_pkgdown.yml b/_pkgdown.yml index f8ed41ce..a60bdd43 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -11,9 +11,9 @@ navbar: text: "Vignettes" menu: - text: "Rodents example" - href: rodents-example.html + href: articles/rodents-example.html - text: "Comparison to Christensen et al." - href: paper-comparison.html + href: articles/paper-comparison.html - text: "LDATS codebase" href: articles/LDATS_codebase.html reference: diff --git a/cran-comments.md b/cran-comments.md index bec33962..15a4aa8d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,15 +1,17 @@ -This resubmission addresses problems associated with the stringsAsFactors update in R-devel +This resubmission addresses problems associated with vignette dependencies ## Test environments * local Windows 10 home install, R 3.6.1 64-bit and 32-bit -* local Windows 10 home install, R-devel (2020-03-01 r77881) 64-bit and 32-bit -* ubuntu 16.04.6 LTS (on travis-ci), R 3.6.2 and R-devel (2019-03-02 r77888) +* local Windows 10 home install, R-devel (2020-03-12 r77936) 64-bit and 32-bit +* ubuntu 16.04.6 LTS (on travis-ci), R 3.6.2 and R-devel (2020-03-13 r77948) +* win builder, R 3.5.3 * win builder, R 3.6.3 -* win builder, R-devel (2020-01-28 r77738) -* R-hub builder, Debian Linux, R-devel, GCC -* R-hub builder, macOS 10.11 El Capitan, R-release (experimental) -* R-hub builder, Fedora Linux, R-devel, GCC +* win builder, R-devel (2020-03-11 r77925) +* R-hub builder, Ubuntu Linux 16.04 LTS, R-release, GCC +* R-hub builder, Windows Server 2008 R2 SP1, R-devel, 32/64 bit * R-hub builder, Fedora Linux, R-devel, clang, gfortran +* R-hub builder, macOS 10.11 El Capitan, R-release (experimental) +* R-hub builder, Oracle Solaris 10, x86, 32 bit, R-patched (experimental) ## R CMD check results: There were no ERRORs, WARNINGs, or NOTEs diff --git a/doc/LDATS_codebase.R b/doc/LDATS_codebase.R index 1708c2c9..266d70bb 100644 --- a/doc/LDATS_codebase.R +++ b/doc/LDATS_codebase.R @@ -1,15 +1,15 @@ -## ----setup, include=FALSE------------------------------------------------ +## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) -## ---- include=FALSE------------------------------------------------------ +## ---- include=FALSE----------------------------------------------------------- library(LDATS) vers <- packageVersion("LDATS") today <- Sys.Date() -## ---- eval=FALSE--------------------------------------------------------- +## ---- eval=FALSE-------------------------------------------------------------- # install.packages("devtools") # devtools::install_github("weecology/LDATS") diff --git a/doc/LDATS_codebase.html b/doc/LDATS_codebase.html index 1de2685f..46e96341 100644 --- a/doc/LDATS_codebase.html +++ b/doc/LDATS_codebase.html @@ -1,15 +1,14 @@ - + - - + @@ -120,9 +119,6 @@ font-size: 14px; line-height: 1.35; } -#header { -text-align: center; -} #TOC { clear: both; margin: 0 0 10px 10px; @@ -290,7 +286,8 @@ code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } -code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } +code > span.fu { color: #900; font-weight: bold; } +code > span.er { color: #a61717; background-color: #e3d2d2; } @@ -310,7 +307,7 @@

Juniper L. Simonis

Overview

-

This vignette outlines the code base for the LDATS package. It was constructed using LDATS version 0.3.0 on 2019-10-12.

+

This vignette outlines the code base for the LDATS package. It was constructed using LDATS version 0.2.7 on 2020-03-18.

Installation

diff --git a/doc/paper-comparison.R b/doc/paper-comparison.R index 81ecb571..6f73616f 100644 --- a/doc/paper-comparison.R +++ b/doc/paper-comparison.R @@ -1,166 +1,209 @@ params <- list(run_models = FALSE) -## ----setup, include=FALSE------------------------------------------------ +## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) -## ---- include=FALSE------------------------------------------------------ +## ---- include=FALSE----------------------------------------------------------- vers <- packageVersion("LDATS") -## ---- eval = FALSE------------------------------------------------------- -# # install.packages("devtools") +## ---- eval = FALSE------------------------------------------------------------ +# install.packages("devtools") # devtools::install_github("weecology/LDATS") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- library(LDATS) set.seed(42) nseeds <- 200 nit <- 10000 -## ---- eval = FALSE------------------------------------------------------- +## ---- eval = params$run_models------------------------------------------------ +# install.packages(c("dplyr", "gridExtra", "multipanel", "RColorBrewer", "RCurl", "reshape2")) + +## ---- eval = FALSE------------------------------------------------------------ # rmarkdown::render("paper-comparison.Rmd", params = list(run_models = TRUE)) -## ----set download location----------------------------------------------- +## ----set download location---------------------------------------------------- vignette_files <- tempdir() -## ----download scripts---------------------------------------------------- -test_file <- file.path(vignette_files, "rodent_LDA_analysis.r") +## ----download scripts and data------------------------------------------------ +test_file <- file.path(vignette_files, "scripts", "rodent_LDA_analysis.r") + +if (!file.exists(test_file)){ -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/emchristensen/Extreme-events-LDA/master/" + # scripts + dir.create(file.path(vignette_files, "scripts"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/scripts/" files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", "AIC_model_selection.R", "changepointmodel.r", - "LDA-distance.R", "Rodent_table_dat.csv", - "LDA_figure_scripts.R") + "LDA-distance.R", "LDA_figure_scripts.R") - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "scripts", file)) } + + + # data + dir.create(file.path(vignette_files, "data"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/data/" + files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv", + "Rodent_table_dat.csv", "paper_dat.csv", + "paper_dates.csv", "paper_covariates.csv") - # from the PortalData repo - github_path <- "https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/" - files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv") - - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "data", file)) } } -## ----download pre-generated model outputs, eval = !params$run_model------ -test_file <- file.path(vignette_files, "ldats_ldamodel.RDS") +## ----download pre-generated model outputs------------------------------------- +test_file <- file.path(vignette_files, "output", "ldats_ldamodel.RDS") -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/model_outputs/" +if (!file.exists(test_file)){ + + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/output/" files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", "ldats_ldats.RDS", "ldats_paper.RDS", "paper_ldats.RDS", "paper_paper.RDS", - "lda_distances.png") - - for (file in files_to_download) - { + "ldats_rodents_adjusted.RDS", "rodents.RDS", + "ldats_paper_cpt.RDS", "ldats_paper_cpt_dates.RDS", + "ldats_ldats_cpt.RDS", "ldats_ldats_cpt_dates.RDS", + "paper_paper_cpt.RDS", "paper_paper_cpt_dates.RDS", + "paper_ldats_cpt.RDS", "paper_ldats_cpt_dates.RDS", + "annual_hist.RDS", "cpt_dates.RDS", + "lda_distances.png", "paper_paper_cpt_plot.png", + "ldats_paper_cpt_plot.png") + + for (file in files_to_download){ download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file), + destfile = file.path(vignette_files, "output", file), mode = "wb") } } -## ----LDATS data---------------------------------------------------------- +## ----LDATS data--------------------------------------------------------------- data(rodents) head(rodents[[1]]) -## ----Paper data---------------------------------------------------------- -# parameters for subsetting the full Portal rodents data -periods <- 1:436 -control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22) -species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", - "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO") - -source(file.path(vignette_files, "rodent_data_for_LDA.r")) - -# assemble `paper_dat`, the data from Christensen et al. 2018 -paper_dat <- create_rodent_table(period_first = min(periods), - period_last = max(periods), - selected_plots = control_plots, - selected_species = species_list) - -# assemble `paper_covariates`, the associated dates and covariate data -moondat <- read.csv(file.path(vignette_files, "moon_dates.csv"), stringsAsFactors = F) - -paper_dates <- moondat %>% - dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% - dplyr::pull(censusdate) %>% - as.Date() - -paper_covariates <- data.frame( - index = seq_along(paper_dates), - date = paper_dates, - year_continuous = lubridate::decimal_date(paper_dates)) %>% - dplyr::mutate( - sin_year = sin(year_continuous * 2 * pi), - cos_year = cos(year_continuous * 2 * pi) - ) - -## ----rodent data comparison---------------------------------------------- +## ----Paper data, eval = params$run_models------------------------------------- +# # parameters for subsetting the full Portal rodents data +# periods <- 1:436 +# control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22) +# species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", +# "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO") +# +# source(file.path(vignette_files, "scripts", "rodent_data_for_LDA.r")) +# +# # assemble `paper_dat`, the data from Christensen et al. 2018 +# paper_dat <- create_rodent_table(period_first = min(periods), +# period_last = max(periods), +# selected_plots = control_plots, +# selected_species = species_list) +# +# # assemble `paper_covariates`, the associated dates and covariate data +# moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = F) +# +# paper_dates <- moondat %>% +# dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% +# dplyr::pull(censusdate) %>% +# as.Date() +# +# paper_covariates <- data.frame( +# index = seq_along(paper_dates), +# date = paper_dates, +# year_continuous = lubridate::decimal_date(paper_dates)) %>% +# dplyr::mutate( +# sin_year = sin(year_continuous * 2 * pi), +# cos_year = cos(year_continuous * 2 * pi) +# ) + +## ----Paper data2, eval = !params$run_models, include = FALSE------------------ + moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = FALSE) + paper_dat <- read.csv(file.path(vignette_files, "data", "paper_dat.csv"), stringsAsFactors = FALSE) + paper_dates <- read.csv(file.path(vignette_files, "data", "paper_dates.csv"), stringsAsFactors = FALSE) + paper_covariates <- read.csv(file.path(vignette_files, "data", "paper_covariates.csv"), stringsAsFactors = FALSE) + +## ----rodent data comparison--------------------------------------------------- compare <- rodents[[1]] == paper_dat length(which(rowSums(compare) < ncol(compare))) -## ----adjust LDATS data after Christensen et al, eval = TRUE-------------- -# get the trapping effort for each sample -trap_table <- read.csv(file.path(vignette_files, "Portal_rodent_trapping.csv")) -trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots) -nplots_controls <- aggregate(trap_table_controls$sampled, - by = list(period = trap_table_controls$period), - FUN = sum) +## ----Data adjustment eval, eval = FALSE--------------------------------------- +# # retrieve data on number of plots trapped per month +# trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv') +# trap_table_controls = filter(trap_table, plot %in% selected_plots) +# nplots_controls = aggregate(trap_table_controls$sampled,by=list(period = trap_table_controls$period),FUN=sum) +# +# # adjust species counts by number of plots trapped that month +# r_table_adjusted = as.data.frame.matrix(r_table) +# for (n in 1:436) { +# #divide by number of control plots actually trapped (should be 8) and multiply by 8 to estimate captures as if all plots were trapped +# r_table_adjusted[n,] = round(r_table_adjusted[n,]/nplots_controls$x[n]*8) +# } + +## ----adjust LDATS data after Christensen et al, eval = params$run_models------ +# # get the trapping effort for each sample +# trap_table <- read.csv(file.path(vignette_files, "data", "Portal_rodent_trapping.csv")) +# trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots) +# nplots_controls <- aggregate(trap_table_controls$sampled, +# by = list(period = trap_table_controls$period), +# FUN = sum) +# +# # adjust species counts by number of plots trapped that month +# # divide by number of control plots actually trapped (should be 8) and +# # multiply by 8 to estimate captures as if all plots were trapped +# ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]]) +# ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8) -# adjust species counts by number of plots trapped that month -# divide by number of control plots actually trapped (should be 8) and -# multiply by 8 to estimate captures as if all plots were trapped -ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]]) -ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8) +## ----eval = params$run_models, include = FALSE-------------------------------- +# saveRDS(ldats_rodents_adjusted, file = file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) -## ----dataset comparisons------------------------------------------------- -compare_raw <- rodents[[1]] == ldats_rodents_adjusted -length(which(rowSums(compare_raw) < ncol(compare_raw))) +## ----eval = !params$run_models, include = FALSE------------------------------- +ldats_rodents_adjusted <- readRDS(file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) -compare_adjusted <- ldats_rodents_adjusted == paper_dat -length(which(rowSums(compare_adjusted) < ncol(compare_adjusted))) +## ----dataset comparisons, eval = params$run_models---------------------------- +# compare_raw <- rodents[[1]] == ldats_rodents_adjusted +# length(which(rowSums(compare_raw) < ncol(compare_raw))) +# +# compare_adjusted <- ldats_rodents_adjusted == paper_dat +# length(which(rowSums(compare_adjusted) < ncol(compare_adjusted))) -## ----switch to adjusted rodents------------------------------------------ +## ----switch to adjusted rodents----------------------------------------------- rodents[[1]] <- paper_dat -## ----add dates to covariate table---------------------------------------- +## ----show covariate table----------------------------------------------------- head(rodents$document_covariate_table) -new_cov_table <- dplyr::left_join(rodents$document_covariate_table, - dplyr::select(moondat, newmoonnumber, censusdate), - by = c("newmoon" = "newmoonnumber")) %>% - dplyr::rename(date = censusdate) +## ----eval = params$run_models, include = FALSE-------------------------------- +# "%>%" <- dplyr::"%>%" + +## ----add dates to covariate table, eval = params$run_models------------------- +# new_cov_table <- dplyr::left_join(rodents$document_covariate_table, +# dplyr::select(moondat, newmoonnumber, censusdate), +# by = c("newmoon" = "newmoonnumber")) %>% +# dplyr::rename(date = censusdate) +# +# rodents$document_covariate_table <- new_cov_table -rodents$document_covariate_table <- new_cov_table +## ----eval = params$run_models, include = FALSE-------------------------------- +# saveRDS(rodents, file = file.path(vignette_files, "output", "rodents.RDS")) -## ----LDATS LDAs, eval = params$run_models-------------------------------- +## ----LDATS LDAs, eval = params$run_models------------------------------------- # ldats_ldas <- LDATS::LDA_set(document_term_table = rodents$document_term_table, # topics = 2:6, nseeds = nseeds) # ldats_ldamodel <- LDATS::select_LDA(LDA_models = ldats_ldas)[[1]] # # saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS")) -## ----paper LDAs, eval = params$run_models-------------------------------- -# source(file.path(vignette_files, "AIC_model_selection.R")) -# source(file.path(vignette_files, "LDA-distance.R")) +## ----paper LDAs, eval = params$run_models------------------------------------- +# source(file.path(vignette_files, "scripts", "AIC_model_selection.R")) +# source(file.path(vignette_files, "scripts", "LDA-distance.R")) # # # Some of the functions require the data to be stored in the `dat` variable # dat <- paper_dat @@ -189,7 +232,7 @@ rodents$document_covariate_table <- new_cov_table # # choose seed with highest log likelihood for all following analyses # # (also produces plot of community composition for "best" run compared to "worst") # -# png(file.path(vignette_files, "lda_distances.png"), width = 800, height = 400) +# png(file.path(vignette_files, "output", "lda_distances.png"), width = 800, height = 400) # dat <- paper_dat # calculate_LDA_distance has some required named variables # best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics) # dev.off() @@ -205,99 +248,106 @@ rodents$document_covariate_table <- new_cov_table # # saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS")) -## ------------------------------------------------------------------------ -knitr::include_graphics(file.path(vignette_files, "lda_distances.png")) +## ----------------------------------------------------------------------------- +knitr::include_graphics(file.path(vignette_files, "output", "lda_distances.png")) -## ------------------------------------------------------------------------ -ldamodel <- readRDS(file.path(vignette_files, "paper_ldamodel.RDS")) -ldats_ldamodel <- readRDS(file.path(vignette_files, "ldats_ldamodel.RDS")) +## ----------------------------------------------------------------------------- +ldamodel <- readRDS(file.path(vignette_files, "output", "paper_ldamodel.RDS")) +ldats_ldamodel <- readRDS(file.path(vignette_files, "output", "ldats_ldamodel.RDS")) -## ----plot paper LDA, fig.width = 7, fig.height = 6----------------------- +## ----plot paper LDA, fig.width = 7, fig.height = 6---------------------------- plot(ldamodel, cols = NULL, option = "D") -## ----plot LDATS LDA, fig.width = 7, fig.height = 6----------------------- +## ----plot LDATS LDA, fig.width = 7, fig.height = 6---------------------------- plot(ldats_ldamodel, cols = NULL, option = "D") -## ----paper changepoint models-------------------------------------------- -#### Run changepoint #### -source(file.path(vignette_files, "changepointmodel.r")) - -find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) -{ - # set up parameters for model - x <- dplyr::select(paper_covariates, - year_continuous, - sin_year, - cos_year) - - # run models with 1, 2, 3, 4, 5, 6 changepoints - cpt_results <- data.frame(n_changepoints = n_changepoints) - cpt_results$cpt_model <- lapply(cpt_results$n_changepoints, - function(n_changepoints) - { - changepoint_model(lda_model, x, n_changepoints, maxit = nit, - weights = rep(1, NROW(x))) - }) - return(cpt_results) -} - -# Among a selection of models with different # of changepoints, -# - compute AIC -# - select the model with the best AIC -# - get the posterior distributions for the changepoints -select_cpt_model <- function(cpt_results, ntopics) -{ - # compute log likelihood as the mean deviance - cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, - function(cpt_model) {mean(cpt_model$saved_lls)}, - 0) - - # compute AIC = ( -2 * log likelihood) + 2 * (#parameters) - cpt_results$AIC <- cpt_results$mean_deviances * -2 + - 2 * (3 * (ntopics - 1) * (cpt_results$n_changepoints + 1) + - (cpt_results$n_changepoints)) - - # select the best model - cpt <- cpt_results$cpt_model[[which.min(cpt_results$AIC)]] - return(cpt) -} - -# transform the output from `compute_cpt` and match up the time indices with -# dates from the original data -get_dates <- function(cpt, covariates = paper_covariates) -{ - cpt$saved[,1,] %>% - t() %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(covariates, by = c("value" = "index")) -} - -## ----run LDATS LDA and paper cpt, eval = params$run_models--------------- +## ----paper changepoint models, eval = params$run_models----------------------- +# #### Run changepoint #### +# source(file.path(vignette_files, "scripts", "changepointmodel.r")) +# +# find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6){ +# # set up parameters for model +# x <- dplyr::select(paper_covariates, +# year_continuous, +# sin_year, +# cos_year) +# +# # run models with 1, 2, 3, 4, 5, 6 changepoints +# cpt_results <- data.frame(n_changepoints = n_changepoints) +# cpt_results$cpt_model <- lapply(cpt_results$n_changepoints, +# function(n_changepoints){ +# changepoint_model(lda_model, x, n_changepoints, maxit = nit, +# weights = rep(1, NROW(x))) +# }) +# return(cpt_results) +# } +# +# # Among a selection of models with different # of changepoints, +# # - compute AIC +# # - select the model with the best AIC +# # - get the posterior distributions for the changepoints +# select_cpt_model <- function(cpt_results, ntopics){ +# # compute log likelihood as the mean deviance +# cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, +# function(cpt_model) {mean(cpt_model$saved_lls)}, +# 0) +# +# # compute AIC = ( -2 * log likelihood) + 2 * (#parameters) +# cpt_results$AIC <- cpt_results$mean_deviances * -2 + +# 2 * (3 * (ntopics - 1) * (cpt_results$n_changepoints + 1) + +# (cpt_results$n_changepoints)) +# +# # select the best model +# cpt <- cpt_results$cpt_model[[which.min(cpt_results$AIC)]] +# return(cpt) +# } +# +# # transform the output from `compute_cpt` and match up the time indices with +# # dates from the original data +# get_dates <- function(cpt, covariates = paper_covariates){ +# cpt$saved[,1,] %>% +# t() %>% +# as.data.frame() %>% +# reshape::melt() %>% +# dplyr::left_join(covariates, by = c("value" = "index")) +# } + +## ----save annual_hist, include = FALSE, eval = params$run_models-------------- +# saveRDS(annual_hist, file = file.path(vignette_files, "output", "annual_hist.RDS")) + +## ----run LDATS LDA and paper cpt, eval = params$run_models-------------------- # ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates) # -# saveRDS(ldats_paper_results, file = file.path(vignette_files, "ldats_paper.RDS")) +# saveRDS(ldats_paper_results, file = file.path(vignette_files, "output", "ldats_paper.RDS")) -## ----compute changepoints for LDATS LDA and paper cpt-------------------- -ldats_paper_results <- readRDS(file.path(vignette_files, "ldats_paper.RDS")) +## ----compute changepoints for LDATS LDA and paper cpt, eval = params$run_models---- +# ldats_paper_results <- readRDS(file.path(vignette_files, "output", "ldats_paper.RDS")) +# +# ldats_paper_cpt <- select_cpt_model(ldats_paper_results, +# ntopics = ldats_ldamodel@k) +# ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt) -ldats_paper_cpt <- select_cpt_model(ldats_paper_results, - ntopics = ldats_ldamodel@k) -ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt) +## ----include = FALSE, eval = params$run_models-------------------------------- +# saveRDS(ldats_paper_cpt, file = file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +# saveRDS(ldats_paper_cpt_dates, file = file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) -## ----run paper LDA and paper cpt, eval = params$run_models--------------- +## ----run paper LDA and paper cpt, eval = params$run_models-------------------- # paper_paper_results <- find_changepoints(ldamodel, paper_covariates) # # saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS")) -## ----compute changepoints for paper LDA and paper cpt-------------------- -paper_paper_results <- readRDS(file.path(vignette_files, "paper_paper.RDS")) +## ----compute changepoints for paper LDA and paper cpt, eval = params$run_models---- +# paper_paper_results <- readRDS(file.path(vignette_files, "output", "paper_paper.RDS")) +# +# paper_paper_cpt <- select_cpt_model(paper_paper_results, +# ntopics = ldamodel@k) +# paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) -paper_paper_cpt <- select_cpt_model(paper_paper_results, - ntopics = ldamodel@k) -paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) +## ----include = FALSE, eval = params$run_models-------------------------------- +# saveRDS(paper_paper_cpt, file = file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +# saveRDS(paper_paper_cpt_dates, file = file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) -## ----run LDATS LDA and LDATS cpt, eval = params$run_models--------------- +## ----run LDATS LDA and LDATS cpt, eval = params$run_models-------------------- # ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel, # document_covariate_table = rodents$document_covariate_table, # formulas = ~ sin_year + cos_year, @@ -306,34 +356,38 @@ paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) # weights = NULL, # control = list(nit = nit)) # -# saveRDS(ldats_ldats_results, file = file.path(vignette_files, "ldats_ldats.RDS")) - -## ----construct lookup table for LDATS output for changepoint times------- -# make the full sequence of possible newmoon values -full_index <- seq(min(rodents$document_covariate_table$newmoon), - max(rodents$document_covariate_table$newmoon)) - -# generate a lookup table with dates for the newmoons, using `approx` to -# linearly interpolate the missing values -ldats_dates <- approx(rodents$document_covariate_table$newmoon, - as.Date(rodents$document_covariate_table$date), - full_index) %>% - as.data.frame() %>% - mutate(index = x, - date = as.Date(y, origin = "1970-01-01")) %>% - select(index, date) - -## ----compute changepoints for LDATS LDA and LDATS cpt-------------------- -ldats_ldats_results <- readRDS(file.path(vignette_files, "ldats_ldats.RDS")) - -ldats_ldats_cpt <- select_TS(ldats_ldats_results) +# saveRDS(ldats_ldats_results, file = file.path(vignette_files, "output", "ldats_ldats.RDS")) + +## ----construct lookup table for LDATS output for changepoint times, eval = params$run_models---- +# # make the full sequence of possible newmoon values +# full_index <- seq(min(rodents$document_covariate_table$newmoon), +# max(rodents$document_covariate_table$newmoon)) +# +# # generate a lookup table with dates for the newmoons, using `approx` to +# # linearly interpolate the missing values +# ldats_dates <- approx(rodents$document_covariate_table$newmoon, +# as.Date(rodents$document_covariate_table$date), +# full_index) %>% +# as.data.frame() %>% +# mutate(index = x, +# date = as.Date(y, origin = "1970-01-01")) %>% +# select(index, date) + +## ----compute changepoints for LDATS LDA and LDATS cpt, eval = params$run_models---- +# ldats_ldats_results <- readRDS(file.path(vignette_files, "output", "ldats_ldats.RDS")) +# +# ldats_ldats_cpt <- select_TS(ldats_ldats_results) +# +# ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% +# as.data.frame() %>% +# reshape::melt() %>% +# dplyr::left_join(ldats_dates, by = c("value" = "index")) -ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(ldats_dates, by = c("value" = "index")) +## ----include = FALSE, eval = params$run_models-------------------------------- +# saveRDS(ldats_ldats_cpt, file = file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +# saveRDS(ldats_ldats_cpt_dates, file = file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) -## ----run paper LDA and LDATS cpt, eval = params$run_models--------------- +## ----run paper LDA and LDATS cpt, eval = params$run_models-------------------- # paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel, # document_covariate_table = rodents$document_covariate_table, # formulas = ~ sin_year + cos_year, @@ -344,62 +398,97 @@ ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% # control = list(nit = nit)) # # -# saveRDS(paper_ldats_results, file = file.path(vignette_files, "paper_ldats.RDS")) - -## ----select paper lda + ldats cpt---------------------------------------- -paper_ldats_results <- readRDS(file.path(vignette_files, "paper_ldats.RDS")) - -paper_ldats_cpt <- select_TS(paper_ldats_results) - -paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(ldats_dates, by = c("value" = "index")) +# saveRDS(paper_ldats_results, file = file.path(vignette_files, "output", "paper_ldats.RDS")) -## ------------------------------------------------------------------------ +## ----select paper lda + ldats cpt, eval = params$run_models------------------- +# paper_ldats_results <- readRDS(file.path(vignette_files, "output", "paper_ldats.RDS")) +# +# paper_ldats_cpt <- select_TS(paper_ldats_results) +# +# paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>% +# as.data.frame() %>% +# reshape::melt() %>% +# dplyr::left_join(ldats_dates, by = c("value" = "index")) + +## ----include = FALSE, eval = params$run_models-------------------------------- +# saveRDS(paper_ldats_cpt, file = file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +# saveRDS(paper_ldats_cpt_dates, file = file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) + +## ----eval = !params$run_models, include = FALSE------------------------------- +ldats_paper_cpt <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +paper_paper_cpt <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +paper_ldats_cpt <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +ldats_ldats_cpt <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +ldats_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) +paper_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) +paper_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) +ldats_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) + +## ----------------------------------------------------------------------------- nlevels(ldats_paper_cpt_dates$variable) nlevels(paper_paper_cpt_dates$variable) nlevels(ldats_ldats_cpt_dates$variable) nlevels(paper_ldats_cpt_dates$variable) -## ----plot paper LDA and LDATS cpts, fig.width = 7, fig.height = 6-------- +## ----plot paper LDA and LDATS cpts, fig.width = 7, fig.height = 6------------- plot(paper_ldats_cpt) -## ----plot ldats LDA and LDATS cpt, fig.width = 7, fig.height = 6--------- +## ----plot ldats LDA and LDATS cpt, fig.width = 7, fig.height = 6-------------- plot(ldats_ldats_cpt) -## ----plot paper LDA and paper cpt, fig.width = 7, fig.height = 6--------- -paper_cpts <- find_changepoint_location(paper_paper_cpt) -ntopics <- ldamodel@k +## ---- eval = !params$run_models----------------------------------------------- +annual_hist <- readRDS(file.path(vignette_files, "output", "annual_hist.RDS")) + +## ----plot paper LDA and paper cpt, eval = params$run_models------------------- +# paper_cpts <- find_changepoint_location(paper_paper_cpt) +# ntopics <- ldamodel@k +# +# png(file.path(vignette_files, "output", "paper_paper_cpt_plot.png"), width = 800, height = 600) +# get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, +# weights = rep(1, NROW(paper_covariates))) +# dev.off() + +## ----plot paper LDA and LDATS cpts2, fig.width = 7, fig.height = 6------------ +paper_paper_hist <- annual_hist(paper_paper_cpt, paper_covariates$year_continuous) -paper_cpt_plot <- get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, - weights = rep(1, NROW(paper_covariates))) +## ----------------------------------------------------------------------------- +knitr::include_graphics(file.path(vignette_files, "output", "paper_paper_cpt_plot.png")) -annual_hist(paper_paper_cpt, paper_covariates$year_continuous) -paper_cpt_plot +## ----plot LDATS lda and paper cpt, eval = params$run_models------------------- +# ldats_cpts <- find_changepoint_location(ldats_paper_cpt) +# ntopics <- ldats_ldamodel@k +# +# png(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png"), width = 800, height = 600) +# get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, +# weights = rep(1, NROW(paper_covariates))) +# dev.off() -## ----plot LDATS lda and paper cpt, fig.width = 7, fig.height = 6--------- -ldats_cpts <- find_changepoint_location(ldats_paper_cpt) -ntopics <- ldats_ldamodel@k +## ----plot LDATS lda and paper cpt2, fig.width = 7, fig.height = 6------------- +ldats_paper_hist <- annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) -ldats_cpt_plot <- get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, - weights = rep(1, NROW(paper_covariates))) +## ----------------------------------------------------------------------------- +knitr::include_graphics(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png")) -annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) -ldats_cpt_plot +## ----report cpt dates, include = FALSE, eval = params$run_models-------------- +# paper_paper_cpt_dates$date <- as.Date(paper_paper_cpt_dates$date) +# ldats_paper_cpt_dates$date <- as.Date(ldats_paper_cpt_dates$date) +# +# cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, +# "ldatsLDA_paperCPT" = ldats_paper_cpt_dates, +# "ldatsLDA_ldatsCPT" = ldats_ldats_cpt_dates, +# "paperLDA_ldatsCPT" = paper_ldats_cpt_dates, +# .id = "analysis") %>% +# dplyr::group_by(analysis, variable) %>% +# dplyr::summarize(date = mean(date)) %>% +# dplyr::ungroup() %>% +# dplyr::rename(changepoint = variable) %>% +# tidyr::spread(analysis, date) +# +# saveRDS(cpt_dates, file = file.path(vignette_files, "output", "cpt_dates.RDS")) -## ----report cpt dates, include = F--------------------------------------- -cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, - "ldatsLDA_paperCPT" = ldats_paper_cpt_dates, - "ldatsLDA_ldatsCPT" = ldats_ldats_cpt_dates, - "paperLDA_ldatsCPT" = paper_ldats_cpt_dates, - .id = "analysis") %>% - dplyr::group_by(analysis, variable) %>% - dplyr::summarize(date = mean(date)) %>% - dplyr::ungroup() %>% - dplyr::rename(changepoint = variable) %>% - tidyr::spread(analysis, date) +## ----eval = !params$run_models, include = FALSE------------------------------- +cpt_dates <- readRDS(file.path(vignette_files, "output", "cpt_dates.RDS")) -## ----print cpt dates----------------------------------------------------- +## ----print cpt dates---------------------------------------------------------- knitr::kable(cpt_dates) diff --git a/doc/paper-comparison.Rmd b/doc/paper-comparison.Rmd index b5b34072..9e970557 100644 --- a/doc/paper-comparison.Rmd +++ b/doc/paper-comparison.Rmd @@ -1,6 +1,6 @@ --- title: "Comparison to Christensen et al. 2018" -author: "Renata Diaz and Hao Ye" +author: "Renata Diaz, Juniper Simonis, and Hao Ye" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{paper-comparison} @@ -22,7 +22,8 @@ vers <- packageVersion("LDATS") # Introduction -This document provides a side-by-side comparison of **`LDATS`** (version `r vers`) results with analysis from [Christensen et al. 2018](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.2373). +This document provides a side-by-side comparison of **`LDATS`** (version `r vers`) results with analysis from [Christensen et al. 2018](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.2373). +Due to the size and duration of model runs, we use pre-generated model output from the [LDATS-replications repo](https://github.com/weecology/LDATS-replications). ## Summary @@ -40,7 +41,7 @@ This document provides a side-by-side comparison of **`LDATS`** (version `r vers To obtain the most recent version of **`LDATS`**, install the most recent version from GitHub: ```{r, eval = FALSE} -# install.packages("devtools") +install.packages("devtools") devtools::install_github("weecology/LDATS") ``` @@ -52,6 +53,13 @@ nseeds <- 200 nit <- 10000 ``` +To run the analyses here, you will also need to download **`dplyr`**, **`gridExtra`**, **`multipanel`**, **`RColorBrewer`**, **`RCurl`**, and **`reshape2`** as the manuscript's code relies on these packages. + +```{r, eval = params$run_models} +install.packages(c("dplyr", "gridExtra", "multipanel", "RColorBrewer", "RCurl", "reshape2")) +``` + + ## Running the Models Because both the Latent Dirichlet Allocation (LDA) and time series components of the analysis can take a long time to run (especially with the settings above for the number of seeds and iterations), we will use pre-generated model outputs and turn off certain code chunks that run the models using a global `rmarkdown` parameter, `run_models = FALSE`. @@ -69,7 +77,7 @@ We're going to download analysis scripts, data files, and model objects, so we u vignette_files <- tempdir() ``` -To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from [Extreme-events-LDA repo](https://github.com/emchristensen/Extreme-events-LDA), as well as some raw data files from the [PortalData repo](https://github.com/weecology/PortalData): +To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from [Extreme-events-LDA repo](https://github.com/emchristensen/Extreme-events-LDA), as well as some raw data files from the [PortalData repo](https://github.com/weecology/PortalData), which are stored in the [LDATS-replications repo](https://github.com/weecology/LDATS-replications): Main Analysis Scripts: @@ -99,37 +107,48 @@ Data: * `Portal_rodent_trapping.csv` - table of trapping effort (downloaded from the PortalData repository) +* `paper_dat.csv` + - rodent data table from Christensen et al. 2018 + +* `paper_dates.csv` + - dates used in Christensen et al. 2018 + +* `paper_covariates.csv` + - table of dates and covariate data + Figure scripts: * `LDA_figure_scripts.R` - contains functions for making main plots in manuscript (Fig 1). Called from rodent_LDA_analysis.R -```{r download scripts} -test_file <- file.path(vignette_files, "rodent_LDA_analysis.r") +```{r download scripts and data} +test_file <- file.path(vignette_files, "scripts", "rodent_LDA_analysis.r") -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/emchristensen/Extreme-events-LDA/master/" +if (!file.exists(test_file)){ + + # scripts + dir.create(file.path(vignette_files, "scripts"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/scripts/" files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", "AIC_model_selection.R", "changepointmodel.r", - "LDA-distance.R", "Rodent_table_dat.csv", - "LDA_figure_scripts.R") + "LDA-distance.R", "LDA_figure_scripts.R") - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "scripts", file)) } + + + # data + dir.create(file.path(vignette_files, "data"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/data/" + files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv", + "Rodent_table_dat.csv", "paper_dat.csv", + "paper_dates.csv", "paper_covariates.csv") - # from the PortalData repo - github_path <- "https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/" - files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv") - - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "data", file)) } } ``` @@ -146,46 +165,66 @@ LDA models: * `paper_ldamodel.RDS` - the best LDA model as selected by the Christensen et al. analysis -Changepoint outputs +Changepoint outputs: -* `ldats_ldats.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the LDATS changepoint selection +* `ldats_ldats.RDS`, `ldats_ldats_cpt.RDS`, `ldats_ldats_cpt_dates.RDS` + - the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the LDATS changepoint selection -* `ldats_paper.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the paper's changepoint selection +* `ldats_paper.RDS`, `ldats_paper_cpt.RDS`, `ldats_paper_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the paper's changepoint selection -* `paper_ldats.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the LDATS changepoint selection +* `paper_ldats.RDS`, `paper_ldats_cpt.RDS`, `paper_ldats_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the paper LDA model and the LDATS changepoint selection -* `paper_paper.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the paper's changepoint selection +* `paper_paper.RDS`, `paper_paper_cpt.RDS`, `paper_paper_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the paper LDA model and the paper's changepoint selection + +* `cpt_dates.RDS` + - summary table of changepoint results across models Figures * `lda_distances.png` - figure showing the variance in the topics identified by the paper's LDA model code -```{r download pre-generated model outputs, eval = !params$run_model} -test_file <- file.path(vignette_files, "ldats_ldamodel.RDS") +* `paper_paper_cpt_plot.png` + - figure showing the time series results for the paper analysis of the paper LDA + +* `ldats_paper_cpt_plot.png` + - figure showing the time series results for the paper analysis of the LDATS LDA + +* `annual_hist.RDS` + - function for making histogram of change points over years + -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/model_outputs/" +```{r download pre-generated model outputs} +test_file <- file.path(vignette_files, "output", "ldats_ldamodel.RDS") + +if (!file.exists(test_file)){ + + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/output/" files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", "ldats_ldats.RDS", "ldats_paper.RDS", "paper_ldats.RDS", "paper_paper.RDS", - "lda_distances.png") - - for (file in files_to_download) - { + "ldats_rodents_adjusted.RDS", "rodents.RDS", + "ldats_paper_cpt.RDS", "ldats_paper_cpt_dates.RDS", + "ldats_ldats_cpt.RDS", "ldats_ldats_cpt_dates.RDS", + "paper_paper_cpt.RDS", "paper_paper_cpt_dates.RDS", + "paper_ldats_cpt.RDS", "paper_ldats_cpt_dates.RDS", + "annual_hist.RDS", "cpt_dates.RDS", + "lda_distances.png", "paper_paper_cpt_plot.png", + "ldats_paper_cpt_plot.png") + + for (file in files_to_download){ download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file), + destfile = file.path(vignette_files, "output", file), mode = "wb") } } ``` + # Data Comparison The dataset of Portal rodents on control plots is included in the LDATS package: @@ -198,14 +237,14 @@ head(rodents[[1]]) We can compare this against the data used in Christensen et al: -```{r Paper data} +```{r Paper data, eval = params$run_models} # parameters for subsetting the full Portal rodents data periods <- 1:436 control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22) species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO") -source(file.path(vignette_files, "rodent_data_for_LDA.r")) +source(file.path(vignette_files, "scripts", "rodent_data_for_LDA.r")) # assemble `paper_dat`, the data from Christensen et al. 2018 paper_dat <- create_rodent_table(period_first = min(periods), @@ -214,7 +253,7 @@ paper_dat <- create_rodent_table(period_first = min(periods), selected_species = species_list) # assemble `paper_covariates`, the associated dates and covariate data -moondat <- read.csv(file.path(vignette_files, "moon_dates.csv"), stringsAsFactors = F) +moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = F) paper_dates <- moondat %>% dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% @@ -231,6 +270,13 @@ paper_covariates <- data.frame( ) ``` +```{r Paper data2, eval = !params$run_models, include = FALSE} + moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = FALSE) + paper_dat <- read.csv(file.path(vignette_files, "data", "paper_dat.csv"), stringsAsFactors = FALSE) + paper_dates <- read.csv(file.path(vignette_files, "data", "paper_dates.csv"), stringsAsFactors = FALSE) + paper_covariates <- read.csv(file.path(vignette_files, "data", "paper_covariates.csv"), stringsAsFactors = FALSE) +``` + ## Compare the data from Christensen et al. with the included data in `LDATS` ```{r rodent data comparison} @@ -239,11 +285,11 @@ compare <- rodents[[1]] == paper_dat length(which(rowSums(compare) < ncol(compare))) ``` -There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data does, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses. +There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data is, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses. To confirm this, refer to lines 36-46 in `rodent_data_for_LDA.r`: -```{} +```{r Data adjustment eval, eval = FALSE} # retrieve data on number of plots trapped per month trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv') trap_table_controls = filter(trap_table, plot %in% selected_plots) @@ -259,9 +305,9 @@ To confirm this, refer to lines 36-46 in `rodent_data_for_LDA.r`: We can run the same procedure on the LDATS data to verify that we obtain a data.frame that matches. -```{r adjust LDATS data after Christensen et al, eval = TRUE} +```{r adjust LDATS data after Christensen et al, eval = params$run_models} # get the trapping effort for each sample -trap_table <- read.csv(file.path(vignette_files, "Portal_rodent_trapping.csv")) +trap_table <- read.csv(file.path(vignette_files, "data", "Portal_rodent_trapping.csv")) trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots) nplots_controls <- aggregate(trap_table_controls$sampled, by = list(period = trap_table_controls$period), @@ -274,9 +320,16 @@ ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]]) ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8) ``` +```{r eval = params$run_models, include = FALSE} +saveRDS(ldats_rodents_adjusted, file = file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) +``` + Now we can compare the adjusted LDATS dataset with both the original ldats dataset and the dataset from the paper: -```{r dataset comparisons} +```{r eval = !params$run_models, include = FALSE} +ldats_rodents_adjusted <- readRDS(file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) +``` +```{r dataset comparisons, eval = params$run_models} compare_raw <- rodents[[1]] == ldats_rodents_adjusted length(which(rowSums(compare_raw) < ncol(compare_raw))) @@ -292,16 +345,25 @@ rodents[[1]] <- paper_dat The LDATS rodent data comes with a `document_covariate_table`, which we will use later as the predictor variables for the changepoint models. In this table, time is expressed as new moon numbers. Later we will want to be able to interpret the results in terms of census dates. We will add a column to the `document_covariate_table` to convert new moon numbers to census dates. We will not reference this column in any of the formulas we pass to the changepoint models, so it will be ignored until we need it. -```{r add dates to covariate table} + +```{r show covariate table} head(rodents$document_covariate_table) +``` +```{r eval = params$run_models, include = FALSE} +"%>%" <- dplyr::"%>%" +``` +```{r add dates to covariate table, eval = params$run_models} new_cov_table <- dplyr::left_join(rodents$document_covariate_table, dplyr::select(moondat, newmoonnumber, censusdate), by = c("newmoon" = "newmoonnumber")) %>% - dplyr::rename(date = censusdate) + dplyr::rename(date = censusdate) rodents$document_covariate_table <- new_cov_table ``` +```{r eval = params$run_models, include = FALSE} +saveRDS(rodents, file = file.path(vignette_files, "output", "rodents.RDS")) +``` # Identify community groups using LDA @@ -320,8 +382,8 @@ saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS")) Second, we run the LDA models from Christensen et al. to do the same task: ```{r paper LDAs, eval = params$run_models} -source(file.path(vignette_files, "AIC_model_selection.R")) -source(file.path(vignette_files, "LDA-distance.R")) +source(file.path(vignette_files, "scripts", "AIC_model_selection.R")) +source(file.path(vignette_files, "scripts", "LDA-distance.R")) # Some of the functions require the data to be stored in the `dat` variable dat <- paper_dat @@ -350,7 +412,7 @@ seeds_4topics <- best_ntopic %>% # choose seed with highest log likelihood for all following analyses # (also produces plot of community composition for "best" run compared to "worst") -png(file.path(vignette_files, "lda_distances.png"), width = 800, height = 400) +png(file.path(vignette_files, "output", "lda_distances.png"), width = 800, height = 400) dat <- paper_dat # calculate_LDA_distance has some required named variables best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics) dev.off() @@ -368,15 +430,15 @@ saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS")) ``` ```{r} -knitr::include_graphics(file.path(vignette_files, "lda_distances.png")) +knitr::include_graphics(file.path(vignette_files, "output", "lda_distances.png")) ``` ## Plots To visualize the LDA assignment of species to topics, we load in the saved LDA models from previously: ```{r} -ldamodel <- readRDS(file.path(vignette_files, "paper_ldamodel.RDS")) -ldats_ldamodel <- readRDS(file.path(vignette_files, "ldats_ldamodel.RDS")) +ldamodel <- readRDS(file.path(vignette_files, "output", "paper_ldamodel.RDS")) +ldats_ldamodel <- readRDS(file.path(vignette_files, "output", "ldats_ldamodel.RDS")) ``` How the paper LDA model assigns species to topics: @@ -406,12 +468,11 @@ Having divided the data to generate catch-per-effort, the paper changepoint mode We define a few helper functions for running the changepoints model of Christensen et al. and processing the output to obtain the dates: -```{r paper changepoint models} +```{r paper changepoint models, eval = params$run_models} #### Run changepoint #### -source(file.path(vignette_files, "changepointmodel.r")) +source(file.path(vignette_files, "scripts", "changepointmodel.r")) -find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) -{ +find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6){ # set up parameters for model x <- dplyr::select(paper_covariates, year_continuous, @@ -421,8 +482,7 @@ find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) # run models with 1, 2, 3, 4, 5, 6 changepoints cpt_results <- data.frame(n_changepoints = n_changepoints) cpt_results$cpt_model <- lapply(cpt_results$n_changepoints, - function(n_changepoints) - { + function(n_changepoints){ changepoint_model(lda_model, x, n_changepoints, maxit = nit, weights = rep(1, NROW(x))) }) @@ -433,8 +493,7 @@ find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) # - compute AIC # - select the model with the best AIC # - get the posterior distributions for the changepoints -select_cpt_model <- function(cpt_results, ntopics) -{ +select_cpt_model <- function(cpt_results, ntopics){ # compute log likelihood as the mean deviance cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, function(cpt_model) {mean(cpt_model$saved_lls)}, @@ -452,8 +511,7 @@ select_cpt_model <- function(cpt_results, ntopics) # transform the output from `compute_cpt` and match up the time indices with # dates from the original data -get_dates <- function(cpt, covariates = paper_covariates) -{ +get_dates <- function(cpt, covariates = paper_covariates){ cpt$saved[,1,] %>% t() %>% as.data.frame() %>% @@ -461,6 +519,10 @@ get_dates <- function(cpt, covariates = paper_covariates) dplyr::left_join(covariates, by = c("value" = "index")) } ``` +```{r save annual_hist, include = FALSE, eval = params$run_models} +saveRDS(annual_hist, file = file.path(vignette_files, "output", "annual_hist.RDS")) +``` + ### LDATS LDA and paper changepoint @@ -468,17 +530,21 @@ Run the Christensen et al. time series model to identify changepoints on the LDA ```{r run LDATS LDA and paper cpt, eval = params$run_models} ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates) -saveRDS(ldats_paper_results, file = file.path(vignette_files, "ldats_paper.RDS")) +saveRDS(ldats_paper_results, file = file.path(vignette_files, "output", "ldats_paper.RDS")) ``` Extract the dates of the changepoints: -```{r compute changepoints for LDATS LDA and paper cpt} -ldats_paper_results <- readRDS(file.path(vignette_files, "ldats_paper.RDS")) +```{r compute changepoints for LDATS LDA and paper cpt, eval = params$run_models} +ldats_paper_results <- readRDS(file.path(vignette_files, "output", "ldats_paper.RDS")) ldats_paper_cpt <- select_cpt_model(ldats_paper_results, ntopics = ldats_ldamodel@k) ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(ldats_paper_cpt, file = file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +saveRDS(ldats_paper_cpt_dates, file = file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) +``` ### Paper LDA and paper changepoint @@ -490,13 +556,17 @@ saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS") ``` Extract the dates of the changepoints: -```{r compute changepoints for paper LDA and paper cpt} -paper_paper_results <- readRDS(file.path(vignette_files, "paper_paper.RDS")) +```{r compute changepoints for paper LDA and paper cpt, eval = params$run_models} +paper_paper_results <- readRDS(file.path(vignette_files, "output", "paper_paper.RDS")) paper_paper_cpt <- select_cpt_model(paper_paper_results, ntopics = ldamodel@k) paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(paper_paper_cpt, file = file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +saveRDS(paper_paper_cpt_dates, file = file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) +``` ## Running LDATS changepoint models @@ -513,12 +583,12 @@ ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel, weights = NULL, control = list(nit = nit)) -saveRDS(ldats_ldats_results, file = file.path(vignette_files, "ldats_ldats.RDS")) +saveRDS(ldats_ldats_results, file = file.path(vignette_files, "output", "ldats_ldats.RDS")) ``` Unlike the paper changepoint model, LDATS can recognize that sampling periods may not be equidistant, and can place changepoint estimates at new moons if they fall between nonconsecutive sampling periods. We can estimate the dates corresponding to those new moons, extrapolating from the census dates for adjacent census periods. -```{r construct lookup table for LDATS output for changepoint times} +```{r construct lookup table for LDATS output for changepoint times, eval = params$run_models} # make the full sequence of possible newmoon values full_index <- seq(min(rodents$document_covariate_table$newmoon), max(rodents$document_covariate_table$newmoon)) @@ -536,8 +606,8 @@ ldats_dates <- approx(rodents$document_covariate_table$newmoon, Select the best time series model and extract the dates of the changepoints: -```{r compute changepoints for LDATS LDA and LDATS cpt} -ldats_ldats_results <- readRDS(file.path(vignette_files, "ldats_ldats.RDS")) +```{r compute changepoints for LDATS LDA and LDATS cpt, eval = params$run_models} +ldats_ldats_results <- readRDS(file.path(vignette_files, "output", "ldats_ldats.RDS")) ldats_ldats_cpt <- select_TS(ldats_ldats_results) @@ -546,6 +616,10 @@ ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% reshape::melt() %>% dplyr::left_join(ldats_dates, by = c("value" = "index")) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(ldats_ldats_cpt, file = file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +saveRDS(ldats_ldats_cpt_dates, file = file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) +``` ### Paper LDA and LDATS changepoint @@ -562,13 +636,13 @@ paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel, control = list(nit = nit)) -saveRDS(paper_ldats_results, file = file.path(vignette_files, "paper_ldats.RDS")) +saveRDS(paper_ldats_results, file = file.path(vignette_files, "output", "paper_ldats.RDS")) ``` Select the best time series model and extract the dates of the changepoints: -```{r select paper lda + ldats cpt} -paper_ldats_results <- readRDS(file.path(vignette_files, "paper_ldats.RDS")) +```{r select paper lda + ldats cpt, eval = params$run_models} +paper_ldats_results <- readRDS(file.path(vignette_files, "output", "paper_ldats.RDS")) paper_ldats_cpt <- select_TS(paper_ldats_results) @@ -577,9 +651,22 @@ paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>% reshape::melt() %>% dplyr::left_join(ldats_dates, by = c("value" = "index")) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(paper_ldats_cpt, file = file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +saveRDS(paper_ldats_cpt_dates, file = file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) +``` ## How many changepoints were identified? - +```{r eval = !params$run_models, include = FALSE} +ldats_paper_cpt <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +paper_paper_cpt <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +paper_ldats_cpt <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +ldats_ldats_cpt <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +ldats_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) +paper_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) +paper_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) +ldats_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) +``` ```{r} nlevels(ldats_paper_cpt_dates$variable) nlevels(paper_paper_cpt_dates$variable) @@ -602,34 +689,57 @@ plot(paper_ldats_cpt) plot(ldats_ldats_cpt) ``` +```{r, eval = !params$run_models} +annual_hist <- readRDS(file.path(vignette_files, "output", "annual_hist.RDS")) +``` + ### Paper LDA and paper changepoint -```{r plot paper LDA and paper cpt, fig.width = 7, fig.height = 6} +```{r plot paper LDA and paper cpt, eval = params$run_models} paper_cpts <- find_changepoint_location(paper_paper_cpt) ntopics <- ldamodel@k -paper_cpt_plot <- get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, +png(file.path(vignette_files, "output", "paper_paper_cpt_plot.png"), width = 800, height = 600) +get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, weights = rep(1, NROW(paper_covariates))) +dev.off() +``` + +```{r plot paper LDA and LDATS cpts2, fig.width = 7, fig.height = 6} +paper_paper_hist <- annual_hist(paper_paper_cpt, paper_covariates$year_continuous) +``` -annual_hist(paper_paper_cpt, paper_covariates$year_continuous) -paper_cpt_plot +```{r} +knitr::include_graphics(file.path(vignette_files, "output", "paper_paper_cpt_plot.png")) ``` + ### LDATS LDA and paper changepoint -```{r plot LDATS lda and paper cpt, fig.width = 7, fig.height = 6} +```{r plot LDATS lda and paper cpt, eval = params$run_models} ldats_cpts <- find_changepoint_location(ldats_paper_cpt) ntopics <- ldats_ldamodel@k -ldats_cpt_plot <- get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, +png(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png"), width = 800, height = 600) +get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, weights = rep(1, NROW(paper_covariates))) +dev.off() +``` -annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) -ldats_cpt_plot +```{r plot LDATS lda and paper cpt2, fig.width = 7, fig.height = 6} +ldats_paper_hist <- annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) +``` + +```{r} +knitr::include_graphics(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png")) ``` The results of the changepoint model appear robust to both choice of LDA model and choice of changepoint model. ## Report changepoint dates -```{r report cpt dates, include = F} + +```{r report cpt dates, include = FALSE, eval = params$run_models} +paper_paper_cpt_dates$date <- as.Date(paper_paper_cpt_dates$date) +ldats_paper_cpt_dates$date <- as.Date(ldats_paper_cpt_dates$date) + cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, "ldatsLDA_paperCPT" = ldats_paper_cpt_dates, "ldatsLDA_ldatsCPT" = ldats_ldats_cpt_dates, @@ -640,6 +750,12 @@ cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, dplyr::ungroup() %>% dplyr::rename(changepoint = variable) %>% tidyr::spread(analysis, date) + +saveRDS(cpt_dates, file = file.path(vignette_files, "output", "cpt_dates.RDS")) +``` + +```{r eval = !params$run_models, include = FALSE} +cpt_dates <- readRDS(file.path(vignette_files, "output", "cpt_dates.RDS")) ``` ```{r print cpt dates} diff --git a/doc/paper-comparison.html b/doc/paper-comparison.html index 3546b7e7..ea28c77f 100644 --- a/doc/paper-comparison.html +++ b/doc/paper-comparison.html @@ -1,17 +1,16 @@ - + - - + - + Comparison to Christensen et al. 2018 @@ -120,9 +119,6 @@ font-size: 14px; line-height: 1.35; } -#header { -text-align: center; -} #TOC { clear: both; margin: 0 0 10px 10px; @@ -290,7 +286,8 @@ code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } -code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } +code > span.fu { color: #900; font-weight: bold; } +code > span.er { color: #a61717; background-color: #e3d2d2; } @@ -304,13 +301,13 @@

Comparison to Christensen et al. 2018

-

Renata Diaz and Hao Ye

+

Renata Diaz, Juniper Simonis, and Hao Ye

Introduction

-

This document provides a side-by-side comparison of LDATS (version 0.3.0) results with analysis from Christensen et al. 2018.

+

This document provides a side-by-side comparison of LDATS (version 0.2.7) results with analysis from Christensen et al. 2018. Due to the size and duration of model runs, we use pre-generated model output from the LDATS-replications repo.

Summary

@@ -362,25 +359,27 @@

Setup

LDATS Installation

To obtain the most recent version of LDATS, install the most recent version from GitHub:

-
# install.packages("devtools")
+
install.packages("devtools")
 devtools::install_github("weecology/LDATS")

Load in the LDATS package.

library(LDATS)
 set.seed(42)
 nseeds <- 200
 nit <- 10000
+

To run the analyses here, you will also need to download dplyr, gridExtra, multipanel, RColorBrewer, RCurl, and reshape2 as the manuscript’s code relies on these packages.

+
install.packages(c("dplyr", "gridExtra", "multipanel", "RColorBrewer", "RCurl", "reshape2"))

Running the Models

Because both the Latent Dirichlet Allocation (LDA) and time series components of the analysis can take a long time to run (especially with the settings above for the number of seeds and iterations), we will use pre-generated model outputs and turn off certain code chunks that run the models using a global rmarkdown parameter, run_models = FALSE.

To change this functionality, you can re-render this file with:

-
rmarkdown::render("paper-comparison.Rmd", params = list(run_models = TRUE))
+
rmarkdown::render("paper-comparison.Rmd", params = list(run_models = TRUE))

Download Analysis Scripts and Data Files

We’re going to download analysis scripts, data files, and model objects, so we use a temporary location for storage:

-
vignette_files <- tempdir()
-

To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from Extreme-events-LDA repo, as well as some raw data files from the PortalData repo:

+
vignette_files <- tempdir()
+

To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from Extreme-events-LDA repo, as well as some raw data files from the PortalData repo, which are stored in the LDATS-replications repo:

Main Analysis Scripts:

  • rodent_LDA_analysis.R @@ -418,39 +417,53 @@

    Download Analysis Scripts and Data Files

    • table of trapping effort (downloaded from the PortalData repository)
  • +
  • paper_dat.csv +
      +
    • rodent data table from Christensen et al. 2018
    • +
  • +
  • paper_dates.csv +
      +
    • dates used in Christensen et al. 2018
    • +
  • +
  • paper_covariates.csv +
      +
    • table of dates and covariate data
    • +

Figure scripts:

  • LDA_figure_scripts.R
  • contains functions for making main plots in manuscript (Fig 1). Called from rodent_LDA_analysis.R
-
test_file <- file.path(vignette_files, "rodent_LDA_analysis.r")
-
-if (!file.exists(test_file))
-{
-  # from the Extreme-events-LDA repo
-  github_path <- "https://raw.githubusercontent.com/emchristensen/Extreme-events-LDA/master/"
-  files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", 
-                         "AIC_model_selection.R", "changepointmodel.r", 
-                         "LDA-distance.R", "Rodent_table_dat.csv", 
-                         "LDA_figure_scripts.R")
-  
-  for (file in files_to_download)
-  {
-    download.file(url = paste0(github_path, file),
-                  destfile = file.path(vignette_files, file))
-  }
-  
-  # from the PortalData repo
-  github_path <- "https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/"
-  files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv")
-  
-  for (file in files_to_download)
-  {
-    download.file(url = paste0(github_path, file),
-                  destfile = file.path(vignette_files, file))
-  }
-}
+
test_file <- file.path(vignette_files, "scripts", "rodent_LDA_analysis.r")
+
+if (!file.exists(test_file)){
+
+  # scripts
+  dir.create(file.path(vignette_files, "scripts"), showWarnings = FALSE)
+  github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/scripts/"
+  files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", 
+                         "AIC_model_selection.R", "changepointmodel.r", 
+                         "LDA-distance.R", "LDA_figure_scripts.R")
+  
+  for (file in files_to_download)  {
+    download.file(url = paste0(github_path, file),
+                  destfile = file.path(vignette_files, "scripts", file))
+  }
+
+    
+  # data
+  dir.create(file.path(vignette_files, "data"), showWarnings = FALSE)
+  github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/data/"
+  files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv", 
+                         "Rodent_table_dat.csv", "paper_dat.csv",
+                         "paper_dates.csv", "paper_covariates.csv")
+  
+  for (file in files_to_download)  {
+    download.file(url = paste0(github_path, file),
+                  destfile = file.path(vignette_files, "data", file))
+  }
+}

Download Model Outputs

@@ -466,23 +479,27 @@

Download Model Outputs

  • the best LDA model as selected by the Christensen et al. analysis
  • -

    Changepoint outputs

    +

    Changepoint outputs:

    +
      +
    • ldats_ldats.RDS, ldats_ldats_cpt.RDS, ldats_ldats_cpt_dates.RDS
        -
      • ldats_ldats.RDS +
      • the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the LDATS changepoint selection
      • +
    • +
    • ldats_paper.RDS, ldats_paper_cpt.RDS, `ldats_paper_cpt_dates.RDS
        -
      • the posterior distribution of changepoints, using the LDATS LDA model and the LDATS changepoint selection
      • +
      • the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the paper’s changepoint selection
    • -
    • ldats_paper.RDS +
    • paper_ldats.RDS, paper_ldats_cpt.RDS, `paper_ldats_cpt_dates.RDS
        -
      • the posterior distribution of changepoints, using the LDATS LDA model and the paper’s changepoint selection
      • +
      • the posterior distribution, count, and dates of changepoints, using the paper LDA model and the LDATS changepoint selection
    • -
    • paper_ldats.RDS +
    • paper_paper.RDS, paper_paper_cpt.RDS, `paper_paper_cpt_dates.RDS
        -
      • the posterior distribution of changepoints, using the paper LDA model and the LDATS changepoint selection
      • +
      • the posterior distribution, count, and dates of changepoints, using the paper LDA model and the paper’s changepoint selection
    • -
    • paper_paper.RDS +
    • cpt_dates.RDS
        -
      • the posterior distribution of changepoints, using the paper LDA model and the paper’s changepoint selection
      • +
      • summary table of changepoint results across models

    Figures

    @@ -491,207 +508,213 @@

    Download Model Outputs

    • figure showing the variance in the topics identified by the paper’s LDA model code
    +
  • paper_paper_cpt_plot.png +
      +
    • figure showing the time series results for the paper analysis of the paper LDA
    • +
  • +
  • ldats_paper_cpt_plot.png +
      +
    • figure showing the time series results for the paper analysis of the LDATS LDA
    • +
  • +
  • annual_hist.RDS +
      +
    • function for making histogram of change points over years
    • +
  • -
    test_file <- file.path(vignette_files, "ldats_ldamodel.RDS")
    -
    -if (!file.exists(test_file))
    -{
    -  # from the Extreme-events-LDA repo
    -  github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/model_outputs/"
    -  files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", 
    -                         "ldats_ldats.RDS", "ldats_paper.RDS", 
    -                         "paper_ldats.RDS", "paper_paper.RDS", 
    -                         "lda_distances.png")
    -
    -  for (file in files_to_download)
    -  {
    -    download.file(url = paste0(github_path, file),
    -                  destfile = file.path(vignette_files, file), 
    -                  mode = "wb")
    -  }
    -}
    +
    test_file <- file.path(vignette_files, "output", "ldats_ldamodel.RDS")
    +
    +if (!file.exists(test_file)){
    +
    +  dir.create(file.path(vignette_files, "output"), showWarnings = FALSE)
    +  github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/output/"
    +  files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", 
    +                         "ldats_ldats.RDS", "ldats_paper.RDS", 
    +                         "paper_ldats.RDS", "paper_paper.RDS", 
    +                         "ldats_rodents_adjusted.RDS", "rodents.RDS",
    +                         "ldats_paper_cpt.RDS", "ldats_paper_cpt_dates.RDS",
    +                         "ldats_ldats_cpt.RDS", "ldats_ldats_cpt_dates.RDS",
    +                         "paper_paper_cpt.RDS", "paper_paper_cpt_dates.RDS",
    +                         "paper_ldats_cpt.RDS", "paper_ldats_cpt_dates.RDS",
    +                         "annual_hist.RDS", "cpt_dates.RDS",
    +                         "lda_distances.png", "paper_paper_cpt_plot.png",
    +                         "ldats_paper_cpt_plot.png")
    +
    +  for (file in files_to_download){
    +    download.file(url = paste0(github_path, file),
    +                  destfile = file.path(vignette_files, "output", file), 
    +                  mode = "wb")
    +  }
    +}

    Data Comparison

    The dataset of Portal rodents on control plots is included in the LDATS package:

    -
    data(rodents)
    -
    -head(rodents[[1]])
    -#>   BA DM DO DS NA. OL OT PB PE PF PH PI PL PM PP RF RM RO SF SH SO
    -#> 1  0 13  0  2   2  0  0  0  1  1  0  0  0  0  3  0  0  0  0  0  0
    -#> 2  0 20  1  3   2  0  0  0  0  4  0  0  0  0  2  0  0  0  0  0  0
    -#> 3  0 21  0  8   4  0  0  0  1  2  0  0  0  0  1  0  0  0  0  0  0
    -#> 4  0 21  3 12   4  2  3  0  1  1  0  0  0  0  0  0  0  0  0  0  0
    -#> 5  0 16  1  9   5  2  1  0  0  2  0  0  0  0  0  0  1  0  0  0  0
    -#> 6  0 17  1 13   5  1  5  0  0  3  0  0  0  0  0  0  0  0  0  0  0
    +
    data(rodents)
    +
    +head(rodents[[1]])
    +#>   BA DM DO DS NA. OL OT PB PE PF PH PI PL PM PP RF RM RO SF SH SO
    +#> 1  0 13  0  2   2  0  0  0  1  1  0  0  0  0  3  0  0  0  0  0  0
    +#> 2  0 20  1  3   2  0  0  0  0  4  0  0  0  0  2  0  0  0  0  0  0
    +#> 3  0 21  0  8   4  0  0  0  1  2  0  0  0  0  1  0  0  0  0  0  0
    +#> 4  0 21  3 12   4  2  3  0  1  1  0  0  0  0  0  0  0  0  0  0  0
    +#> 5  0 16  1  9   5  2  1  0  0  2  0  0  0  0  0  0  1  0  0  0  0
    +#> 6  0 17  1 13   5  1  5  0  0  3  0  0  0  0  0  0  0  0  0  0  0

    We can compare this against the data used in Christensen et al:

    -
    # parameters for subsetting the full Portal rodents data
    -periods <- 1:436
    -control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22)
    -species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", 
    -                  "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO")
    -
    -source(file.path(vignette_files, "rodent_data_for_LDA.r"))
    -#> Loading required package: bitops
    -#> 
    -#> Attaching package: 'dplyr'
    -#> The following objects are masked from 'package:stats':
    -#> 
    -#>     filter, lag
    -#> The following objects are masked from 'package:base':
    -#> 
    -#>     intersect, setdiff, setequal, union
    -
    -# assemble `paper_dat`, the data from Christensen et al. 2018
    -paper_dat <- create_rodent_table(period_first = min(periods),
    -                                 period_last = max(periods),
    -                                 selected_plots = control_plots,
    -                                 selected_species = species_list)
    -
    -# assemble `paper_covariates`, the associated dates and covariate data
    -moondat <- read.csv(file.path(vignette_files, "moon_dates.csv"), stringsAsFactors = F)
    -
    -paper_dates <- moondat %>%
    -  dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% 
    -  dplyr::pull(censusdate) %>%
    -  as.Date()
    -
    -paper_covariates <- data.frame(
    -  index = seq_along(paper_dates), 
    -  date = paper_dates, 
    -  year_continuous = lubridate::decimal_date(paper_dates)) %>%
    -  dplyr::mutate( 
    -    sin_year = sin(year_continuous * 2 * pi), 
    -    cos_year = cos(year_continuous * 2 * pi)
    -  )
    +
    # parameters for subsetting the full Portal rodents data
    +periods <- 1:436
    +control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22)
    +species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", 
    +                  "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO")
    +
    +source(file.path(vignette_files, "scripts", "rodent_data_for_LDA.r"))
    +
    +# assemble `paper_dat`, the data from Christensen et al. 2018
    +paper_dat <- create_rodent_table(period_first = min(periods),
    +                                 period_last = max(periods),
    +                                 selected_plots = control_plots,
    +                                 selected_species = species_list)
    +
    +# assemble `paper_covariates`, the associated dates and covariate data
    +moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = F)
    +
    +paper_dates <- moondat %>%
    +  dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% 
    +  dplyr::pull(censusdate) %>%
    +  as.Date()
    +
    +paper_covariates <- data.frame(
    +  index = seq_along(paper_dates), 
    +  date = paper_dates, 
    +  year_continuous = lubridate::decimal_date(paper_dates)) %>%
    +  dplyr::mutate( 
    +    sin_year = sin(year_continuous * 2 * pi), 
    +    cos_year = cos(year_continuous * 2 * pi)
    +  )

    Compare the data from Christensen et al. with the included data in LDATS

    -
    compare <- rodents[[1]] == paper_dat
    -
    -length(which(rowSums(compare) < ncol(compare)))
    -#> [1] 16
    -

    There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data does, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses.

    +
    compare <- rodents[[1]] == paper_dat
    +
    +length(which(rowSums(compare) < ncol(compare)))
    +#> [1] 16
    +

    There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data is, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses.

    To confirm this, refer to lines 36-46 in rodent_data_for_LDA.r:

    -
      # retrieve data on number of plots trapped per month
    -  trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv')
    -  trap_table_controls = filter(trap_table, plot %in% selected_plots)
    -  nplots_controls = aggregate(trap_table_controls$sampled,by=list(period = trap_table_controls$period),FUN=sum)
    -  
    -  # adjust species counts by number of plots trapped that month
    -  r_table_adjusted = as.data.frame.matrix(r_table)
    -  for (n in 1:436) {
    -    #divide by number of control plots actually trapped (should be 8) and multiply by 8 to estimate captures as if all plots were trapped
    -    r_table_adjusted[n,] = round(r_table_adjusted[n,]/nplots_controls$x[n]*8)
    -  }
    +
      # retrieve data on number of plots trapped per month
    +  trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv')
    +  trap_table_controls = filter(trap_table, plot %in% selected_plots)
    +  nplots_controls = aggregate(trap_table_controls$sampled,by=list(period = trap_table_controls$period),FUN=sum)
    +  
    +  # adjust species counts by number of plots trapped that month
    +  r_table_adjusted = as.data.frame.matrix(r_table)
    +  for (n in 1:436) {
    +    #divide by number of control plots actually trapped (should be 8) and multiply by 8 to estimate captures as if all plots were trapped
    +    r_table_adjusted[n,] = round(r_table_adjusted[n,]/nplots_controls$x[n]*8)
    +  }

    We can run the same procedure on the LDATS data to verify that we obtain a data.frame that matches.

    -
    # get the trapping effort for each sample
    -trap_table <- read.csv(file.path(vignette_files, "Portal_rodent_trapping.csv"))
    -trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots)
    -nplots_controls <- aggregate(trap_table_controls$sampled, 
    -                            by = list(period = trap_table_controls$period), 
    -                            FUN = sum)
    -
    -# adjust species counts by number of plots trapped that month
    -#   divide by number of control plots actually trapped (should be 8) and 
    -#   multiply by 8 to estimate captures as if all plots were trapped
    -ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]])
    -ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8)
    +
    # get the trapping effort for each sample
    +trap_table <- read.csv(file.path(vignette_files, "data", "Portal_rodent_trapping.csv"))
    +trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots)
    +nplots_controls <- aggregate(trap_table_controls$sampled, 
    +                            by = list(period = trap_table_controls$period), 
    +                            FUN = sum)
    +
    +# adjust species counts by number of plots trapped that month
    +#   divide by number of control plots actually trapped (should be 8) and 
    +#   multiply by 8 to estimate captures as if all plots were trapped
    +ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]])
    +ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8)

    Now we can compare the adjusted LDATS dataset with both the original ldats dataset and the dataset from the paper:

    -
    compare_raw <- rodents[[1]] == ldats_rodents_adjusted
    -length(which(rowSums(compare_raw) < ncol(compare_raw)))
    -#> [1] 16
    -
    -compare_adjusted <- ldats_rodents_adjusted == paper_dat
    -length(which(rowSums(compare_adjusted) < ncol(compare_adjusted)))
    -#> [1] 0
    +
    compare_raw <- rodents[[1]] == ldats_rodents_adjusted
    +length(which(rowSums(compare_raw) < ncol(compare_raw)))
    +
    +compare_adjusted <- ldats_rodents_adjusted == paper_dat
    +length(which(rowSums(compare_adjusted) < ncol(compare_adjusted)))

    Because the LDA procedure weights the information from documents (census periods) according to the number of words (rodents captured), we now believe it is most appropriate to run the LDA on unadjusted trapping data, and we recommend that users of LDATS do so. However, to maintain consistency with Christensen et al 2018, we will proceed using the adjusted rodent table in this vignette.

    -
    rodents[[1]] <- paper_dat
    +
    rodents[[1]] <- paper_dat

    The LDATS rodent data comes with a document_covariate_table, which we will use later as the predictor variables for the changepoint models. In this table, time is expressed as new moon numbers. Later we will want to be able to interpret the results in terms of census dates. We will add a column to the document_covariate_table to convert new moon numbers to census dates. We will not reference this column in any of the formulas we pass to the changepoint models, so it will be ignored until we need it.

    -
    head(rodents$document_covariate_table)
    -#>   newmoon sin_year cos_year
    -#> 1       1  -0.2470  -0.9690
    -#> 2       2  -0.6808  -0.7325
    -#> 3       3  -0.9537  -0.3008
    -#> 4       4  -0.9813   0.1925
    -#> 5       5  -0.7583   0.6519
    -#> 6       6  -0.3537   0.9354
    -
    -new_cov_table <- dplyr::left_join(rodents$document_covariate_table, 
    -                                  dplyr::select(moondat, newmoonnumber, censusdate), 
    -                                  by = c("newmoon" = "newmoonnumber")) %>%
    -  dplyr::rename(date = censusdate)
    -
    -rodents$document_covariate_table <- new_cov_table
    +
    head(rodents$document_covariate_table)
    +#>   newmoon sin_year cos_year
    +#> 1       1  -0.2470  -0.9690
    +#> 2       2  -0.6808  -0.7325
    +#> 3       3  -0.9537  -0.3008
    +#> 4       4  -0.9813   0.1925
    +#> 5       5  -0.7583   0.6519
    +#> 6       6  -0.3537   0.9354
    +
    new_cov_table <- dplyr::left_join(rodents$document_covariate_table, 
    +                                  dplyr::select(moondat, newmoonnumber, censusdate), 
    +                                  by = c("newmoon" = "newmoonnumber")) %>%
    +                                  dplyr::rename(date = censusdate)
    +
    +rodents$document_covariate_table <- new_cov_table

    Identify community groups using LDA

    While LDATS can run start-to-finish with LDATS::LDA_TS, here we will work through the process function-by-function to isolate differences with the paper. For a breakdown of the LDA_TS pipeline, see the codebase vignette.

    First, we run the LDA models from LDATS to identify the number of topics:

    -
    ldats_ldas <- LDATS::LDA_set(document_term_table = rodents$document_term_table, 
    -                             topics = 2:6, nseeds = nseeds)
    -ldats_ldamodel <- LDATS::select_LDA(LDA_models = ldats_ldas)[[1]]
    -
    -saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS"))
    +
    ldats_ldas <- LDATS::LDA_set(document_term_table = rodents$document_term_table, 
    +                             topics = 2:6, nseeds = nseeds)
    +ldats_ldamodel <- LDATS::select_LDA(LDA_models = ldats_ldas)[[1]]
    +
    +saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS"))

    Second, we run the LDA models from Christensen et al. to do the same task:

    -
    source(file.path(vignette_files, "AIC_model_selection.R"))
    -source(file.path(vignette_files, "LDA-distance.R"))
    -
    -# Some of the functions require the data to be stored in the `dat` variable
    -dat <- paper_dat
    -
    -# Fit a bunch of LDA models with different seeds
    -# Only use even numbers for seeds because consecutive seeds give identical results
    -seeds <- 2 * seq(nseeds)
    -
    -# repeat LDA model fit and AIC calculation with a bunch of different seeds to test robustness of the analysis
    -best_ntopic <- repeat_VEM(paper_dat,
    -                          seeds,
    -                          topic_min = 2,
    -                          topic_max = 6)
    -hist(best_ntopic$k, breaks = seq(from = 0.5, to = 9.5), 
    -     xlab = "best # of topics", main = "")
    -
    -# 2b. how different is species composition of 4 community-types when LDA is run with different seeds?
    -# ==================================================================
    -# get the best 100 seeds where 4 topics was the best LDA model
    -seeds_4topics <- best_ntopic %>%
    -  filter(k == 4) %>%
    -  arrange(aic) %>%
    -  head(min(100, nseeds)) %>%
    -  pull(SEED)
    -
    -# choose seed with highest log likelihood for all following analyses
    -#    (also produces plot of community composition for "best" run compared to "worst")
    -
    -png(file.path(vignette_files, "lda_distances.png"), width = 800, height = 400)
    -dat <- paper_dat # calculate_LDA_distance has some required named variables
    -best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics)
    -dev.off()
    -mean_dist <- unlist(best_seed)[2]
    -max_dist <- unlist(best_seed)[3]
    -
    -# ==================================================================
    -# 3. run LDA model
    -# ==================================================================
    -ntopics <- 4
    -SEED <- unlist(best_seed)[1]  # For the paper, use seed 206
    -ldamodel <- LDA(paper_dat, ntopics, control = list(seed = SEED), method = "VEM")
    -
    -saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS"))
    -
    knitr::include_graphics(file.path(vignette_files, "lda_distances.png"))
    +
    source(file.path(vignette_files, "scripts", "AIC_model_selection.R"))
    +source(file.path(vignette_files, "scripts", "LDA-distance.R"))
    +
    +# Some of the functions require the data to be stored in the `dat` variable
    +dat <- paper_dat
    +
    +# Fit a bunch of LDA models with different seeds
    +# Only use even numbers for seeds because consecutive seeds give identical results
    +seeds <- 2 * seq(nseeds)
    +
    +# repeat LDA model fit and AIC calculation with a bunch of different seeds to test robustness of the analysis
    +best_ntopic <- repeat_VEM(paper_dat,
    +                          seeds,
    +                          topic_min = 2,
    +                          topic_max = 6)
    +hist(best_ntopic$k, breaks = seq(from = 0.5, to = 9.5), 
    +     xlab = "best # of topics", main = "")
    +
    +# 2b. how different is species composition of 4 community-types when LDA is run with different seeds?
    +# ==================================================================
    +# get the best 100 seeds where 4 topics was the best LDA model
    +seeds_4topics <- best_ntopic %>%
    +  filter(k == 4) %>%
    +  arrange(aic) %>%
    +  head(min(100, nseeds)) %>%
    +  pull(SEED)
    +
    +# choose seed with highest log likelihood for all following analyses
    +#    (also produces plot of community composition for "best" run compared to "worst")
    +
    +png(file.path(vignette_files, "output", "lda_distances.png"), width = 800, height = 400)
    +dat <- paper_dat # calculate_LDA_distance has some required named variables
    +best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics)
    +dev.off()
    +mean_dist <- unlist(best_seed)[2]
    +max_dist <- unlist(best_seed)[3]
    +
    +# ==================================================================
    +# 3. run LDA model
    +# ==================================================================
    +ntopics <- 4
    +SEED <- unlist(best_seed)[1]  # For the paper, use seed 206
    +ldamodel <- LDA(paper_dat, ntopics, control = list(seed = SEED), method = "VEM")
    +
    +saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS"))
    +
    knitr::include_graphics(file.path(vignette_files, "output", "lda_distances.png"))

    Plots

    To visualize the LDA assignment of species to topics, we load in the saved LDA models from previously:

    -
    ldamodel <- readRDS(file.path(vignette_files, "paper_ldamodel.RDS"))
    -ldats_ldamodel <- readRDS(file.path(vignette_files, "ldats_ldamodel.RDS"))
    +
    ldamodel <- readRDS(file.path(vignette_files, "output", "paper_ldamodel.RDS"))
    +ldats_ldamodel <- readRDS(file.path(vignette_files, "output", "ldats_ldamodel.RDS"))

    How the paper LDA model assigns species to topics:

    -
    plot(ldamodel, cols = NULL, option = "D")
    +
    plot(ldamodel, cols = NULL, option = "D")

    How the LDATS LDA model assigns species to topics:

    -
    plot(ldats_ldamodel, cols = NULL, option = "D")
    +
    plot(ldats_ldamodel, cols = NULL, option = "D")

    The paper method finds 4 topics and LDATS finds 6. This is because of an update to the model selection procedure. The paper conservatively overestimates the number of parameters (by counting all of the variational parameters) and therefore overpenalizes the AIC for models with more topics. Comparatively, the LDATS method now uses the number of parameters remaining after the variational approximation, as returned by the LDA object. For this vignette, we will compare the results from using both LDA models.

    @@ -709,92 +732,80 @@

    Changepoint models

    Running paper changepoint models

    We define a few helper functions for running the changepoints model of Christensen et al. and processing the output to obtain the dates:

    -
    #### Run changepoint ####
    -source(file.path(vignette_files, "changepointmodel.r"))
    -#> 
    -#> Attaching package: 'lubridate'
    -#> The following object is masked from 'package:base':
    -#> 
    -#>     date
    -#> Loading required package: viridisLite
    -
    -find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6)
    -{
    -  # set up parameters for model
    -  x <- dplyr::select(paper_covariates, 
    -                     year_continuous, 
    -                     sin_year, 
    -                     cos_year)
    -
    -  # run models with 1, 2, 3, 4, 5, 6 changepoints
    -  cpt_results <- data.frame(n_changepoints = n_changepoints)
    -  cpt_results$cpt_model <- lapply(cpt_results$n_changepoints,
    -                                  function(n_changepoints)
    -                                  {
    -                                    changepoint_model(lda_model, x, n_changepoints, maxit = nit, 
    -                                                      weights = rep(1, NROW(x)))
    -                                  })
    -  return(cpt_results)
    -}
    -
    -# Among a selection of models with different # of changepoints, 
    -#   - compute AIC
    -#   - select the model with the best AIC
    -#   - get the posterior distributions for the changepoints
    -select_cpt_model <- function(cpt_results, ntopics)
    -{
    -  # compute log likelihood as the mean deviance
    -  cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, 
    -                                       function(cpt_model) {mean(cpt_model$saved_lls)}, 
    -                                       0)
    -
    -  # compute AIC = ( -2 * log likelihood) + 2 * (#parameters)
    -  cpt_results$AIC <- cpt_results$mean_deviances * -2 + 
    -    2 * (3 * (ntopics - 1) * (cpt_results$n_changepoints + 1) +
    -           (cpt_results$n_changepoints))
    -  
    -  # select the best model
    -  cpt <- cpt_results$cpt_model[[which.min(cpt_results$AIC)]]
    -  return(cpt)
    -}
    -
    -# transform the output from `compute_cpt` and match up the time indices with 
    -#   dates from the original data
    -get_dates <- function(cpt, covariates = paper_covariates)
    -{
    -  cpt$saved[,1,] %>%
    -    t() %>%
    -    as.data.frame() %>%
    -    reshape::melt() %>%
    -    dplyr::left_join(covariates, by = c("value" = "index"))
    -}
    +
    #### Run changepoint ####
    +source(file.path(vignette_files, "scripts", "changepointmodel.r"))
    +
    +find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6){
    +  # set up parameters for model
    +  x <- dplyr::select(paper_covariates, 
    +                     year_continuous, 
    +                     sin_year, 
    +                     cos_year)
    +
    +  # run models with 1, 2, 3, 4, 5, 6 changepoints
    +  cpt_results <- data.frame(n_changepoints = n_changepoints)
    +  cpt_results$cpt_model <- lapply(cpt_results$n_changepoints,
    +                                  function(n_changepoints){
    +                                    changepoint_model(lda_model, x, n_changepoints, maxit = nit, 
    +                                                      weights = rep(1, NROW(x)))
    +                                  })
    +  return(cpt_results)
    +}
    +
    +# Among a selection of models with different # of changepoints, 
    +#   - compute AIC
    +#   - select the model with the best AIC
    +#   - get the posterior distributions for the changepoints
    +select_cpt_model <- function(cpt_results, ntopics){
    +  # compute log likelihood as the mean deviance
    +  cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, 
    +                                       function(cpt_model) {mean(cpt_model$saved_lls)}, 
    +                                       0)
    +
    +  # compute AIC = ( -2 * log likelihood) + 2 * (#parameters)
    +  cpt_results$AIC <- cpt_results$mean_deviances * -2 + 
    +    2 * (3 * (ntopics - 1) * (cpt_results$n_changepoints + 1) +
    +           (cpt_results$n_changepoints))
    +  
    +  # select the best model
    +  cpt <- cpt_results$cpt_model[[which.min(cpt_results$AIC)]]
    +  return(cpt)
    +}
    +
    +# transform the output from `compute_cpt` and match up the time indices with 
    +#   dates from the original data
    +get_dates <- function(cpt, covariates = paper_covariates){
    +  cpt$saved[,1,] %>%
    +    t() %>%
    +    as.data.frame() %>%
    +    reshape::melt() %>%
    +    dplyr::left_join(covariates, by = c("value" = "index"))
    +}

    LDATS LDA and paper changepoint

    Run the Christensen et al. time series model to identify changepoints on the LDA topics selected by LDATS:

    -
    ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates)
    -
    -saveRDS(ldats_paper_results, file = file.path(vignette_files, "ldats_paper.RDS"))
    +
    ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates)
    +
    +saveRDS(ldats_paper_results, file = file.path(vignette_files, "output", "ldats_paper.RDS"))

    Extract the dates of the changepoints:

    -
    ldats_paper_results <- readRDS(file.path(vignette_files, "ldats_paper.RDS"))
    -
    -ldats_paper_cpt <- select_cpt_model(ldats_paper_results, 
    -                                    ntopics = ldats_ldamodel@k)
    -ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt)
    -#> Using  as id variables
    +
    ldats_paper_results <- readRDS(file.path(vignette_files, "output", "ldats_paper.RDS"))
    +
    +ldats_paper_cpt <- select_cpt_model(ldats_paper_results, 
    +                                    ntopics = ldats_ldamodel@k)
    +ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt)

    Paper LDA and paper changepoint

    Run the Christensen et al. time series model to identify changepoints on the LDA topics selected by Christensen et al.:

    -
    paper_paper_results <- find_changepoints(ldamodel, paper_covariates)
    -
    -saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS"))
    +
    paper_paper_results <- find_changepoints(ldamodel, paper_covariates)
    +
    +saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS"))

    Extract the dates of the changepoints:

    -
    paper_paper_results <- readRDS(file.path(vignette_files, "paper_paper.RDS"))
    -
    -paper_paper_cpt <- select_cpt_model(paper_paper_results, 
    -                                    ntopics = ldamodel@k)
    -paper_paper_cpt_dates <- get_dates(ldats_paper_cpt)
    -#> Using  as id variables
    +
    paper_paper_results <- readRDS(file.path(vignette_files, "output", "paper_paper.RDS"))
    +
    +paper_paper_cpt <- select_cpt_model(paper_paper_results, 
    +                                    ntopics = ldamodel@k)
    +paper_paper_cpt_dates <- get_dates(ldats_paper_cpt)
    @@ -802,121 +813,122 @@

    Running LDATS changepoint models

    LDATS LDA and LDATS changepoint

    Run the LDATS time series model to identify changepoints on the LDA topics selected by LDATS:

    -
    ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel,
    -                                 document_covariate_table = rodents$document_covariate_table,
    -                                 formulas = ~ sin_year + cos_year,
    -                                 nchangepoints = 1:6,
    -                                 timename = "newmoon",
    -                                 weights = NULL,
    -                                 control = list(nit = nit))
    -
    -saveRDS(ldats_ldats_results, file = file.path(vignette_files, "ldats_ldats.RDS"))
    +
    ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel,
    +                                 document_covariate_table = rodents$document_covariate_table,
    +                                 formulas = ~ sin_year + cos_year,
    +                                 nchangepoints = 1:6,
    +                                 timename = "newmoon",
    +                                 weights = NULL,
    +                                 control = list(nit = nit))
    +
    +saveRDS(ldats_ldats_results, file = file.path(vignette_files, "output", "ldats_ldats.RDS"))

    Unlike the paper changepoint model, LDATS can recognize that sampling periods may not be equidistant, and can place changepoint estimates at new moons if they fall between nonconsecutive sampling periods. We can estimate the dates corresponding to those new moons, extrapolating from the census dates for adjacent census periods.

    -
    # make the full sequence of possible newmoon values
    -full_index <- seq(min(rodents$document_covariate_table$newmoon), 
    -                  max(rodents$document_covariate_table$newmoon))
    -
    -# generate a lookup table with dates for the newmoons, using `approx` to 
    -#   linearly interpolate the missing values
    -ldats_dates <- approx(rodents$document_covariate_table$newmoon, 
    -                     as.Date(rodents$document_covariate_table$date), 
    -                     full_index) %>%
    -  as.data.frame() %>%
    -  mutate(index = x, 
    -         date = as.Date(y, origin = "1970-01-01")) %>%
    -  select(index, date)
    +
    # make the full sequence of possible newmoon values
    +full_index <- seq(min(rodents$document_covariate_table$newmoon), 
    +                  max(rodents$document_covariate_table$newmoon))
    +
    +# generate a lookup table with dates for the newmoons, using `approx` to 
    +#   linearly interpolate the missing values
    +ldats_dates <- approx(rodents$document_covariate_table$newmoon, 
    +                     as.Date(rodents$document_covariate_table$date), 
    +                     full_index) %>%
    +  as.data.frame() %>%
    +  mutate(index = x, 
    +         date = as.Date(y, origin = "1970-01-01")) %>%
    +  select(index, date)

    Select the best time series model and extract the dates of the changepoints:

    -
    ldats_ldats_results <- readRDS(file.path(vignette_files, "ldats_ldats.RDS"))
    -  
    -ldats_ldats_cpt <- select_TS(ldats_ldats_results)
    -
    -ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>%
    -  as.data.frame() %>%
    -  reshape::melt() %>%
    -  dplyr::left_join(ldats_dates, by = c("value" = "index"))
    -#> Using  as id variables
    +
    ldats_ldats_results <- readRDS(file.path(vignette_files, "output", "ldats_ldats.RDS"))
    +  
    +ldats_ldats_cpt <- select_TS(ldats_ldats_results)
    +
    +ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>%
    +  as.data.frame() %>%
    +  reshape::melt() %>%
    +  dplyr::left_join(ldats_dates, by = c("value" = "index"))

    Paper LDA and LDATS changepoint

    Run the LDATS time series model to identify changepoints on the LDA topics selected by Christensen et al.:

    -
    paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel,
    -                             document_covariate_table = rodents$document_covariate_table,
    -                             formulas = ~ sin_year + cos_year,
    -                             nchangepoints = 1:6,
    -
    -                             timename = "newmoon",
    -                             weights = NULL,
    -                             control = list(nit = nit))
    -
    -
    -saveRDS(paper_ldats_results, file = file.path(vignette_files, "paper_ldats.RDS"))
    +
    paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel,
    +                             document_covariate_table = rodents$document_covariate_table,
    +                             formulas = ~ sin_year + cos_year,
    +                             nchangepoints = 1:6,
    +
    +                             timename = "newmoon",
    +                             weights = NULL,
    +                             control = list(nit = nit))
    +
    +
    +saveRDS(paper_ldats_results, file = file.path(vignette_files, "output", "paper_ldats.RDS"))

    Select the best time series model and extract the dates of the changepoints:

    -
    paper_ldats_results <- readRDS(file.path(vignette_files, "paper_ldats.RDS"))
    -  
    -paper_ldats_cpt <- select_TS(paper_ldats_results)
    -
    -paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>%
    -  as.data.frame() %>%
    -  reshape::melt() %>%
    -  dplyr::left_join(ldats_dates, by = c("value" = "index"))
    -#> Using  as id variables
    +
    paper_ldats_results <- readRDS(file.path(vignette_files, "output", "paper_ldats.RDS"))
    +  
    +paper_ldats_cpt <- select_TS(paper_ldats_results)
    +
    +paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>%
    +  as.data.frame() %>%
    +  reshape::melt() %>%
    +  dplyr::left_join(ldats_dates, by = c("value" = "index"))

    How many changepoints were identified?

    -
    nlevels(ldats_paper_cpt_dates$variable)
    -#> [1] 4
    -nlevels(paper_paper_cpt_dates$variable)
    -#> [1] 4
    -nlevels(ldats_ldats_cpt_dates$variable)
    -#> [1] 4
    -nlevels(paper_ldats_cpt_dates$variable)
    -#> [1] 4
    +
    nlevels(ldats_paper_cpt_dates$variable)
    +#> [1] 4
    +nlevels(paper_paper_cpt_dates$variable)
    +#> [1] 4
    +nlevels(ldats_ldats_cpt_dates$variable)
    +#> [1] 4
    +nlevels(paper_ldats_cpt_dates$variable)
    +#> [1] 4

    All of the models find four changepoints.

    Plot changepoint models

    Paper LDA and LDATS changepoint

    -
    plot(paper_ldats_cpt)
    +
    plot(paper_ldats_cpt)

    LDATS LDA and LDATS changepoint

    -
    plot(ldats_ldats_cpt)
    +
    plot(ldats_ldats_cpt)

    +
    annual_hist <- readRDS(file.path(vignette_files, "output", "annual_hist.RDS"))

    Paper LDA and paper changepoint

    -
    paper_cpts <- find_changepoint_location(paper_paper_cpt)
    -ntopics <- ldamodel@k
    -
    -paper_cpt_plot <- get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE,
    -                                           weights = rep(1, NROW(paper_covariates)))
    -
    -annual_hist(paper_paper_cpt, paper_covariates$year_continuous)
    -

    -
    paper_cpt_plot
    -

    +
    paper_cpts <- find_changepoint_location(paper_paper_cpt)
    +ntopics <- ldamodel@k
    +
    +png(file.path(vignette_files, "output", "paper_paper_cpt_plot.png"), width = 800, height = 600)
    +get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE,
    +                                           weights = rep(1, NROW(paper_covariates)))
    +dev.off()
    +
    paper_paper_hist <- annual_hist(paper_paper_cpt, paper_covariates$year_continuous)
    +

    +
    knitr::include_graphics(file.path(vignette_files, "output", "paper_paper_cpt_plot.png"))
    +

    LDATS LDA and paper changepoint

    -
    ldats_cpts <- find_changepoint_location(ldats_paper_cpt)
    -ntopics <- ldats_ldamodel@k
    -
    -ldats_cpt_plot <- get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE,
    -                                           weights = rep(1, NROW(paper_covariates)))
    -
    -annual_hist(ldats_paper_cpt, paper_covariates$year_continuous)
    -

    -
    ldats_cpt_plot
    -

    +
    ldats_cpts <- find_changepoint_location(ldats_paper_cpt)
    +ntopics <- ldats_ldamodel@k
    +
    +png(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png"), width = 800, height = 600)
    +get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE,
    +                                           weights = rep(1, NROW(paper_covariates)))
    +dev.off()
    +
    ldats_paper_hist <- annual_hist(ldats_paper_cpt, paper_covariates$year_continuous)
    +

    +
    knitr::include_graphics(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png"))
    +

    The results of the changepoint model appear robust to both choice of LDA model and choice of changepoint model.

    Report changepoint dates

    -
    knitr::kable(cpt_dates)
    +
    knitr::kable(cpt_dates)
    diff --git a/doc/rodents-example.R b/doc/rodents-example.R index 46f0fdbf..32611cf2 100644 --- a/doc/rodents-example.R +++ b/doc/rodents-example.R @@ -1,44 +1,44 @@ -## ----setup, include=FALSE------------------------------------------------ +## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) -## ---- include=FALSE------------------------------------------------------ +## ---- include=FALSE----------------------------------------------------------- library(LDATS) vers <- packageVersion("LDATS") today <- Sys.Date() -## ---- eval=FALSE--------------------------------------------------------- +## ---- eval=FALSE-------------------------------------------------------------- # install.packages("devtools") # devtools::install_github("weecology/LDATS") # library(LDATS) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(rodents) head(rodents$document_term_table, 10) head(rodents$document_covariate_table, 10) -## ----lda_set, eval =F---------------------------------------------------- +## ----lda_set, eval =F--------------------------------------------------------- # lda_model_set <- LDA_set(document_term_table = rodents$document_term_table, # topics = c(2:5), # nseeds = 10, # control = list(quiet = TRUE)) # -## ----lda set not quiet, eval =F------------------------------------------ +## ----lda set not quiet, eval =F----------------------------------------------- # lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, # topics = c(2:3), # nseeds = 2) -## ----load lda model set, include = F------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +## ----load lda model set, include = F------------------------------------------ +load(file.path('.', 'rodents-example-files', 'lda_model_set.Rds')) rm(lda_model_set2) -## ----select LDA---------------------------------------------------------- +## ----select LDA--------------------------------------------------------------- selected_lda_model <- select_LDA(lda_model_set) -## ----LDA results--------------------------------------------------------- +## ----LDA results-------------------------------------------------------------- # Number of topics: selected_lda_model[[1]]@k @@ -48,11 +48,11 @@ selected_lda_model[[1]]@k head(selected_lda_model[[1]]@gamma) -## ----plot lda, fig.width=7, fig.height=6--------------------------------- +## ----plot lda, fig.width=7, fig.height=6-------------------------------------- plot(selected_lda_model[[1]]) -## ----ts on lda, eval = F------------------------------------------------- +## ----ts on lda, eval = F------------------------------------------------------ # changepoint_models <- TS_on_LDA(LDA_models = selected_lda_model, # document_covariate_table = rodents$document_covariate_table, # formulas = ~ sin_year + cos_year, @@ -62,13 +62,13 @@ plot(selected_lda_model[[1]]) # control = list(nit = 1000)) # -## ----reload ts, include = F---------------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds')) +## ----reload ts, include = F--------------------------------------------------- +load(file.path('.', 'rodents-example-files', 'changepoint_models.Rds')) -## ----select ts----------------------------------------------------------- +## ----select ts---------------------------------------------------------------- selected_changepoint_model <- select_TS(changepoint_models) -## ----cpt results--------------------------------------------------------- +## ----cpt results-------------------------------------------------------------- # Number of changepoints selected_changepoint_model$nchangepoints @@ -80,10 +80,10 @@ selected_changepoint_model$rho_summary head(selected_changepoint_model$rhos) -## ----plot cpt, fig.width=7, fig.height=6--------------------------------- +## ----plot cpt, fig.width=7, fig.height=6-------------------------------------- plot(selected_changepoint_model) -## ----lda_ts, eval = F---------------------------------------------------- +## ----lda_ts, eval = F--------------------------------------------------------- # lda_ts_results <- LDA_TS(data = rodents, # nseeds = 10, # topics = 2:5, @@ -92,10 +92,10 @@ plot(selected_changepoint_model) # timename = "newmoon", # control = list(nit = 1000)) -## ----load ldats results, include = F------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +## ----load ldats results, include = F------------------------------------------ +load(file.path('.', 'rodents-example-files', 'lda_ts_results.Rds')) -## ----LDA_TS results------------------------------------------------------ +## ----LDA_TS results----------------------------------------------------------- names(lda_ts_results) # Number of topics @@ -107,6 +107,6 @@ lda_ts_results$`Selected TS model`$nchangepoints # Summary of changepoint locations lda_ts_results$`Selected TS model`$rho_summary -## ----plot LDA_TS results, fig.height = 16, fig.width = 7, echo = F------- +## ----plot LDA_TS results, fig.height = 16, fig.width = 7, echo = F------------ plot(lda_ts_results) diff --git a/doc/rodents-example.Rmd b/doc/rodents-example.Rmd index ef390a6d..26fb9342 100644 --- a/doc/rodents-example.Rmd +++ b/doc/rodents-example.Rmd @@ -81,7 +81,7 @@ lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, `LDA_set()` returns a list of LDA models. We use `select_LDA()` to identify the best number of topics and choice of seed from our set of models. By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. ```{r load lda model set, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +load(file.path('.', 'rodents-example-files', 'lda_model_set.Rds')) rm(lda_model_set2) ``` @@ -137,7 +137,7 @@ Also, it is important to note that by default the TS functions take the name of `select_TS()` will identify the best-fit changepoint model of the models from `TS_on_LDA()`. As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list. ```{r reload ts, include = F} -load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds')) +load(file.path('.', 'rodents-example-files', 'changepoint_models.Rds')) ``` ```{r select ts} @@ -180,7 +180,7 @@ lda_ts_results <- LDA_TS(data = rodents, ``` ```{r load ldats results, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +load(file.path('.', 'rodents-example-files', 'lda_ts_results.Rds')) ``` `LDA_TS()` returns a list of all the model objects, and we can access their contents as above: diff --git a/doc/rodents-example.html b/doc/rodents-example.html index 1e83d20e..5d7143f2 100644 --- a/doc/rodents-example.html +++ b/doc/rodents-example.html @@ -1,15 +1,14 @@ - + - - + @@ -120,9 +119,6 @@ font-size: 14px; line-height: 1.35; } -#header { -text-align: center; -} #TOC { clear: both; margin: 0 0 10px 10px; @@ -290,7 +286,8 @@ code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } -code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } +code > span.fu { color: #900; font-weight: bold; } +code > span.er { color: #a61717; background-color: #e3d2d2; } @@ -308,7 +305,7 @@

    Renata Diaz and Juniper L. Simonis

    -

    This vignette walks through an example of LDATS at the command line and was constructed using LDATS version 0.3.0 on 2019-10-12.

    +

    This vignette walks through an example of LDATS at the command line and was constructed using LDATS version 0.2.7 on 2020-03-18.

    Installation

    To obtain the most recent version of LDATS, install and load the most recent version from GitHub:

    diff --git a/vignettes/paper-comparison.Rmd b/vignettes/paper-comparison.Rmd index b5b34072..9e970557 100644 --- a/vignettes/paper-comparison.Rmd +++ b/vignettes/paper-comparison.Rmd @@ -1,6 +1,6 @@ --- title: "Comparison to Christensen et al. 2018" -author: "Renata Diaz and Hao Ye" +author: "Renata Diaz, Juniper Simonis, and Hao Ye" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{paper-comparison} @@ -22,7 +22,8 @@ vers <- packageVersion("LDATS") # Introduction -This document provides a side-by-side comparison of **`LDATS`** (version `r vers`) results with analysis from [Christensen et al. 2018](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.2373). +This document provides a side-by-side comparison of **`LDATS`** (version `r vers`) results with analysis from [Christensen et al. 2018](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.2373). +Due to the size and duration of model runs, we use pre-generated model output from the [LDATS-replications repo](https://github.com/weecology/LDATS-replications). ## Summary @@ -40,7 +41,7 @@ This document provides a side-by-side comparison of **`LDATS`** (version `r vers To obtain the most recent version of **`LDATS`**, install the most recent version from GitHub: ```{r, eval = FALSE} -# install.packages("devtools") +install.packages("devtools") devtools::install_github("weecology/LDATS") ``` @@ -52,6 +53,13 @@ nseeds <- 200 nit <- 10000 ``` +To run the analyses here, you will also need to download **`dplyr`**, **`gridExtra`**, **`multipanel`**, **`RColorBrewer`**, **`RCurl`**, and **`reshape2`** as the manuscript's code relies on these packages. + +```{r, eval = params$run_models} +install.packages(c("dplyr", "gridExtra", "multipanel", "RColorBrewer", "RCurl", "reshape2")) +``` + + ## Running the Models Because both the Latent Dirichlet Allocation (LDA) and time series components of the analysis can take a long time to run (especially with the settings above for the number of seeds and iterations), we will use pre-generated model outputs and turn off certain code chunks that run the models using a global `rmarkdown` parameter, `run_models = FALSE`. @@ -69,7 +77,7 @@ We're going to download analysis scripts, data files, and model objects, so we u vignette_files <- tempdir() ``` -To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from [Extreme-events-LDA repo](https://github.com/emchristensen/Extreme-events-LDA), as well as some raw data files from the [PortalData repo](https://github.com/weecology/PortalData): +To replicate the Christensen et al. 2018 analysis, we download some of the original scripts & data files from [Extreme-events-LDA repo](https://github.com/emchristensen/Extreme-events-LDA), as well as some raw data files from the [PortalData repo](https://github.com/weecology/PortalData), which are stored in the [LDATS-replications repo](https://github.com/weecology/LDATS-replications): Main Analysis Scripts: @@ -99,37 +107,48 @@ Data: * `Portal_rodent_trapping.csv` - table of trapping effort (downloaded from the PortalData repository) +* `paper_dat.csv` + - rodent data table from Christensen et al. 2018 + +* `paper_dates.csv` + - dates used in Christensen et al. 2018 + +* `paper_covariates.csv` + - table of dates and covariate data + Figure scripts: * `LDA_figure_scripts.R` - contains functions for making main plots in manuscript (Fig 1). Called from rodent_LDA_analysis.R -```{r download scripts} -test_file <- file.path(vignette_files, "rodent_LDA_analysis.r") +```{r download scripts and data} +test_file <- file.path(vignette_files, "scripts", "rodent_LDA_analysis.r") -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/emchristensen/Extreme-events-LDA/master/" +if (!file.exists(test_file)){ + + # scripts + dir.create(file.path(vignette_files, "scripts"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/scripts/" files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", "AIC_model_selection.R", "changepointmodel.r", - "LDA-distance.R", "Rodent_table_dat.csv", - "LDA_figure_scripts.R") + "LDA-distance.R", "LDA_figure_scripts.R") - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "scripts", file)) } + + + # data + dir.create(file.path(vignette_files, "data"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/data/" + files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv", + "Rodent_table_dat.csv", "paper_dat.csv", + "paper_dates.csv", "paper_covariates.csv") - # from the PortalData repo - github_path <- "https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/" - files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv") - - for (file in files_to_download) - { + for (file in files_to_download) { download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) + destfile = file.path(vignette_files, "data", file)) } } ``` @@ -146,46 +165,66 @@ LDA models: * `paper_ldamodel.RDS` - the best LDA model as selected by the Christensen et al. analysis -Changepoint outputs +Changepoint outputs: -* `ldats_ldats.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the LDATS changepoint selection +* `ldats_ldats.RDS`, `ldats_ldats_cpt.RDS`, `ldats_ldats_cpt_dates.RDS` + - the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the LDATS changepoint selection -* `ldats_paper.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the paper's changepoint selection +* `ldats_paper.RDS`, `ldats_paper_cpt.RDS`, `ldats_paper_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the LDATS LDA model and the paper's changepoint selection -* `paper_ldats.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the LDATS changepoint selection +* `paper_ldats.RDS`, `paper_ldats_cpt.RDS`, `paper_ldats_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the paper LDA model and the LDATS changepoint selection -* `paper_paper.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the paper's changepoint selection +* `paper_paper.RDS`, `paper_paper_cpt.RDS`, `paper_paper_cpt_dates.RDS + - the posterior distribution, count, and dates of changepoints, using the paper LDA model and the paper's changepoint selection + +* `cpt_dates.RDS` + - summary table of changepoint results across models Figures * `lda_distances.png` - figure showing the variance in the topics identified by the paper's LDA model code -```{r download pre-generated model outputs, eval = !params$run_model} -test_file <- file.path(vignette_files, "ldats_ldamodel.RDS") +* `paper_paper_cpt_plot.png` + - figure showing the time series results for the paper analysis of the paper LDA + +* `ldats_paper_cpt_plot.png` + - figure showing the time series results for the paper analysis of the LDATS LDA + +* `annual_hist.RDS` + - function for making histogram of change points over years + -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/model_outputs/" +```{r download pre-generated model outputs} +test_file <- file.path(vignette_files, "output", "ldats_ldamodel.RDS") + +if (!file.exists(test_file)){ + + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/output/" files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", "ldats_ldats.RDS", "ldats_paper.RDS", "paper_ldats.RDS", "paper_paper.RDS", - "lda_distances.png") - - for (file in files_to_download) - { + "ldats_rodents_adjusted.RDS", "rodents.RDS", + "ldats_paper_cpt.RDS", "ldats_paper_cpt_dates.RDS", + "ldats_ldats_cpt.RDS", "ldats_ldats_cpt_dates.RDS", + "paper_paper_cpt.RDS", "paper_paper_cpt_dates.RDS", + "paper_ldats_cpt.RDS", "paper_ldats_cpt_dates.RDS", + "annual_hist.RDS", "cpt_dates.RDS", + "lda_distances.png", "paper_paper_cpt_plot.png", + "ldats_paper_cpt_plot.png") + + for (file in files_to_download){ download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file), + destfile = file.path(vignette_files, "output", file), mode = "wb") } } ``` + # Data Comparison The dataset of Portal rodents on control plots is included in the LDATS package: @@ -198,14 +237,14 @@ head(rodents[[1]]) We can compare this against the data used in Christensen et al: -```{r Paper data} +```{r Paper data, eval = params$run_models} # parameters for subsetting the full Portal rodents data periods <- 1:436 control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22) species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO") -source(file.path(vignette_files, "rodent_data_for_LDA.r")) +source(file.path(vignette_files, "scripts", "rodent_data_for_LDA.r")) # assemble `paper_dat`, the data from Christensen et al. 2018 paper_dat <- create_rodent_table(period_first = min(periods), @@ -214,7 +253,7 @@ paper_dat <- create_rodent_table(period_first = min(periods), selected_species = species_list) # assemble `paper_covariates`, the associated dates and covariate data -moondat <- read.csv(file.path(vignette_files, "moon_dates.csv"), stringsAsFactors = F) +moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = F) paper_dates <- moondat %>% dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% @@ -231,6 +270,13 @@ paper_covariates <- data.frame( ) ``` +```{r Paper data2, eval = !params$run_models, include = FALSE} + moondat <- read.csv(file.path(vignette_files, "data", "moon_dates.csv"), stringsAsFactors = FALSE) + paper_dat <- read.csv(file.path(vignette_files, "data", "paper_dat.csv"), stringsAsFactors = FALSE) + paper_dates <- read.csv(file.path(vignette_files, "data", "paper_dates.csv"), stringsAsFactors = FALSE) + paper_covariates <- read.csv(file.path(vignette_files, "data", "paper_covariates.csv"), stringsAsFactors = FALSE) +``` + ## Compare the data from Christensen et al. with the included data in `LDATS` ```{r rodent data comparison} @@ -239,11 +285,11 @@ compare <- rodents[[1]] == paper_dat length(which(rowSums(compare) < ncol(compare))) ``` -There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data does, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses. +There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data is, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses. To confirm this, refer to lines 36-46 in `rodent_data_for_LDA.r`: -```{} +```{r Data adjustment eval, eval = FALSE} # retrieve data on number of plots trapped per month trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv') trap_table_controls = filter(trap_table, plot %in% selected_plots) @@ -259,9 +305,9 @@ To confirm this, refer to lines 36-46 in `rodent_data_for_LDA.r`: We can run the same procedure on the LDATS data to verify that we obtain a data.frame that matches. -```{r adjust LDATS data after Christensen et al, eval = TRUE} +```{r adjust LDATS data after Christensen et al, eval = params$run_models} # get the trapping effort for each sample -trap_table <- read.csv(file.path(vignette_files, "Portal_rodent_trapping.csv")) +trap_table <- read.csv(file.path(vignette_files, "data", "Portal_rodent_trapping.csv")) trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots) nplots_controls <- aggregate(trap_table_controls$sampled, by = list(period = trap_table_controls$period), @@ -274,9 +320,16 @@ ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]]) ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8) ``` +```{r eval = params$run_models, include = FALSE} +saveRDS(ldats_rodents_adjusted, file = file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) +``` + Now we can compare the adjusted LDATS dataset with both the original ldats dataset and the dataset from the paper: -```{r dataset comparisons} +```{r eval = !params$run_models, include = FALSE} +ldats_rodents_adjusted <- readRDS(file.path(vignette_files, "output", "ldats_rodents_adjusted.RDS")) +``` +```{r dataset comparisons, eval = params$run_models} compare_raw <- rodents[[1]] == ldats_rodents_adjusted length(which(rowSums(compare_raw) < ncol(compare_raw))) @@ -292,16 +345,25 @@ rodents[[1]] <- paper_dat The LDATS rodent data comes with a `document_covariate_table`, which we will use later as the predictor variables for the changepoint models. In this table, time is expressed as new moon numbers. Later we will want to be able to interpret the results in terms of census dates. We will add a column to the `document_covariate_table` to convert new moon numbers to census dates. We will not reference this column in any of the formulas we pass to the changepoint models, so it will be ignored until we need it. -```{r add dates to covariate table} + +```{r show covariate table} head(rodents$document_covariate_table) +``` +```{r eval = params$run_models, include = FALSE} +"%>%" <- dplyr::"%>%" +``` +```{r add dates to covariate table, eval = params$run_models} new_cov_table <- dplyr::left_join(rodents$document_covariate_table, dplyr::select(moondat, newmoonnumber, censusdate), by = c("newmoon" = "newmoonnumber")) %>% - dplyr::rename(date = censusdate) + dplyr::rename(date = censusdate) rodents$document_covariate_table <- new_cov_table ``` +```{r eval = params$run_models, include = FALSE} +saveRDS(rodents, file = file.path(vignette_files, "output", "rodents.RDS")) +``` # Identify community groups using LDA @@ -320,8 +382,8 @@ saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS")) Second, we run the LDA models from Christensen et al. to do the same task: ```{r paper LDAs, eval = params$run_models} -source(file.path(vignette_files, "AIC_model_selection.R")) -source(file.path(vignette_files, "LDA-distance.R")) +source(file.path(vignette_files, "scripts", "AIC_model_selection.R")) +source(file.path(vignette_files, "scripts", "LDA-distance.R")) # Some of the functions require the data to be stored in the `dat` variable dat <- paper_dat @@ -350,7 +412,7 @@ seeds_4topics <- best_ntopic %>% # choose seed with highest log likelihood for all following analyses # (also produces plot of community composition for "best" run compared to "worst") -png(file.path(vignette_files, "lda_distances.png"), width = 800, height = 400) +png(file.path(vignette_files, "output", "lda_distances.png"), width = 800, height = 400) dat <- paper_dat # calculate_LDA_distance has some required named variables best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics) dev.off() @@ -368,15 +430,15 @@ saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS")) ``` ```{r} -knitr::include_graphics(file.path(vignette_files, "lda_distances.png")) +knitr::include_graphics(file.path(vignette_files, "output", "lda_distances.png")) ``` ## Plots To visualize the LDA assignment of species to topics, we load in the saved LDA models from previously: ```{r} -ldamodel <- readRDS(file.path(vignette_files, "paper_ldamodel.RDS")) -ldats_ldamodel <- readRDS(file.path(vignette_files, "ldats_ldamodel.RDS")) +ldamodel <- readRDS(file.path(vignette_files, "output", "paper_ldamodel.RDS")) +ldats_ldamodel <- readRDS(file.path(vignette_files, "output", "ldats_ldamodel.RDS")) ``` How the paper LDA model assigns species to topics: @@ -406,12 +468,11 @@ Having divided the data to generate catch-per-effort, the paper changepoint mode We define a few helper functions for running the changepoints model of Christensen et al. and processing the output to obtain the dates: -```{r paper changepoint models} +```{r paper changepoint models, eval = params$run_models} #### Run changepoint #### -source(file.path(vignette_files, "changepointmodel.r")) +source(file.path(vignette_files, "scripts", "changepointmodel.r")) -find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) -{ +find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6){ # set up parameters for model x <- dplyr::select(paper_covariates, year_continuous, @@ -421,8 +482,7 @@ find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) # run models with 1, 2, 3, 4, 5, 6 changepoints cpt_results <- data.frame(n_changepoints = n_changepoints) cpt_results$cpt_model <- lapply(cpt_results$n_changepoints, - function(n_changepoints) - { + function(n_changepoints){ changepoint_model(lda_model, x, n_changepoints, maxit = nit, weights = rep(1, NROW(x))) }) @@ -433,8 +493,7 @@ find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) # - compute AIC # - select the model with the best AIC # - get the posterior distributions for the changepoints -select_cpt_model <- function(cpt_results, ntopics) -{ +select_cpt_model <- function(cpt_results, ntopics){ # compute log likelihood as the mean deviance cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, function(cpt_model) {mean(cpt_model$saved_lls)}, @@ -452,8 +511,7 @@ select_cpt_model <- function(cpt_results, ntopics) # transform the output from `compute_cpt` and match up the time indices with # dates from the original data -get_dates <- function(cpt, covariates = paper_covariates) -{ +get_dates <- function(cpt, covariates = paper_covariates){ cpt$saved[,1,] %>% t() %>% as.data.frame() %>% @@ -461,6 +519,10 @@ get_dates <- function(cpt, covariates = paper_covariates) dplyr::left_join(covariates, by = c("value" = "index")) } ``` +```{r save annual_hist, include = FALSE, eval = params$run_models} +saveRDS(annual_hist, file = file.path(vignette_files, "output", "annual_hist.RDS")) +``` + ### LDATS LDA and paper changepoint @@ -468,17 +530,21 @@ Run the Christensen et al. time series model to identify changepoints on the LDA ```{r run LDATS LDA and paper cpt, eval = params$run_models} ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates) -saveRDS(ldats_paper_results, file = file.path(vignette_files, "ldats_paper.RDS")) +saveRDS(ldats_paper_results, file = file.path(vignette_files, "output", "ldats_paper.RDS")) ``` Extract the dates of the changepoints: -```{r compute changepoints for LDATS LDA and paper cpt} -ldats_paper_results <- readRDS(file.path(vignette_files, "ldats_paper.RDS")) +```{r compute changepoints for LDATS LDA and paper cpt, eval = params$run_models} +ldats_paper_results <- readRDS(file.path(vignette_files, "output", "ldats_paper.RDS")) ldats_paper_cpt <- select_cpt_model(ldats_paper_results, ntopics = ldats_ldamodel@k) ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(ldats_paper_cpt, file = file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +saveRDS(ldats_paper_cpt_dates, file = file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) +``` ### Paper LDA and paper changepoint @@ -490,13 +556,17 @@ saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS") ``` Extract the dates of the changepoints: -```{r compute changepoints for paper LDA and paper cpt} -paper_paper_results <- readRDS(file.path(vignette_files, "paper_paper.RDS")) +```{r compute changepoints for paper LDA and paper cpt, eval = params$run_models} +paper_paper_results <- readRDS(file.path(vignette_files, "output", "paper_paper.RDS")) paper_paper_cpt <- select_cpt_model(paper_paper_results, ntopics = ldamodel@k) paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(paper_paper_cpt, file = file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +saveRDS(paper_paper_cpt_dates, file = file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) +``` ## Running LDATS changepoint models @@ -513,12 +583,12 @@ ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel, weights = NULL, control = list(nit = nit)) -saveRDS(ldats_ldats_results, file = file.path(vignette_files, "ldats_ldats.RDS")) +saveRDS(ldats_ldats_results, file = file.path(vignette_files, "output", "ldats_ldats.RDS")) ``` Unlike the paper changepoint model, LDATS can recognize that sampling periods may not be equidistant, and can place changepoint estimates at new moons if they fall between nonconsecutive sampling periods. We can estimate the dates corresponding to those new moons, extrapolating from the census dates for adjacent census periods. -```{r construct lookup table for LDATS output for changepoint times} +```{r construct lookup table for LDATS output for changepoint times, eval = params$run_models} # make the full sequence of possible newmoon values full_index <- seq(min(rodents$document_covariate_table$newmoon), max(rodents$document_covariate_table$newmoon)) @@ -536,8 +606,8 @@ ldats_dates <- approx(rodents$document_covariate_table$newmoon, Select the best time series model and extract the dates of the changepoints: -```{r compute changepoints for LDATS LDA and LDATS cpt} -ldats_ldats_results <- readRDS(file.path(vignette_files, "ldats_ldats.RDS")) +```{r compute changepoints for LDATS LDA and LDATS cpt, eval = params$run_models} +ldats_ldats_results <- readRDS(file.path(vignette_files, "output", "ldats_ldats.RDS")) ldats_ldats_cpt <- select_TS(ldats_ldats_results) @@ -546,6 +616,10 @@ ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% reshape::melt() %>% dplyr::left_join(ldats_dates, by = c("value" = "index")) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(ldats_ldats_cpt, file = file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +saveRDS(ldats_ldats_cpt_dates, file = file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) +``` ### Paper LDA and LDATS changepoint @@ -562,13 +636,13 @@ paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel, control = list(nit = nit)) -saveRDS(paper_ldats_results, file = file.path(vignette_files, "paper_ldats.RDS")) +saveRDS(paper_ldats_results, file = file.path(vignette_files, "output", "paper_ldats.RDS")) ``` Select the best time series model and extract the dates of the changepoints: -```{r select paper lda + ldats cpt} -paper_ldats_results <- readRDS(file.path(vignette_files, "paper_ldats.RDS")) +```{r select paper lda + ldats cpt, eval = params$run_models} +paper_ldats_results <- readRDS(file.path(vignette_files, "output", "paper_ldats.RDS")) paper_ldats_cpt <- select_TS(paper_ldats_results) @@ -577,9 +651,22 @@ paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>% reshape::melt() %>% dplyr::left_join(ldats_dates, by = c("value" = "index")) ``` +```{r include = FALSE, eval = params$run_models} +saveRDS(paper_ldats_cpt, file = file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +saveRDS(paper_ldats_cpt_dates, file = file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) +``` ## How many changepoints were identified? - +```{r eval = !params$run_models, include = FALSE} +ldats_paper_cpt <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt.RDS")) +paper_paper_cpt <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt.RDS")) +paper_ldats_cpt <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt.RDS")) +ldats_ldats_cpt <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt.RDS")) +ldats_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_paper_cpt_dates.RDS")) +paper_paper_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_paper_cpt_dates.RDS")) +paper_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "paper_ldats_cpt_dates.RDS")) +ldats_ldats_cpt_dates <- readRDS(file.path(vignette_files, "output", "ldats_ldats_cpt_dates.RDS")) +``` ```{r} nlevels(ldats_paper_cpt_dates$variable) nlevels(paper_paper_cpt_dates$variable) @@ -602,34 +689,57 @@ plot(paper_ldats_cpt) plot(ldats_ldats_cpt) ``` +```{r, eval = !params$run_models} +annual_hist <- readRDS(file.path(vignette_files, "output", "annual_hist.RDS")) +``` + ### Paper LDA and paper changepoint -```{r plot paper LDA and paper cpt, fig.width = 7, fig.height = 6} +```{r plot paper LDA and paper cpt, eval = params$run_models} paper_cpts <- find_changepoint_location(paper_paper_cpt) ntopics <- ldamodel@k -paper_cpt_plot <- get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, +png(file.path(vignette_files, "output", "paper_paper_cpt_plot.png"), width = 800, height = 600) +get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, weights = rep(1, NROW(paper_covariates))) +dev.off() +``` + +```{r plot paper LDA and LDATS cpts2, fig.width = 7, fig.height = 6} +paper_paper_hist <- annual_hist(paper_paper_cpt, paper_covariates$year_continuous) +``` -annual_hist(paper_paper_cpt, paper_covariates$year_continuous) -paper_cpt_plot +```{r} +knitr::include_graphics(file.path(vignette_files, "output", "paper_paper_cpt_plot.png")) ``` + ### LDATS LDA and paper changepoint -```{r plot LDATS lda and paper cpt, fig.width = 7, fig.height = 6} +```{r plot LDATS lda and paper cpt, eval = params$run_models} ldats_cpts <- find_changepoint_location(ldats_paper_cpt) ntopics <- ldats_ldamodel@k -ldats_cpt_plot <- get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, +png(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png"), width = 800, height = 600) +get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, weights = rep(1, NROW(paper_covariates))) +dev.off() +``` -annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) -ldats_cpt_plot +```{r plot LDATS lda and paper cpt2, fig.width = 7, fig.height = 6} +ldats_paper_hist <- annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) +``` + +```{r} +knitr::include_graphics(file.path(vignette_files, "output", "ldats_paper_cpt_plot.png")) ``` The results of the changepoint model appear robust to both choice of LDA model and choice of changepoint model. ## Report changepoint dates -```{r report cpt dates, include = F} + +```{r report cpt dates, include = FALSE, eval = params$run_models} +paper_paper_cpt_dates$date <- as.Date(paper_paper_cpt_dates$date) +ldats_paper_cpt_dates$date <- as.Date(ldats_paper_cpt_dates$date) + cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, "ldatsLDA_paperCPT" = ldats_paper_cpt_dates, "ldatsLDA_ldatsCPT" = ldats_ldats_cpt_dates, @@ -640,6 +750,12 @@ cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, dplyr::ungroup() %>% dplyr::rename(changepoint = variable) %>% tidyr::spread(analysis, date) + +saveRDS(cpt_dates, file = file.path(vignette_files, "output", "cpt_dates.RDS")) +``` + +```{r eval = !params$run_models, include = FALSE} +cpt_dates <- readRDS(file.path(vignette_files, "output", "cpt_dates.RDS")) ``` ```{r print cpt dates} diff --git a/vignettes/rodents-example.Rmd b/vignettes/rodents-example.Rmd index ef390a6d..26fb9342 100644 --- a/vignettes/rodents-example.Rmd +++ b/vignettes/rodents-example.Rmd @@ -81,7 +81,7 @@ lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, `LDA_set()` returns a list of LDA models. We use `select_LDA()` to identify the best number of topics and choice of seed from our set of models. By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. ```{r load lda model set, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +load(file.path('.', 'rodents-example-files', 'lda_model_set.Rds')) rm(lda_model_set2) ``` @@ -137,7 +137,7 @@ Also, it is important to note that by default the TS functions take the name of `select_TS()` will identify the best-fit changepoint model of the models from `TS_on_LDA()`. As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list. ```{r reload ts, include = F} -load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds')) +load(file.path('.', 'rodents-example-files', 'changepoint_models.Rds')) ``` ```{r select ts} @@ -180,7 +180,7 @@ lda_ts_results <- LDA_TS(data = rodents, ``` ```{r load ldats results, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +load(file.path('.', 'rodents-example-files', 'lda_ts_results.Rds')) ``` `LDA_TS()` returns a list of all the model objects, and we can access their contents as above: