diff --git a/DESCRIPTION b/DESCRIPTION index a06cae9..12b01ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,6 +38,7 @@ Config/testthat/edition: 3 Imports: dplyr, lorem, + lubridate, purrr, rlang, tidyr diff --git a/NAMESPACE b/NAMESPACE index c83a2c9..a324f88 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,4 +9,6 @@ export(get_long_slider_fields) export(get_long_text_field_values) export(get_long_text_fields) export(get_one_rectangle_of_values) +importFrom(lubridate,now) +importFrom(lubridate,today) importFrom(rlang,.data) diff --git a/R/get_long_text_field_values.R b/R/get_long_text_field_values.R index 5201a12..a098139 100644 --- a/R/get_long_text_field_values.R +++ b/R/get_long_text_field_values.R @@ -9,6 +9,8 @@ #' @return a long dataframe of text field values with one row for each value set. #' @export #' +#' @importFrom lubridate today now +#' #' @examples #' \dontrun{ #' get_long_text_field_values(long_text_fields) @@ -17,13 +19,117 @@ get_long_text_field_values <- function(long_text_fields) { tvt_na <- function(df) { df |> dplyr::filter(.data$tvt == "tvt_na") |> - dplyr::mutate(value = replicate(length(.data$field_name), lorem::ipsum_words(round(stats::rnorm(mean = .data$mean, sd = .data$sd, n = 1))))) - # |> - # dplyr::slice_sample(prop = 0.5, weight_by = .data$weight) + dplyr::mutate(value = sapply(seq_len(nrow(df)), function(i) { + # Ensure the generated value is valid + n_words <- max(1, round(stats::rnorm( + n = 1, + mean = df$mean[i], + sd = df$sd[i] + ))) + as.character(paste(lorem::ipsum_words(n_words), collapse = " ")) + })) + } + + tvt_datetime <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_datetime") |> + dplyr::mutate(value = { + anchor_time <- eval(parse(text = .data$origin_function)) + # Generate random times around anchor_time + times <- as.POSIXct( + stats::rnorm( + n = nrow(df), + mean = as.numeric(anchor_time), + sd = .data$sd # Using summarized value + ), + origin = "1970-01-01" + ) + # Convert to character in ISO format + format(times, "%Y-%m-%d %H:%M:%S") + }) + } + + tvt_date <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_date") |> + dplyr::mutate(value = { + anchor_date <- eval(parse(text = .data$origin_function)) + # Generate random dates around anchor date + dates <- as.Date( + stats::rnorm( + n = nrow(df), + mean = as.numeric(anchor_date), + sd = .data$sd + ), + origin = "1970-01-01" + ) + # Convert to character in ISO format based on the date format + format(dates, "%Y-%m-%d") + }) } + tvt_email <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_email") |> + dplyr::mutate( + value = as.character(replicate( + length(.data$field_name), + paste0(lorem::ipsum_words(1), "@example.org") + )) + ) + } + + rtnorm <- function(n, mean, sd, a = NA, b = NA) { + a <- as.numeric(a) + b <- as.numeric(b) + a <- ifelse(is.na(a), -Inf, a) + b <- ifelse(is.na(b), Inf, b) + stats::qnorm(stats::runif(n, stats::pnorm(a, mean, sd), stats::pnorm(b, mean, sd)), mean, sd) + } + + tvt_integer <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_integer") |> + dplyr::mutate( + value = as.character(round(rtnorm( + n = length(.data$field_name), mean = .data$mean, sd = .data$sd, + a = .data$text_validation_min, b = .data$text_validation_max + ))) + ) + } + + tvt_number <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_number") |> + dplyr::mutate( + value = as.character(round(rtnorm( + n = length(.data$field_name), mean = .data$mean, sd = .data$sd, + a = .data$text_validation_min, b = .data$text_validation_max + ), digits = 2)) + ) + } + + tvt_zipcode <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_zipcode") |> + dplyr::mutate( + value = "32611" + ) + } + + tvt_phone <- function(df) { + df |> + dplyr::filter(.data$tvt == "tvt_phone") |> + dplyr::mutate( + value = "800-867-5309" + ) + } + + tvt_types <- c( - "tvt_na" + "tvt_na", "tvt_datetime", "tvt_date", + "tvt_email", "tvt_integer", "tvt_number", + "tvt_zipcode", "tvt_phone" ) process_one_text_validation_type <- function(my_tvt, df) { @@ -35,11 +141,21 @@ get_long_text_field_values <- function(long_text_fields) { } text_field_values <- - purrr::map(tvt_types, - process_one_text_validation_type, - long_text_fields |> dplyr::filter(.data$field_type == "text") - ) |> - dplyr::bind_rows() + purrr::map( + tvt_types, + process_one_text_validation_type, + long_text_fields |> dplyr::filter(.data$field_type == "text") + ) |> + # get rid of empty data frames in the list output + purrr::keep(~ nrow(.x) > 0) + + # If the list is empty after filtering, create a minimal empty data frame + if (length(text_field_values) == 0) { + text_field_values <- dplyr::tibble(field_name = character(0), value = character(0)) + } else { + # Otherwise, combine all data frames in the list + text_field_values <- dplyr::bind_rows(text_field_values) + } result <- text_field_values |> dplyr::select("field_name", "value") diff --git a/R/get_long_text_fields.R b/R/get_long_text_fields.R index f32de06..44d878a 100644 --- a/R/get_long_text_fields.R +++ b/R/get_long_text_fields.R @@ -1,7 +1,8 @@ #' @title Get every text field response from a REDCap data dictionary #' #' @description -#' Given a REDCap data dictionary, enumerate every text field in that data dictionary and return a dataset with default weights +#' Given a REDCap data dictionary, enumerate every text field in that data dictionary +#' and return a dataset with default weights #' #' @param metadata A REDCap data dictionary #' @@ -42,8 +43,19 @@ get_long_text_fields <- function(metadata) { "text_validation_max" )) |> dplyr::rename(text_validation_type = "text_validation_type_or_show_slider_number") |> + dplyr::mutate( + text_validation_min = as.character(.data$text_validation_min), + text_validation_max = as.character(.data$text_validation_max) + ) |> dplyr::mutate(tvt = dplyr::case_when( is.na(.data$text_validation_type) ~ "tvt_na", + grepl("^datetime.*", .data$text_validation_type) ~ "tvt_datetime", + grepl("^date_", .data$text_validation_type) ~ "tvt_date", + grepl("email", .data$text_validation_type) ~ "tvt_email", + grepl("integer", .data$text_validation_type) ~ "tvt_integer", + grepl("number", .data$text_validation_type) ~ "tvt_number", + grepl("zipcode", .data$text_validation_type) ~ "tvt_zipcode", + grepl("phone", .data$text_validation_type) ~ "tvt_phone", TRUE ~ "tvt_unsupported" )) |> # set weights for each response @@ -57,6 +69,104 @@ get_long_text_fields <- function(metadata) { return(result) } + tvt_email <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("email", .data$text_validation_type)) |> + dplyr::mutate(weight = 100) + return(result) + } + + tvt_datetime <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("^datetime.*", .data$text_validation_type)) |> + dplyr::mutate( + text_validation_min = dplyr::case_when( + grepl("^\\[.*\\]$", .data$text_validation_min) ~ NA_character_, + grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_min) ~ .data$text_validation_min, + grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_min) ~ as.character(.data$text_validation_min), + TRUE ~ NA_character_ + ), + text_validation_max = dplyr::case_when( + grepl("^\\[.*\\]$", .data$text_validation_max) ~ NA_character_, + grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_max) ~ .data$text_validation_max, + grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_max) ~ as.character(.data$text_validation_max), + TRUE ~ NA_character_ + ), + origin_function = "lubridate::now()", + sd = 3600 * 24 + ) + return(result) + } + + tvt_date <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("^date_", .data$text_validation_type)) |> + dplyr::mutate( + text_validation_min = dplyr::case_when( + grepl("^\\[.*\\]$", .data$text_validation_min) ~ NA_character_, + grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_min) ~ .data$text_validation_min, + grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_min) ~ as.character(.data$text_validation_min), + TRUE ~ NA_character_ + ), + text_validation_max = dplyr::case_when( + grepl("^\\[.*\\]$", .data$text_validation_max) ~ NA_character_, + grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_max) ~ .data$text_validation_max, + grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_max) ~ as.character(.data$text_validation_max), + TRUE ~ NA_character_ + ), + origin_function = "lubridate::today()", + sd = 3600 * 24 + ) + return(result) + } + + tvt_integer <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("integer", .data$text_validation_type)) |> + dplyr::mutate( + min_val = dplyr::coalesce(as.numeric(.data$text_validation_min), as.numeric(.data$text_validation_max) - 10, 0), + max_val = dplyr::coalesce(as.numeric(.data$text_validation_max), as.numeric(.data$text_validation_min) + 10, 10), + mean = (.data$min_val + .data$max_val) / 2, + sd = (.data$max_val - .data$min_val) / 6 + ) |> + dplyr::select(-"min_val", -"max_val") + return(result) + } + + tvt_number <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("number", .data$text_validation_type)) |> + dplyr::mutate( + min_val = dplyr::coalesce(as.numeric(.data$text_validation_min), as.numeric(.data$text_validation_max) - 10, 0), + max_val = dplyr::coalesce(as.numeric(.data$text_validation_max), as.numeric(.data$text_validation_min) + 10, 10), + mean = (.data$min_val + .data$max_val) / 2, + sd = (.data$max_val - .data$min_val) / 6 + ) |> + dplyr::select(-"min_val", -"max_val") + return(result) + } + + tvt_zipcode <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("zipcode", .data$text_validation_type)) |> + dplyr::mutate(weight = 100) + return(result) + } + + tvt_phone <- function(text_fields) { + result <- + text_fields |> + dplyr::filter(grepl("phone", .data$text_validation_type)) |> + dplyr::mutate(weight = 100) + return(result) + } + tvt_unsupported <- function(text_fields) { result <- text_fields |> @@ -65,8 +175,9 @@ get_long_text_fields <- function(metadata) { } tvt_types <- c( - "tvt_na", - "tvt_unsupported" + "tvt_na", "tvt_datetime", "tvt_unsupported", "tvt_date", + "tvt_email", "tvt_integer", "tvt_number", "tvt_zipcode", + "tvt_phone" ) process_one_text_validation_type <- function(my_tvt, df) { diff --git a/README.md b/README.md index 8a2178b..e7fa2c7 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,20 @@ Copy [proof_of_concept.R](https://github.com/ctsit/redcapfiller/blob/main/proof_ ### Limitations -REDCap Filler does not yet understand all the dimensions of a modern REDCap project. It can fill the categorical fields. It can fill unvalidated text fields. It ignores all other field types and will not attempt to fill them. It doesn't even know how to fill a date field. :-( Filler only knows how to fill classic projects without repeating forms or events. It does not honor form display logic and ignores all fields governed by branching logic. +REDCap Filler does not yet understand all the dimensions of a modern REDCap project. It can fill the categorical fields. It can fill unvalidated text field and the text validation types date, datetime, email, integer, number, phone, and zipcode. It ignores all other field types and validation types and will not attempt to fill them. Filler only knows how to fill classic projects without repeating forms or events. It does not honor form display logic and ignores all fields governed by branching logic. -Focusing more on what Filler _can_ do, the first release milestone will support these features: +Focusing more on what Filler _can_ or _will_ do, the first release milestone will support these features: -- Use the REDCap data dictionary as the primary input. Add a few parameters to define record count and forms to fill +- Use the REDCap data dictionary as the primary input. Add a few parameters to define record count and forms to fill. - Work on classic projects with no repeating objects. - Fill the forms named in a vector of form names. -- Fill out every field on the named forms as long as there is no BL constraint. +- Fill out every field on the named forms as long as there is no BL constraint. (Note: Timeline constraints might delay some text validation types) - Do not violate any data constraints. - Inject a simple default randomness wherever practicable. - Set the form completed fields to green. - Use the REDCap API to read the data dictionary and write the data. - Provide a uniform distribution on categorical fields. +- Provide normal distribution on numeric and date fields. ### Futures @@ -39,6 +40,7 @@ This project aims to populate complex REDCap projects using the project design. 1. Populate all fields in a classic project with no repeating objects, ignoring fields with BL constraints and completely ignoring FDL. 2. Add support for longitudinal projects +3. Complete any text validation types missing in the first release. 3. Add support for repeating events and repeating forms. 4. Allow non-uniform distribution of categorical fields. Allow non-default distributions on ranged fields. 5. Allow inter-record and intra-record date offsets. diff --git a/man/get_long_text_fields.Rd b/man/get_long_text_fields.Rd index 823d3c1..d6e0974 100644 --- a/man/get_long_text_fields.Rd +++ b/man/get_long_text_fields.Rd @@ -25,7 +25,8 @@ a dataframe with these columns } } \description{ -Given a REDCap data dictionary, enumerate every text field in that data dictionary and return a dataset with default weights +Given a REDCap data dictionary, enumerate every text field in that data dictionary +and return a dataset with default weights } \examples{ \dontrun{ diff --git a/tests/testthat/get_long_text_field_values/input.rds b/tests/testthat/get_long_text_field_values/input.rds index 45ba121..e042e59 100644 Binary files a/tests/testthat/get_long_text_field_values/input.rds and b/tests/testthat/get_long_text_field_values/input.rds differ diff --git a/tests/testthat/get_one_rectangle_of_values/input.rds b/tests/testthat/get_one_rectangle_of_values/input.rds index 19cb144..57fc7f0 100644 Binary files a/tests/testthat/get_one_rectangle_of_values/input.rds and b/tests/testthat/get_one_rectangle_of_values/input.rds differ diff --git a/tests/testthat/get_one_rectangle_of_values/make_test_data.R b/tests/testthat/get_one_rectangle_of_values/make_test_data.R index 8a8841f..a9cf1a2 100644 --- a/tests/testthat/get_one_rectangle_of_values/make_test_data.R +++ b/tests/testthat/get_one_rectangle_of_values/make_test_data.R @@ -1,9 +1,11 @@ -metadata_file <- testthat::test_path("get_one_rectangle_of_values", "metadata.csv") +metadata_file <- testthat::test_path("shared_testdata", "metadata.csv") metadata <- readr::read_csv(metadata_file) long_fields_and_responses <- dplyr::bind_rows( get_long_categorical_field_responses(metadata), - get_long_text_fields(metadata) + get_long_text_fields(metadata), + get_long_notes_fields(metadata), + get_long_slider_fields(metadata) ) long_fields_and_responses |> diff --git a/tests/testthat/test-get_long_text_field_values.R b/tests/testthat/test-get_long_text_field_values.R index b0bdd38..26406e8 100644 --- a/tests/testthat/test-get_long_text_field_values.R +++ b/tests/testthat/test-get_long_text_field_values.R @@ -31,3 +31,19 @@ testthat::test_that("get_long_text_field_values returns a varying vector of valu )) }) +testthat::test_that("get_long_text_field_values returns a value field for date and datetime fields", { + date_fields <- output$field_name[grepl("^date.", output$field_name)] + datetime_fields <- output$field_name[grepl("^datetime.", output$field_name)] + + date_values <- output$value[output$field_name %in% date_fields] + datetime_values <- output$value[output$field_name %in% datetime_fields] + + testthat::expect_true(length(date_fields) > 0 | length(datetime_fields) > 0) + testthat::expect_true(all(date_values != "") & all(datetime_values != "")) +}) + +# test that every value is a character +testthat::test_that("get_long_text_field_values returns a character vector", { + testthat::expect_true(all(sapply(output$value, is.character))) +}) + diff --git a/tests/testthat/test-get_long_text_fields.R b/tests/testthat/test-get_long_text_fields.R index 3e61b7e..de8ffcd 100644 --- a/tests/testthat/test-get_long_text_fields.R +++ b/tests/testthat/test-get_long_text_fields.R @@ -31,3 +31,11 @@ testthat::test_that("get_long_text_fields: weights are balanced", { dplyr::pull(balanced)) }) +testthat::test_that("get_long_text_fields: origin_function and sd columns are present", { + datetime_support_columns <- c("origin_function", "sd") + testthat::expect_equal( + output |> + dplyr::select(dplyr::all_of(datetime_support_columns)) |> names(), + datetime_support_columns + ) +}) diff --git a/tests/testthat/test-get_one_rectangle_of_values.R b/tests/testthat/test-get_one_rectangle_of_values.R index bdfe251..89eb406 100644 --- a/tests/testthat/test-get_one_rectangle_of_values.R +++ b/tests/testthat/test-get_one_rectangle_of_values.R @@ -1,50 +1,92 @@ -long_fields_and_responses <- readRDS( - testthat::test_path("get_one_rectangle_of_values", "input.rds") -) |> +long_fields_and_responses <- readRDS(testthat::test_path("get_one_rectangle_of_values", "input.rds")) |> # filter out the record_id row because we are generating it in these tests dplyr::filter(.data$field_name != "record_id") output <- get_one_rectangle_of_values( one_record_id = 1, record_id_name = "record_id", - forms_to_fill = "tests", + forms_to_fill = "form_1", long_fields_and_responses ) output_with_special_record_id <- get_one_rectangle_of_values( one_record_id = 1, record_id_name = "special_id", - forms_to_fill = "tests", + forms_to_fill = "form_1", long_fields_and_responses ) -testthat::test_that("get_one_rectangle_of_values: ethnicity, occupation, race, and state are represented in the columns", { - testthat::expect_true(all(c("record_id", "ethnicity", "occupation", "race", "state") %in% names(output))) +testthat::test_that("get_one_rectangle_of_values: outputs have the correct record ID field names", { + testthat::expect_equal(names(output)[1], "record_id") + testthat::expect_equal(names(output_with_special_record_id)[1], "special_id") }) -testthat::test_that("get_one_rectangle_of_values: bl_caffeine, bl_exercise, and bl_treatments are represented in the columns", { - testthat::expect_true(all(c("bl_caffeine", "bl_exercise", "bl_treatments") %in% gsub("___.*", "", names(output)))) +testthat::test_that("get_one_rectangle_of_values: basic form fields are present", { + testthat::expect_true(all(c("f_dropdown", "f_radio", "f_yes_no", "f_text") %in% names(output))) + testthat::expect_true(all(c("f_dropdown", "f_radio", "f_yes_no", "f_text") %in% names(output_with_special_record_id))) }) -testthat::test_that("get_one_rectangle_of_values: special_id is in the record_id position", { - testthat::expect_equal(names(output_with_special_record_id)[[1]], "special_id") +testthat::test_that("get_one_rectangle_of_values: checkbox fields are present with different indices", { + testthat::expect_true(any(grepl("f_checkbox___", names(output)))) + testthat::expect_true(any(grepl("f_checkbox___", names(output_with_special_record_id)))) }) -testthat::test_that("get_one_rectangle_of_values: fname and lname are represented in the columns", { - testthat::expect_true(all(c("fname", "lname") %in% gsub("___.*", "", names(output)))) +# Helper function to check field presence by pattern +check_fields_by_pattern <- function(output, output_with_special_record_id, pattern, description) { + fields_output <- names(output)[grepl(pattern, names(output))] + fields_special_output <- names(output_with_special_record_id)[grepl(pattern, names(output_with_special_record_id))] + + testthat::test_that(paste("get_one_rectangle_of_values:", description, "fields are present"), { + testthat::expect_true(length(fields_output) > 0, + info = paste("No", description, "fields found in output")) + testthat::expect_true(length(fields_special_output) > 0, + info = paste("No", description, "fields found in output_with_special_record_id")) + }) +} + +check_fields_by_pattern(output, output_with_special_record_id, + "^v_number|^v_integer", "numeric") + +check_fields_by_pattern(output, output_with_special_record_id, + "^v_date|^v_datetime", "date/datetime") + +check_fields_by_pattern(output, output_with_special_record_id, "^v_email", "email") +check_fields_by_pattern(output, output_with_special_record_id, "^v_zipcode", "zipcode") +check_fields_by_pattern(output, output_with_special_record_id, "^v_phone", "phone") + +# Test for text fields +testthat::test_that("get_one_rectangle_of_values: text fields are handled correctly", { + text_fields <- c("f_text", "f_notes") + + for (field in text_fields) { + testthat::expect_true(field %in% names(output), + info = paste("Field", field, "not found in output")) + testthat::expect_true(field %in% names(output_with_special_record_id), + info = paste("Field", field, "not found in output_with_special_record_id")) + } +}) + +testthat::test_that("get_one_rectangle_of_values: slider field exists", { + testthat::expect_true("f_slider" %in% names(output)) + testthat::expect_true("f_slider" %in% names(output_with_special_record_id)) }) -testthat::test_that("get_one_rectangle_of_values: notes fields are represented in the columns", { - testthat::expect_true(all(c("addl_notes", "bl_treatments_other_notes", "phy_notes") %in% names(output))) +testthat::test_that("get_one_rectangle_of_values: both outputs have same number of columns", { + testthat::expect_equal(length(names(output)), length(names(output_with_special_record_id))) }) -testthat::test_that("get_one_rectangle_of_values: has values for all notes fields", { - testthat::expect_true(all(!is.na(output$addl_notes))) - testthat::expect_true(all(!is.na(output$bl_treatments_other_notes))) - testthat::expect_true(all(!is.na(output$phy_notes))) +testthat::test_that("get_one_rectangle_of_values: outputs have similar structure beyond ID and checkbox fields", { + output_filtered <- names(output)[!grepl("^record_id$|f_checkbox___", names(output))] + special_id_filtered <- names(output_with_special_record_id)[ + !grepl("^special_id$|f_checkbox___", names(output_with_special_record_id)) + ] - testthat::expect_true(all(!is.na(output_with_special_record_id$addl_notes))) - testthat::expect_true(all(!is.na(output_with_special_record_id$bl_treatments_other_notes))) - testthat::expect_true(all(!is.na(output_with_special_record_id$phy_notes))) + testthat::expect_equal(output_filtered, special_id_filtered) }) +testthat::test_that("get_one_rectangle_of_values: important fields have values", { + testthat::expect_false(is.na(output$f_text[1])) + testthat::expect_false(is.na(output$f_notes[1])) + testthat::expect_false(is.na(output_with_special_record_id$f_text[1])) + testthat::expect_false(is.na(output_with_special_record_id$f_notes[1])) +})