Skip to content

Add support for various tvt fields #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Config/testthat/edition: 3
Imports:
dplyr,
lorem,
lubridate,
purrr,
rlang,
tidyr
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ export(get_long_slider_fields)
export(get_long_text_field_values)
export(get_long_text_fields)
export(get_one_rectangle_of_values)
importFrom(lubridate,now)
importFrom(lubridate,today)
importFrom(rlang,.data)
134 changes: 125 additions & 9 deletions R/get_long_text_field_values.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#' @return a long dataframe of text field values with one row for each value set.
#' @export
#'
#' @importFrom lubridate today now
#'
#' @examples
#' \dontrun{
#' get_long_text_field_values(long_text_fields)
Expand All @@ -17,13 +19,117 @@ get_long_text_field_values <- function(long_text_fields) {
tvt_na <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_na") |>
dplyr::mutate(value = replicate(length(.data$field_name), lorem::ipsum_words(round(stats::rnorm(mean = .data$mean, sd = .data$sd, n = 1)))))
# |>
# dplyr::slice_sample(prop = 0.5, weight_by = .data$weight)
dplyr::mutate(value = sapply(seq_len(nrow(df)), function(i) {
# Ensure the generated value is valid
n_words <- max(1, round(stats::rnorm(
n = 1,
mean = df$mean[i],
sd = df$sd[i]
)))
as.character(paste(lorem::ipsum_words(n_words), collapse = " "))
}))
}

tvt_datetime <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_datetime") |>
dplyr::mutate(value = {
anchor_time <- eval(parse(text = .data$origin_function))
# Generate random times around anchor_time
times <- as.POSIXct(
stats::rnorm(
n = nrow(df),
mean = as.numeric(anchor_time),
sd = .data$sd # Using summarized value
),
origin = "1970-01-01"
)
# Convert to character in ISO format
format(times, "%Y-%m-%d %H:%M:%S")
})
}

tvt_date <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_date") |>
dplyr::mutate(value = {
anchor_date <- eval(parse(text = .data$origin_function))
# Generate random dates around anchor date
dates <- as.Date(
stats::rnorm(
n = nrow(df),
mean = as.numeric(anchor_date),
sd = .data$sd
),
origin = "1970-01-01"
)
# Convert to character in ISO format based on the date format
format(dates, "%Y-%m-%d")
})
}

tvt_email <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_email") |>
dplyr::mutate(
value = as.character(replicate(
length(.data$field_name),
paste0(lorem::ipsum_words(1), "@example.org")
))
)
}

rtnorm <- function(n, mean, sd, a = NA, b = NA) {
a <- as.numeric(a)
b <- as.numeric(b)
a <- ifelse(is.na(a), -Inf, a)
b <- ifelse(is.na(b), Inf, b)
stats::qnorm(stats::runif(n, stats::pnorm(a, mean, sd), stats::pnorm(b, mean, sd)), mean, sd)
}

tvt_integer <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_integer") |>
dplyr::mutate(
value = as.character(round(rtnorm(
n = length(.data$field_name), mean = .data$mean, sd = .data$sd,
a = .data$text_validation_min, b = .data$text_validation_max
)))
)
}

tvt_number <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_number") |>
dplyr::mutate(
value = as.character(round(rtnorm(
n = length(.data$field_name), mean = .data$mean, sd = .data$sd,
a = .data$text_validation_min, b = .data$text_validation_max
), digits = 2))
)
}

tvt_zipcode <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_zipcode") |>
dplyr::mutate(
value = "32611"
)
}

tvt_phone <- function(df) {
df |>
dplyr::filter(.data$tvt == "tvt_phone") |>
dplyr::mutate(
value = "800-867-5309"
)
}


tvt_types <- c(
"tvt_na"
"tvt_na", "tvt_datetime", "tvt_date",
"tvt_email", "tvt_integer", "tvt_number",
"tvt_zipcode", "tvt_phone"
)

process_one_text_validation_type <- function(my_tvt, df) {
Expand All @@ -35,11 +141,21 @@ get_long_text_field_values <- function(long_text_fields) {
}

text_field_values <-
purrr::map(tvt_types,
process_one_text_validation_type,
long_text_fields |> dplyr::filter(.data$field_type == "text")
) |>
dplyr::bind_rows()
purrr::map(
tvt_types,
process_one_text_validation_type,
long_text_fields |> dplyr::filter(.data$field_type == "text")
) |>
# get rid of empty data frames in the list output
purrr::keep(~ nrow(.x) > 0)

# If the list is empty after filtering, create a minimal empty data frame
if (length(text_field_values) == 0) {
text_field_values <- dplyr::tibble(field_name = character(0), value = character(0))
} else {
# Otherwise, combine all data frames in the list
text_field_values <- dplyr::bind_rows(text_field_values)
}

result <- text_field_values |>
dplyr::select("field_name", "value")
Expand Down
117 changes: 114 additions & 3 deletions R/get_long_text_fields.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' @title Get every text field response from a REDCap data dictionary
#'
#' @description
#' Given a REDCap data dictionary, enumerate every text field in that data dictionary and return a dataset with default weights
#' Given a REDCap data dictionary, enumerate every text field in that data dictionary
#' and return a dataset with default weights
#'
#' @param metadata A REDCap data dictionary
#'
Expand Down Expand Up @@ -42,8 +43,19 @@ get_long_text_fields <- function(metadata) {
"text_validation_max"
)) |>
dplyr::rename(text_validation_type = "text_validation_type_or_show_slider_number") |>
dplyr::mutate(
text_validation_min = as.character(.data$text_validation_min),
text_validation_max = as.character(.data$text_validation_max)
) |>
dplyr::mutate(tvt = dplyr::case_when(
is.na(.data$text_validation_type) ~ "tvt_na",
grepl("^datetime.*", .data$text_validation_type) ~ "tvt_datetime",
grepl("^date_", .data$text_validation_type) ~ "tvt_date",
grepl("email", .data$text_validation_type) ~ "tvt_email",
grepl("integer", .data$text_validation_type) ~ "tvt_integer",
grepl("number", .data$text_validation_type) ~ "tvt_number",
grepl("zipcode", .data$text_validation_type) ~ "tvt_zipcode",
grepl("phone", .data$text_validation_type) ~ "tvt_phone",
TRUE ~ "tvt_unsupported"
)) |>
# set weights for each response
Expand All @@ -57,6 +69,104 @@ get_long_text_fields <- function(metadata) {
return(result)
}

tvt_email <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("email", .data$text_validation_type)) |>
dplyr::mutate(weight = 100)
return(result)
}

tvt_datetime <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("^datetime.*", .data$text_validation_type)) |>
dplyr::mutate(
text_validation_min = dplyr::case_when(
grepl("^\\[.*\\]$", .data$text_validation_min) ~ NA_character_,
grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_min) ~ .data$text_validation_min,
grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_min) ~ as.character(.data$text_validation_min),
TRUE ~ NA_character_
),
text_validation_max = dplyr::case_when(
grepl("^\\[.*\\]$", .data$text_validation_max) ~ NA_character_,
grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_max) ~ .data$text_validation_max,
grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_max) ~ as.character(.data$text_validation_max),
TRUE ~ NA_character_
),
origin_function = "lubridate::now()",
sd = 3600 * 24
)
return(result)
}

tvt_date <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("^date_", .data$text_validation_type)) |>
dplyr::mutate(
text_validation_min = dplyr::case_when(
grepl("^\\[.*\\]$", .data$text_validation_min) ~ NA_character_,
grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_min) ~ .data$text_validation_min,
grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_min) ~ as.character(.data$text_validation_min),
TRUE ~ NA_character_
),
text_validation_max = dplyr::case_when(
grepl("^\\[.*\\]$", .data$text_validation_max) ~ NA_character_,
grepl("^-?[0-9]+(\\.[0-9]+)?$", .data$text_validation_max) ~ .data$text_validation_max,
grepl("^\\d{4}-\\d{2}-\\d{2}$", .data$text_validation_max) ~ as.character(.data$text_validation_max),
TRUE ~ NA_character_
),
origin_function = "lubridate::today()",
sd = 3600 * 24
)
return(result)
}

tvt_integer <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("integer", .data$text_validation_type)) |>
dplyr::mutate(
min_val = dplyr::coalesce(as.numeric(.data$text_validation_min), as.numeric(.data$text_validation_max) - 10, 0),
max_val = dplyr::coalesce(as.numeric(.data$text_validation_max), as.numeric(.data$text_validation_min) + 10, 10),
mean = (.data$min_val + .data$max_val) / 2,
sd = (.data$max_val - .data$min_val) / 6
) |>
dplyr::select(-"min_val", -"max_val")
return(result)
}

tvt_number <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("number", .data$text_validation_type)) |>
dplyr::mutate(
min_val = dplyr::coalesce(as.numeric(.data$text_validation_min), as.numeric(.data$text_validation_max) - 10, 0),
max_val = dplyr::coalesce(as.numeric(.data$text_validation_max), as.numeric(.data$text_validation_min) + 10, 10),
mean = (.data$min_val + .data$max_val) / 2,
sd = (.data$max_val - .data$min_val) / 6
) |>
dplyr::select(-"min_val", -"max_val")
return(result)
}

tvt_zipcode <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("zipcode", .data$text_validation_type)) |>
dplyr::mutate(weight = 100)
return(result)
}

tvt_phone <- function(text_fields) {
result <-
text_fields |>
dplyr::filter(grepl("phone", .data$text_validation_type)) |>
dplyr::mutate(weight = 100)
return(result)
}

tvt_unsupported <- function(text_fields) {
result <-
text_fields |>
Expand All @@ -65,8 +175,9 @@ get_long_text_fields <- function(metadata) {
}

tvt_types <- c(
"tvt_na",
"tvt_unsupported"
"tvt_na", "tvt_datetime", "tvt_unsupported", "tvt_date",
"tvt_email", "tvt_integer", "tvt_number", "tvt_zipcode",
"tvt_phone"
)

process_one_text_validation_type <- function(my_tvt, df) {
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,28 @@ Copy [proof_of_concept.R](https://github.com/ctsit/redcapfiller/blob/main/proof_

### Limitations

REDCap Filler does not yet understand all the dimensions of a modern REDCap project. It can fill the categorical fields. It can fill unvalidated text fields. It ignores all other field types and will not attempt to fill them. It doesn't even know how to fill a date field. :-( Filler only knows how to fill classic projects without repeating forms or events. It does not honor form display logic and ignores all fields governed by branching logic.
REDCap Filler does not yet understand all the dimensions of a modern REDCap project. It can fill the categorical fields. It can fill unvalidated text field and the text validation types date, datetime, email, integer, number, phone, and zipcode. It ignores all other field types and validation types and will not attempt to fill them. Filler only knows how to fill classic projects without repeating forms or events. It does not honor form display logic and ignores all fields governed by branching logic.

Focusing more on what Filler _can_ do, the first release milestone will support these features:
Focusing more on what Filler _can_ or _will_ do, the first release milestone will support these features:

- Use the REDCap data dictionary as the primary input. Add a few parameters to define record count and forms to fill
- Use the REDCap data dictionary as the primary input. Add a few parameters to define record count and forms to fill.
- Work on classic projects with no repeating objects.
- Fill the forms named in a vector of form names.
- Fill out every field on the named forms as long as there is no BL constraint.
- Fill out every field on the named forms as long as there is no BL constraint. (Note: Timeline constraints might delay some text validation types)
- Do not violate any data constraints.
- Inject a simple default randomness wherever practicable.
- Set the form completed fields to green.
- Use the REDCap API to read the data dictionary and write the data.
- Provide a uniform distribution on categorical fields.
- Provide normal distribution on numeric and date fields.

### Futures

This project aims to populate complex REDCap projects using the project design. If the REDCap API exposes a design dimension, we plan to use that to guide how the Filler populates projects. Yet, that will take some time to develop fully. This is the proposed timeline of features:

1. Populate all fields in a classic project with no repeating objects, ignoring fields with BL constraints and completely ignoring FDL.
2. Add support for longitudinal projects
3. Complete any text validation types missing in the first release.
3. Add support for repeating events and repeating forms.
4. Allow non-uniform distribution of categorical fields. Allow non-default distributions on ranged fields.
5. Allow inter-record and intra-record date offsets.
Expand Down
3 changes: 2 additions & 1 deletion man/get_long_text_fields.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified tests/testthat/get_long_text_field_values/input.rds
Binary file not shown.
Binary file modified tests/testthat/get_one_rectangle_of_values/input.rds
Binary file not shown.
6 changes: 4 additions & 2 deletions tests/testthat/get_one_rectangle_of_values/make_test_data.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
metadata_file <- testthat::test_path("get_one_rectangle_of_values", "metadata.csv")
metadata_file <- testthat::test_path("shared_testdata", "metadata.csv")
metadata <- readr::read_csv(metadata_file)

long_fields_and_responses <- dplyr::bind_rows(
get_long_categorical_field_responses(metadata),
get_long_text_fields(metadata)
get_long_text_fields(metadata),
get_long_notes_fields(metadata),
get_long_slider_fields(metadata)
)

long_fields_and_responses |>
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test-get_long_text_field_values.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,19 @@ testthat::test_that("get_long_text_field_values returns a varying vector of valu
))
})

testthat::test_that("get_long_text_field_values returns a value field for date and datetime fields", {
date_fields <- output$field_name[grepl("^date.", output$field_name)]
datetime_fields <- output$field_name[grepl("^datetime.", output$field_name)]

date_values <- output$value[output$field_name %in% date_fields]
datetime_values <- output$value[output$field_name %in% datetime_fields]

testthat::expect_true(length(date_fields) > 0 | length(datetime_fields) > 0)
testthat::expect_true(all(date_values != "") & all(datetime_values != ""))
})

# test that every value is a character
testthat::test_that("get_long_text_field_values returns a character vector", {
testthat::expect_true(all(sapply(output$value, is.character)))
})

Loading