diff --git a/NEWS.md b/NEWS.md index 68a0f70..fe21d1d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # readODS 2.2.0 * Fix #151 - Now `read_ods()` and `list_ods_sheets()` can also be used to process flat ods files. `read_fods()` and `list_fods_sheets()` are still available, but not as the so-called "common interface." +* Fix #154 - `read_ods()` and `read_fods()` have two arguments - `trim_ws` and `n_max` which are the same as the arguments of `readxl::read_excel()`. # readODS 2.1.1 diff --git a/R/read_ods.R b/R/read_ods.R index ec05966..0eea24f 100644 --- a/R/read_ods.R +++ b/R/read_ods.R @@ -32,26 +32,30 @@ return(g) } - - ## Based on readxl, although the implementation is different. ## If max row is -1, read to end of row. ## Row and column-numbers are 1-based -.standardise_limits <- function(range, skip) { +.standardise_limits <- function(range, skip, n_max) { if(is.null(range)) { skip <- check_nonnegative_integer(x = skip, argument = "skip") + n_max <- check_nonnegative_integer(x = n_max, argument = "n_max") + if (n_max == Inf) { + max_row <- -1 + } else { + max_row <- n_max + 1 + } limits <- c( min_row = skip + 1, - max_row = -1, + max_row = max_row, min_col = 1, max_col = -1 ) } else { - if(skip != 0) { - warning("Range and non-zero value for skip given. Defaulting to range.", call. = FALSE) + if(skip != 0 || n_max != Inf) { + warning("Range and non-default value for skip or n_max given. Defaulting to range.", call. = FALSE) } tryCatch({ - limits <- cellranger::as.cell_limits(range) + limits <- cellranger::as.cell_limits(range) }, error = function(e) { stop("Invalid `range`") }) @@ -82,7 +86,9 @@ row_names = FALSE, strings_as_factors = FALSE, verbose = FALSE, - as_tibble = TRUE) { + as_tibble = TRUE, + trim_ws = TRUE, + n_max = Inf) { if (!file.exists(path)) { stop("file does not exist", call. = FALSE) } @@ -104,6 +110,9 @@ if (!is.logical(as_tibble)) { stop("as_tibble must be of type `boolean", call. = FALSE) } + if (!is.logical(trim_ws)) { + stop("trim_ws must be of type `boolean", call. = FALSE) + } if (row_names && as_tibble) { stop("Tibbles do not support row names. To use row names, set as_tibble to false", call. = FALSE) } @@ -115,6 +124,9 @@ stop("Unknown col_types. Can either be a class col_spec, list, character, NULL or NA.", call. = FALSE) } + if (!is.numeric(n_max)) { + stop("n_max must be numeric.", call. = FALSE) + } } .return_empty <- function(as_tibble = FALSE) { @@ -127,22 +139,22 @@ return(data.frame()) } -.type_convert <- function(df, col_types = NULL, verbose = TRUE, na = c("", "NA")) { +.type_convert <- function(df, col_types = NULL, verbose = TRUE, na = c("", "NA"), trim_ws = TRUE) { if (verbose) { res <- readr::type_convert(df = df, col_types, na = na) } else { suppressMessages({ - res <- readr::type_convert(df = df, col_types, na = na) + res <- readr::type_convert(df = df, col_types, na = na, trim_ws = trim_ws) }) } return(res) } -.handle_col_types <- function(res, col_types, verbose, na) { +.handle_col_types <- function(res, col_types, verbose, na, trim_ws) { if (isTRUE(is.na(col_types)) || nrow(res) == 0) { return(res) } - .type_convert(df = res, col_types = col_types, verbose = verbose, na = na) + .type_convert(df = res, col_types = col_types, verbose = verbose, na = na, trim_ws = trim_ws) } ## standardise `sheet` parameter as a number, i.e. sheet_index @@ -173,19 +185,21 @@ } .read_ods <- function(path, - sheet = 1, - col_names = TRUE, - col_types = NULL, - na = "", - skip = 0, - formula_as_formula = FALSE, - range = NULL, - row_names = FALSE, - strings_as_factors = FALSE, - verbose = FALSE, - as_tibble = TRUE, - .name_repair = "unique", - flat = FALSE) { + sheet = 1, + col_names = TRUE, + col_types = NULL, + na = "", + skip = 0, + formula_as_formula = FALSE, + range = NULL, + row_names = FALSE, + strings_as_factors = FALSE, + verbose = FALSE, + as_tibble = TRUE, + .name_repair = "unique", + flat = FALSE, + trim_ws = TRUE, + n_max = Inf) { .check_read_args(path, sheet, col_names, @@ -197,7 +211,9 @@ row_names, strings_as_factors, verbose, - as_tibble) + as_tibble, + trim_ws, + n_max) path <- normalizePath(path) if (flat) { .get_sheet_names_func <- get_flat_sheet_names_ @@ -207,7 +223,7 @@ .read_ods_func <- read_ods_ } ## Get cell range info - limits <- .standardise_limits(range, skip) + limits <- .standardise_limits(range, skip, n_max) sheet_index <- .standardise_sheet(sheet = sheet, sheet_names = .get_sheet_names_func(file = path, include_external_data = TRUE), range = range) strings <- .read_ods_func(file = path, @@ -233,7 +249,7 @@ byrow = TRUE), stringsAsFactors = FALSE) res <- .change_df_with_col_row_header(x = res, col_header = col_names, row_header = row_names, .name_repair = .name_repair) - res <- .handle_col_types(res, col_types = col_types, verbose = verbose, na = na) + res <- .handle_col_types(res, col_types = col_types, verbose = verbose, na = na, trim_ws = trim_ws) if (strings_as_factors) { res <- .convert_strings_to_factors(df = res) } @@ -300,9 +316,11 @@ #' Default is `"unique"`. #' #' @param ods_format character, must be "auto", "ods" or "fods". The default "auto" is to determine the format automatically. By default, the format is determined by file extension, unless `guess` is `FALSE`. -#' @param guess logical. If the file extension is absent or not recognized, this +#' @param guess logical, If the file extension is absent or not recognized, this #' controls whether we attempt to guess format based on the file signature or #' "magic number". +#' @param trim_ws logical, should leading and trailing whitespace be trimmed? +#' @param n_max numeric, Maximum number of data rows to read. Ignored if `range` is given. #' @return A tibble (\code{tibble}) or data frame (\code{data.frame}) containing a representation of data in the (f)ods file. #' @author Peter Brohan , Chung-hong Chan , Gerrit-Jan Schutten #' @examples @@ -343,7 +361,9 @@ read_ods <- function(path, as_tibble = TRUE, .name_repair = "unique", ods_format = c("auto", "ods", "fods"), - guess = FALSE) { + guess = FALSE, + trim_ws = TRUE, + n_max = Inf) { ods_format <- .determine_ods_format(path = path, guess = guess, ods_format = match.arg(ods_format)) ## Should use match.call but there's a weird bug if one of the variable names is 'file' .read_ods(path = path, @@ -359,24 +379,28 @@ read_ods <- function(path, verbose = verbose, as_tibble = as_tibble, .name_repair = .name_repair, - flat = ods_format == "fods") + flat = ods_format == "fods", + trim_ws = trim_ws, + n_max = n_max) } #' @rdname read_ods #' @export read_fods <- function(path, - sheet = 1, - col_names = TRUE, - col_types = NULL, - na = "", - skip = 0, - formula_as_formula = FALSE, - range = NULL, - row_names = FALSE, - strings_as_factors = FALSE, - verbose = FALSE, - as_tibble = TRUE, - .name_repair = "unique") { + sheet = 1, + col_names = TRUE, + col_types = NULL, + na = "", + skip = 0, + formula_as_formula = FALSE, + range = NULL, + row_names = FALSE, + strings_as_factors = FALSE, + verbose = FALSE, + as_tibble = TRUE, + .name_repair = "unique", + trim_ws = TRUE, + n_max = Inf) { ## Should use match.call but there's a weird bug if one of the variable names is 'file' .read_ods(path = normalizePath(path, mustWork = FALSE), sheet = sheet, @@ -391,5 +415,7 @@ read_fods <- function(path, verbose = verbose, as_tibble = as_tibble, .name_repair = .name_repair, - flat = TRUE) + flat = TRUE, + trim_ws = trim_ws, + n_max = n_max) } diff --git a/man/list_ods_sheets.Rd b/man/list_ods_sheets.Rd index 96a8ea6..41134f9 100644 --- a/man/list_ods_sheets.Rd +++ b/man/list_ods_sheets.Rd @@ -24,7 +24,7 @@ ods_sheets(path) \item{ods_format}{character, must be "auto", "ods" or "fods". The default "auto" is to determine the format automatically. By default, the format is determined by file extension, unless \code{guess} is \code{FALSE}.} -\item{guess}{logical. If the file extension is absent or not recognized, this +\item{guess}{logical, If the file extension is absent or not recognized, this controls whether we attempt to guess format based on the file signature or "magic number".} } diff --git a/man/read_ods.Rd b/man/read_ods.Rd index 83f5df0..8772669 100644 --- a/man/read_ods.Rd +++ b/man/read_ods.Rd @@ -20,7 +20,9 @@ read_ods( as_tibble = TRUE, .name_repair = "unique", ods_format = c("auto", "ods", "fods"), - guess = FALSE + guess = FALSE, + trim_ws = TRUE, + n_max = Inf ) read_fods( @@ -36,7 +38,9 @@ read_fods( strings_as_factors = FALSE, verbose = FALSE, as_tibble = TRUE, - .name_repair = "unique" + .name_repair = "unique", + trim_ws = TRUE, + n_max = Inf ) } \arguments{ @@ -78,9 +82,13 @@ Default is \code{"unique"}.} \item{ods_format}{character, must be "auto", "ods" or "fods". The default "auto" is to determine the format automatically. By default, the format is determined by file extension, unless \code{guess} is \code{FALSE}.} -\item{guess}{logical. If the file extension is absent or not recognized, this +\item{guess}{logical, If the file extension is absent or not recognized, this controls whether we attempt to guess format based on the file signature or "magic number".} + +\item{trim_ws}{logical, should leading and trailing whitespace be trimmed?} + +\item{n_max}{numeric, Maximum number of data rows to read. Ignored if \code{range} is given.} } \value{ A tibble (\code{tibble}) or data frame (\code{data.frame}) containing a representation of data in the (f)ods file. diff --git a/tests/testdata/leadingspaces.ods b/tests/testdata/leadingspaces.ods new file mode 100644 index 0000000..f7db1a7 Binary files /dev/null and b/tests/testdata/leadingspaces.ods differ diff --git a/tests/testthat/test_read_ods.R b/tests/testthat/test_read_ods.R index d850473..728bcfb 100644 --- a/tests/testthat/test_read_ods.R +++ b/tests/testthat/test_read_ods.R @@ -8,6 +8,9 @@ test_that("Incorrect Argument", { expect_error(read_ods(path = "../testdata/sum.ods", strings_as_factors = "a"), "strings_as_factors must be of type `boolean`") expect_error(read_ods(path = "../testdata/sum.ods", verbose = "a"), "verbose must be of type `boolean`") expect_error(read_ods(path = "../testdata/sum.ods", row_names = TRUE), "Tibbles do not support") + expect_error(read_ods(path = "../testdata/sum.ods", n_max = "abc"), "n_max must be numeric") + expect_error(read_ods(path = "../testdata/sum.ods", n_max = -1), "n_max must be a positive integer") + expect_error(read_ods(path = "../testdata/sum.ods", trim_ws = "a"), "trim_ws must be") }) test_that("exceptions in C++ (hard to test)", { @@ -58,6 +61,8 @@ test_that("Parses range inputs correctly", { expect_equal(x[[2,2]], 2) expect_message(x <- read_ods("../testdata/multisheet.ods", range = "Sheet3!D2:E4")) expect_equal(x[[1,1]], 3) + expect_warning(x <- read_ods("../testdata/multisheet.ods", n_max = 10, range = "Sheet2!B4:D9"), "Range and non-default") + expect_equal(x[[2,2]], 2) }) test_that("Deals with repeated spaces correctly when fetching only part of sheet",{ @@ -105,6 +110,20 @@ test_that("No Warning of empty sheet", { expect_silent(read_fods("../testdata/empty.fods")) }) +test_that("n_max", { + expect_silent(x <- read_ods("../testdata/starwars.ods", n_max = Inf)) + expect_equal(nrow(x), 10) + expect_silent(x <- read_ods("../testdata/starwars.ods", n_max = 5)) + expect_equal(nrow(x), 5) + expect_silent(x <- read_ods("../testdata/starwars.ods", n_max = 100)) + expect_equal(nrow(x), 10) +}) + +test_that("trim_ws", { + expect_equal(read_ods("../testdata/leadingspaces.ods", trim_ws = FALSE)[1,1, drop = TRUE], " abc") + expect_equal(read_ods("../testdata/leadingspaces.ods", trim_ws = TRUE)[1,1, drop = TRUE], "abc") +}) + ## V2.0.0 behavior: backward compatibility test_that("Single column ODS v2.0.0", {