Merge pull request #153 from ropensci/coltypes2

ropensci · Aug 22, 2023 · 38cefb8 · 38cefb8
2 parents 5c93d0b + b73cd96
commit 38cefb8
Show file tree

Hide file tree

Showing 6 changed files with 168 additions and 97 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: readODS
 Type: Package
 Title: Read and Write ODS Files
-Version: 2.0.3
+Version: 2.0.4
 Authors@R: 
     c(person("Gerrit-Jan", "Schutten", role = c("aut"), email = "[email protected]"),
     person("Chung-hong", "Chan", role = c("aut", "cre"), email = "[email protected]", comment = c(ORCID = "0000-0002-6232-7530")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,16 @@
+# readODS 2.04
+
+## `col_types` can be character ("shorthand") or list
+
+fix #135 and the review by Dr  Ruedni
+
+```r
+# Specifying col_types as shorthand, the third column as factor; other by guessing
+read_ods("starwars.ods", col_types = "??f")
+# Specifying col_types as list
+read_ods("starwars.ods", col_types = list(species = "f"))
+```
+
 # readODS 2.0.3
 
 ## Add support for writing flat ODS

diff --git a/R/read_ods.R b/R/read_ods.R
@@ -20,7 +20,7 @@
 }
 
 ## Based on readxl, although the implementation is different.
-## If max row is -1, read to end of row. 
+## If max row is -1, read to end of row.
 ## Row and column-numbers are 1-based
 .standardise_limits <- function(range, skip) {
     if(is.null(range)) {
@@ -50,17 +50,6 @@
     return(limits)
 }
 
-.silent_type_convert <- function(x, verbose = TRUE, na = c("", "NA")) {
-    if (verbose) {
-        res <- readr::type_convert(df = x, na = na)
-    } else {
-        suppressMessages({
-            res <- readr::type_convert(df = x, na = na)
-        })
-    }
-    return(res)
-}
-
 .convert_strings_to_factors <- function(df) {
     i <- vapply(df, is.character, logical(1))
     df[i] <- lapply(df[i], as.factor)
@@ -106,6 +95,65 @@
     if (row_names && as_tibble) {
         stop("Tibbles do not support row names. To use row names, set as_tibble to false", call. = FALSE)
     }
+    if (!inherits(col_types, "col_spec") &&
+        isFALSE(is.na(col_types)) &&
+        isFALSE(is.null(col_types)) &&
+        isFALSE(is.character(col_types)) &&
+        isFALSE(is.list(col_types))) {
+        stop("Unknown col_types. Can either be a class col_spec, list, character, NULL or NA.",
+             call. = FALSE)
+    }
+}
+
+.return_empty <- function(as_tibble = FALSE) {
+    warning("empty sheet, return empty data frame.", call. = FALSE)
+    if(as_tibble) {
+        return(tibble::tibble())
+    }
+    return(data.frame())
+}
+
+.type_convert <- function(df, col_types = NULL, verbose = TRUE, na = c("", "NA")) {
+    if (verbose) {
+        res <- readr::type_convert(df = df, col_types, na = na)
+    } else {
+        suppressMessages({
+            res <- readr::type_convert(df = df, col_types, na = na)
+        })
+    }
+    return(res)
+}
+
+.handle_col_types <- function(res, col_types, verbose, na) {
+    if (isTRUE(is.na(col_types))) {
+        return(res)
+    }
+    .type_convert(df = res, col_types = col_types, verbose = verbose, na = na)
+}
+
+## standardise `sheet` parameter as a number
+.standardise_sheet <- function(sheet, sheets, limits, range) {
+    sheet_from_range <- cellranger::as.cell_limits(range)[["sheet"]]
+    if (!is.null(range) && !is.na(sheet_from_range)) {
+        if (sheet != 1) {
+            warning("Sheet suggested in range and using sheet argument. Defaulting to range",
+                    call. = FALSE)
+        }
+        sheet <- sheet_from_range ## override
+    }
+    is_in_sheet_names <- stringi::stri_cmp(e1 = sheet, e2 = sheets) == 0
+    if (!is.numeric(sheet) && !any(is_in_sheet_names)) {
+        stop(paste0("No sheet found with name '", sheet, "'", sep = ""),
+             call. = FALSE)
+    }
+    if (is.numeric(sheet) && sheet > length(sheets)) {
+        stop(paste0("File contains only ", length(sheets), " sheets. Sheet index out of range.",
+                    call. = FALSE))
+    }
+    if (!is.numeric(sheet)) {
+        sheet <- which(is_in_sheet_names)
+    }
+    return(sheet)
 }
 
 .read_ods <- function(path,
@@ -134,65 +182,26 @@
         strings_as_factors,
         verbose,
         as_tibble)
-    # Get cell range info
-    limits <- .standardise_limits(range, skip)
-    # Get sheet number.
     if (flat) {
-        sheets <- get_flat_sheet_names_(file = path, include_external_data = TRUE)
+        .get_sheet_names_func <- get_flat_sheet_names_
+        .read_ods_func <- read_flat_ods_
     } else {
-        sheets <- get_sheet_names_(file = path, include_external_data = TRUE)
-    }
-    sheet_name <- cellranger::as.cell_limits(range)[["sheet"]]
-    if(!is.null(range) && !is.na(sheet_name)) {
-        if(sheet != 1) {
-            warning("Sheet suggested in range and using sheet argument. Defaulting to range",
-                call. = FALSE)
-        }
-        is_in_sheet_names <- stringi::stri_cmp(e1 = sheet_name, e2 = sheets) == 0
-        if(any(is_in_sheet_names)) {
-            sheet <- which(is_in_sheet_names)
-        } else {
-            stop(paste0("No sheet found with name '", sheet_name, "'", sep = ""),
-                call. = FALSE)
-        }
-    } else {
-        is_in_sheet_names <- stringi::stri_cmp(e1 = sheet, e2 = sheets) == 0
-        if (!is.numeric(sheet) && any(is_in_sheet_names)) {
-            sheet <- which(is_in_sheet_names)
-        } else if (!is.numeric(sheet)) {
-            stop(paste0("No sheet found with name '", sheet, "'", sep = ""), 
-                call. = FALSE)
-        }
-        if (sheet > length(sheets)) {
-            stop(paste0("File contains only ", length(sheets), " sheets. Sheet index out of range.",
-                call. = FALSE))
-        }
+        .get_sheet_names_func <- get_sheet_names_
+        .read_ods_func <- read_ods_
     }
-
-    if(flat) {
-        strings <- read_flat_ods_(file = path,
-                                  start_row = limits["min_row"],
-                                  stop_row = limits["max_row"],
-                                  start_col = limits["min_col"],
-                                  stop_col = limits["max_col"],
-                                  sheet = sheet,
-                                  formula_as_formula = formula_as_formula)
-    } else {
-        strings <- read_ods_(file = path,
-                             start_row = limits["min_row"],
-                             stop_row = limits["max_row"],
-                             start_col = limits["min_col"],
-                             stop_col = limits["max_col"],
-                             sheet = sheet,
-                             formula_as_formula = formula_as_formula)
-    }
-    if(strings[1] == 0 || strings[2] == 0) {
-        warning("empty sheet, return empty data frame.", call. = FALSE)
-        if(as_tibble) {
-            return(tibble::tibble())
-        } else {
-            return(data.frame())
-        }
+    ## Get cell range info
+    limits <- .standardise_limits(range, skip)
+    sheet <- .standardise_sheet(sheet = sheet, sheets = .get_sheet_names_func(file = path, include_external_data = TRUE),
+                                limits = limits, range = range)
+    strings <- .read_ods_func(file = path,
+                              start_row = limits["min_row"],
+                              stop_row = limits["max_row"],
+                              start_col = limits["min_col"],
+                              stop_col = limits["max_col"],
+                              sheet = sheet,
+                              formula_as_formula = formula_as_formula)
+    if (strings[1] == 0 || strings[2] == 0) {
+        return(.return_empty(as_tibble = as_tibble))
     }
     res <- as.data.frame(
         matrix(
@@ -201,27 +210,14 @@
             byrow = TRUE),
         stringsAsFactors = FALSE)
     res <- .change_df_with_col_row_header(x = res, col_header = col_names, row_header = row_names, .name_repair = .name_repair)
-    res <- data.frame(res)
-    if (inherits(col_types, 'col_spec')) {
-        res <- readr::type_convert(df = res, col_types = col_types, na = na)
-    } else if (length(col_types) == 0 && is.null(col_types)) {
-        res <- .silent_type_convert(x = res, verbose = verbose, na = na)
-    } else if (length(col_types) == 1 && is.na(col_types[1])) {
-        {} #Pass
-    } else {
-        stop("Unknown col_types. Can either be a class col_spec, NULL or NA.",
-            call. = FALSE)
-    }
-
+    res <- .handle_col_types(data.frame(res), col_types = col_types, verbose = verbose, na = na)
     if (strings_as_factors) {
         res <- .convert_strings_to_factors(df = res)
     }
-
     if (as_tibble) {
         res <- tibble::as_tibble(x = res, .name_repair = .name_repair)
     }
     return(res)
-
 }
 
 #' Read Data From (F)ODS File
@@ -232,12 +228,12 @@
 #' @param path path to the (f)ods file.
 #' @param sheet sheet to read. Either a string (the sheet name), or an integer sheet number. The default is 1.
 #' @param col_names logical, indicating whether the file contains the names of the variables as its first line. Default is TRUE.
-#' @param col_types Either NULL to guess from the spreadsheet or refer to [readr::type_convert()] to specify cols specification. NA will return a data frame with all columns being "characters".
+#' @param col_types Either NULL to guess from the spreadsheet or refer to [readr::type_convert()] to specify cols specification. It can also be a shorthand such as "ccf" ("character", "character", "factor"), a list, or an object created by [readr::cols()]. NA will return a data frame with all columns being "characters". Please note that it will not speed up the reading by a lot by specifying this parameter explicitly. It is more for accuracy.
 #' @param na Character vector of strings to use for missing values. By default read_ods converts blank cells to missing data. It can also be set to
 #' NULL, so that empty cells are treated as NA.
 #' @param skip the number of lines of the data file to skip before beginning to read data. If this parameter is larger than the total number of lines in the ods file, an empty data frame is returned.
 #' @param formula_as_formula logical, a switch to display formulas as formulas "SUM(A1:A3)" or as the resulting value "3"... or "8".. . Default is FALSE.
-#' @param range selection of rectangle using Excel-like cell range, such as \code{range = "D12:F15"} or \code{range = "R1C12:R6C15"}. Cell range processing is handled by the \code{\link[=cellranger]{cellranger}} package.
+#' @param range selection of rectangle using Excel-like cell range, such as \code{range = "D12:F15"} or \code{range = "R1C12:R6C15"}. Cell range processing is handled by the \code{\link[=cellranger]{cellranger}} package. If sheet name is in the range, such as \code{range = "Sheet2!A2:B7"}, this sheet name is used instead of the provided `sheet`. If `sheet` is not the default value (1), a warning is given.
 #' @param row_names logical, indicating whether the file contains the names of the rows as its first column. Default is FALSE.
 #' @param strings_as_factors logical, if character columns to be converted to factors. Default is FALSE.
 #' @param verbose logical, if messages should be displayed. Default is FALSE.
@@ -248,9 +244,9 @@
 #'  - `"check_unique"`: Check names are unique, but do not repair
 #'  - `"universal"` : Checks names are unique and valid R variables names in scope
 #'  - A function to apply custom name repair.
-#'  
+#'
 #'  Default is `"unique"`.
-#'  
+#'
 #' @return A tibble (\code{tibble}) or data frame (\code{data.frame}) containing a representation of data in the (f)ods file.
 #' @author Peter Brohan <peter.brohan+cran@@gmail.com>, Chung-hong Chan <chainsawtiney@@gmail.com>, Gerrit-Jan Schutten <phonixor@@gmail.com>
 #' @examples
@@ -267,6 +263,12 @@
 #' read_fods("starwars.fods", sheet = 2)
 #' # Read a specific range, e.g. A1:C11
 #' read_fods("starwars.fods", sheet = 2, range = "A1:C11")
+#' # Give a warning and read from Sheet1 (not 2)
+#' read_fods("starwars.fods", sheet = 2, range = "Sheet1!A1:C11")
+#' # Specifying col_types as shorthand, the third column as factor; other by guessing
+#' read_ods("starwars.ods", col_types = "??f")
+#' # Specifying col_types as list
+#' read_ods("starwars.ods", col_types = list(species = "f"))
 #' }
 #' @export
 read_ods <- function(path,

diff --git a/man/read_ods.Rd b/man/read_ods.Rd
diff --git a/tests/testthat/test_col_types.R b/tests/testthat/test_col_types.R
@@ -1,3 +1,14 @@
+### throw an error if col_types is not col_spec, single value NA or single value NULL
+
+test_that("col_types throw error", {
+    expect_error(read_ods('../testdata/col_types.ods', col_types = 123))
+    expect_error(read_ods('../testdata/col_types.ods', col_types = c(NA, NA)))
+    expect_error(read_ods('../testdata/col_types.ods', col_types = c(NA, 123)))
+    expect_error(x <- read_ods("../testdata/col_types.ods", col_types = TRUE))
+    expect_error(x <- read_ods("../testdata/col_types.ods", col_types = 123))
+
+})
+
 test_that("col_types ODS", {
     x <- read_ods('../testdata/col_types.ods', col_types = NA, as_tibble = TRUE)
     expect_equal(class(x[[2]]), "character")
@@ -21,10 +32,50 @@ test_that("multi col_types ODS", {
     expect_equal(class(x[[2]]), "character")
 })
 
-### throw an error if col_types is not col_spec, single value NA or single value NULL
+test_that("col_types shorthand", {
+    x <- read_ods("../testdata/col_types.ods") # col_types = NULL
+    expect_equal(class(x[[1]]), "character")
+    expect_equal(class(x[[2]]), "numeric")
+    expect_equal(class(x[[3]]), "numeric")
+    x <- read_ods("../testdata/col_types.ods", col_types = NA)
+    expect_equal(class(x[[1]]), "character")
+    expect_equal(class(x[[2]]), "character")
+    expect_equal(class(x[[3]]), "character")
+    x <- read_ods("../testdata/col_types.ods", col_types = "fii")
+    expect_equal(class(x[[1]]), "factor")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[3]]), "integer")
+    x <- read_ods("../testdata/col_types.ods", col_types = "cii")
+    expect_equal(class(x[[1]]), "character")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[3]]), "integer")
+    ## https://github.com/tidyverse/readr/issues/1509
+    ## readr::type_convert(df, col_types = "f_")
+    ## x <- read_ods("../testdata/col_types.ods", col_types = "fi_")
+    ## expect_equal(class(x[,1]]), "factor")
+    ## expect_equal(class(x[,2]]), "integer")
+    ## expect_equal(ncol(x), 3)
+})
 
-test_that("col_types throw error", {
-    expect_error(read_ods('../testdata/col_types.ods', col_types = 123))
-    expect_error(read_ods('../testdata/col_types.ods', col_types = c(NA, NA)))
-    expect_error(read_ods('../testdata/col_types.ods', col_types = c(NA, 123)))
-})
+test_that("col_types orthodox", {
+    x <- read_ods('../testdata/col_types.ods', col_types = readr::cols(cola = readr::col_factor(), colb = readr::col_integer(), colc = readr::col_integer()), as_tibble = TRUE)
+    expect_equal(class(x[[1]]), "factor")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[2]]), "integer")
+    x <- read_ods('../testdata/col_types.ods', col_types = readr::cols(cola = readr::col_character(), colb = readr::col_integer(), colc = readr::col_integer()), as_tibble = TRUE)
+    expect_equal(class(x[[1]]), "character")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[2]]), "integer")
+})
+
+
+test_that("col_types list", {
+    x <- read_ods("../testdata/col_types.ods", col_types = list(cola = "f", colb = "i", colc = "i"))
+    expect_equal(class(x[[1]]), "factor")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[3]]), "integer")
+    x <- read_ods("../testdata/col_types.ods", col_types = list(cola = "c", colb = "i", colc = "i"))
+    expect_equal(class(x[[1]]), "character")
+    expect_equal(class(x[[2]]), "integer")
+    expect_equal(class(x[[3]]), "integer")
+})
diff --git a/tests/testthat/test_read_ods.R b/tests/testthat/test_read_ods.R
@@ -2,7 +2,6 @@ test_that("Incorrect Argument", {
     expect_error(read_ods(), "No file path was")
     expect_error(read_ods(path = "not/real/file.ods"), "file does not exist")
     expect_error(read_ods(path = "../testdata/sum.ods", col_names = "a"), "col_names must be of type `boolean`")
-    expect_error(read_ods(path = '../testdata/sum.ods', col_types = "a"), "Unknown col_types. Can either be a class col_spec, NULL or NA.")
     expect_error(read_ods(path = "../testdata/sum.ods", skip = -1), "skip must be a positive integer")
     expect_error(read_ods(path = "../testdata/sum.ods", formula_as_formula = "a"), "formula_as_formula must be of type `boolean`")
     expect_error(read_ods(path = "../testdata/sum.ods", row_names = "a"), "row_names must be of type `boolean`")
@@ -97,4 +96,4 @@ test_that("read with column headers", {
     expect_silent(x <- read_ods("../testdata/starwars.ods", row_names = TRUE, as_tibble = FALSE))
     expect_equal(ncol(x), 2)
     expect_equal(colnames(x), c("homeworld", "species"))
-})
+})