From 436bd6c8c73920f2874124797aea364bf1d26ba9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 10 Dec 2024 06:19:47 +0800 Subject: [PATCH] fread parse Y/N as bool (#4564) * basic idea for Y/N bool parser * New logicalYN argument * NEWS * tests * missing \arguments{} entry * Some comments --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fread.R | 8 +++++--- inst/tests/tests.Rraw | 38 +++++++++++++++++++++++++++++--------- man/fread.Rd | 2 ++ src/data.table.h | 2 +- src/fread.c | 27 ++++++++++++++++++++++++--- src/fread.h | 15 ++++++++++++++- src/freadR.c | 11 +++++++---- 8 files changed, 84 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index 13906048a5..885eb56c5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -67,6 +67,8 @@ rowwiseDT( 5. `setcolorder()` gains `skip_absent` to ignore unrecognized columns (i.e. columns included in `neworder` but not present in the data), [#6044, #6068](https://github.com/Rdatatable/data.table/pull/6044). Default behavior (`skip_absent=FALSE`) remains unchanged, i.e. unrecognized columns result in an error. Thanks to @sluga for the suggestion and @sluga & @Nj221102 for the PRs. +6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. 
We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`. + ## BUG FIXES 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. diff --git a/R/fread.R b/R/fread.R index 837324c1a7..64247d7aca 100644 --- a/R/fread.R +++ b/R/fread.R @@ -4,7 +4,9 @@ na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbo skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), -nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), +nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), +logicalYN=getOption("datatable.logicalYN", FALSE), +keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") { if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stopf("Used more than one of the arguments input=, file=, text= and cmd=.") @@ -24,7 +26,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress), - isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), + isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(logicalYN), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), 
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) @@ -277,7 +279,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") tz="UTC" } ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, - fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") + fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3d8d2fabfe..657478c61f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5567,6 +5567,10 @@ test(1343.5, fread("A,B\n1,true\n2,\n3,false"), data.table(A=1:3, B=c(TRUE,NA,FA test(1343.6, fread("A,B\n1,true\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA))) test(1344.1, fread("A,B\n1,2\n0,3\n,1\n", logical01=FALSE), data.table(A=c(1L,0L,NA), B=c(2L,3L,1L))) test(1344.2, fread("A,B\n1,2\n0,3\n,1\n", logical01=TRUE), data.table(A=c(TRUE,FALSE,NA), B=c(2L,3L,1L))) +test(1344.3, fread("A,B\nY,2\nN,3\nNA,1\n", logicalYN=FALSE), data.table(A=c('Y','N',NA), B=c(2L,3L,1L))) +test(1344.4, fread("A,B\nY,2\nN,3\nNA,1\n", logicalYN=TRUE), data.table(A=c(TRUE,FALSE,NA), B=c(2L,3L,1L))) +test(1344.5, fread("A,B\nY,2\nN,3\n,1\n", logicalYN=FALSE, na.strings=""), data.table(A=c('Y','N',NA), B=c(2L,3L,1L))) +test(1344.6, fread("A,B\nY,2\nN,3\n,1\n", logicalYN=TRUE, na.strings=""), data.table(A=c(TRUE,FALSE,NA), B=c(2L,3L,1L))) # .N now available in i DT = data.table(a=1:3,b=1:6) @@ -7870,9 +7874,14 @@ str = "a,b\n1.5,\"at the 5\" end of the gene.\"" test(1551.1, fread(str), data.table(a = 1.5, b = "at the 5\" end of the 
gene."), warning=w<-"resolved improper quoting") #1256 str = "x,y\nx1,\"oops\" y1\n" -test(1551.2, fread(str), data.table(x = "x1", y = "\"oops\" y1"), warning=w) +test(1551.21, fread(str), data.table(x='x1', y='"oops" y1'), warning=w) +# during header detection, 'y' is seen as a valid value --> header determined 'FALSE' despite later non-Y/N data +test(1551.22, fread(str, logicalYN=TRUE), data.table(V1=c('x', 'x1'), V2=c('y', '"oops" y1')), warning=w) +test(1551.23, fread(str, logicalYN=TRUE, header=TRUE), data.table(x='x1', y='"oops" y1'), warning=w) str = "x,y\nx1,\"oops\" y1" -test(1551.3, fread(str), data.table(x = "x1", y = "\"oops\" y1"), warning=w) +test(1551.31, fread(str), data.table(x="x1", y='"oops" y1'), warning=w) +test(1551.32, fread(str, logicalYN=TRUE), data.table(V1=c('x', 'x1'), V2=c('y', '"oops" y1')), warning=w) +test(1551.33, fread(str, logicalYN=TRUE, header=TRUE), data.table(x="x1", y='"oops" y1'), warning=w) #1077 str = '2,3\n""foo,bar' test(1551.4, fread(str), data.table(V1=c("2","\"\"foo"), V2=c("3","bar")), warning=w) @@ -7882,8 +7891,12 @@ test(1551.5, fread(str), data.table(L1 = c("L2", "L3"), some = c("some", "this"), unquoted = c("\"half\" quoted", "should work"), stuff = c("stuff", "ok though")), warning = w) #1095 -rhs = read.table(testDir("issue_1095_fread.txt.bz2"), sep=",", comment.char="", stringsAsFactors=FALSE, quote="", strip.white=TRUE) -if (test_R.utils) test(1551.6, fread(testDir("issue_1095_fread.txt.bz2"), logical01=FALSE), setDT(rhs), warning=w) +rhs = setDT(read.table(testDir("issue_1095_fread.txt.bz2"), sep=",", comment.char="", stringsAsFactors=FALSE, quote="", strip.white=TRUE)) +if (test_R.utils) { + test(1551.61, fread(testDir("issue_1095_fread.txt.bz2"), logical01=FALSE), rhs, warning=w) + rhs[, names(.SD) := lapply(.SD, \(x) x == "Y"), .SDcols = c("V16", "V17", "V45")] + test(1551.62, fread(testDir("issue_1095_fread.txt.bz2"), logical01=FALSE, logicalYN=TRUE), rhs, warning=w) +} # FR #1314 rest of na.strings 
issue str = "a,b,c,d\n#N/A,+1,5.5,FALSE\n#N/A,5,6.6,TRUE\n#N/A,+1,#N/A,-999\n#N/A,#N/A,-999,FALSE\n#N/A,1,NA,TRUE" @@ -7896,8 +7909,10 @@ test(1552.3, fread(str, na.strings=c("#N/A", "-999", "+1")), read_table(str, na. test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), read_table(str, na.strings=c("#N/A", "-999", "+1", "1"))) # enabled by FR #2927 test(1552.5, fread(str, na.strings=c("#N/A", "-999", "FALSE")), error="NAstring <<FALSE>>.*boolean.*not permitted") test(1552.6, fread("A\n1.0\n2\n-", na.strings=c("-")), data.table(A=c(1.0, 2.0, NA))) -test(1552.7, fread(str, na.strings=c("#N/A", "-999", "+1", "1"), logical01=TRUE), +test(1552.71, fread(str, na.strings=c("#N/A", "-999", "+1", "1"), logical01=TRUE), error="NAstring <<1>> and logical01=TRUE.*not permitted") +test(1552.72, fread(str, na.strings=c("#N/A", "-999", "+1", "Y"), logicalYN=TRUE), + error="NAstring <<Y>> and logicalYN=TRUE.*not permitted") str = "a,b,c\n0,1,2\n1,0,2" test(1552.8, fread(str, na.strings = "0"), data.table(a=c(NA,1L), b=c(1L,NA), c=c(2L,2L))) test(1552.9, fread(str, na.strings = c("0","1")), data.table(a=c(NA,NA), b=c(NA,NA), c=c(2L,2L))) @@ -8896,8 +8911,10 @@ test(1618.5, fread("a,c,b\n1,2,3", select=c("b", "c"), col.names=c("q", "r")), d test(1618.6, fread("a,c,b\n1,2,3", select=c("b", "z")), data.table(b=3L), warning="Column name 'z' not found.*skipping") # Additional test for 1445 for non-monotonic integer select -select1618.8 <- c(4, 9, 8, 23, 1, 21, 5, 18, 11, 13) -test(1618.8, names(fread("a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z\na,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z", select = select1618.8)), letters[select1618.8]) +select1618 <- c(4, 9, 8, 23, 1, 21, 5, 18, 11, 13) +str = paste0(paste(letters, collapse=','), '\n', paste(letters, collapse=',')) +test(1618.8, names(fread(str, select=select1618)), letters[select1618]) +test(1618.9, names(fread(str, select=select1618, logicalYN=TRUE)), paste0('V', select1618)) # fix for #1270. 
Have been problems with R before vs after 3.1.0 here. But now ok in all R versions. DT = data.table(x=1:2, y=5:6) @@ -9106,7 +9123,8 @@ test(1626.91, fsetdiff(DT, DT["b"]), DT[c(1,4)]) # fix for #1087 and #1465 test(1627.1, charToRaw(names(fread(testDir("issue_1087_utf8_bom.csv")))[1L]), as.raw(97L)) test(1627.2, names(fread(testDir("issue_1087_utf8_bom.csv"), verbose=TRUE))[1L], "a", output="UTF-8 byte order mark EF BB BF found") -test(1627.3, names(fread(testDir("gb18030.txt")))[1L], "x", warning="GB-18030 encoding detected") +test(1627.31, names(fread(testDir("gb18030.txt")))[1L], "x", warning="GB-18030 encoding detected") +test(1627.32, names(fread(testDir("gb18030.txt"), logicalYN=TRUE))[1L], "V1", warning="GB-18030 encoding detected") test(1627.4, fread(testDir("utf16le.txt")), error="File is encoded in UTF-16") test(1627.5, fread(testDir("utf16be.txt")), error="File is encoded in UTF-16") @@ -11448,9 +11466,11 @@ unlink(f) test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6))) test(1753.2, fread("X,Y\n1,2\n3,4,\n5,6",logical01=TRUE), ans<-data.table(X=TRUE,Y=2L), warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,>>") test(1753.3, fread("X,Y\n1,2\n3,4,7\n5,6",logical01=TRUE), ans, warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,7>>") +test(1753.4, fread("X,Y\nY,2\n3,4,\n5,6",logicalYN=TRUE), ans<-data.table(X=TRUE,Y=2L), warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,>>") +test(1753.5, fread("X,Y\nY,2\n3,4,7\n5,6",logicalYN=TRUE), ans, warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,7>>") # issue 2051 where a quoted field contains ", New quote rule detection handles it. 
-if (test_R.utils) test(1753.4, fread(testDir("issue_2051.csv.gz"))[2,grep("^Our.*tool$",COLUMN50)], 1L) +if (test_R.utils) test(1753.6, fread(testDir("issue_2051.csv.gz"))[2,grep("^Our.*tool$",COLUMN50)], 1L) # check omp critical around SET_STRING_ELT # minimal construction big enough for parallelism with 8 or less threads. On a machine with more, do setDTthreads(8) first otherwise diff --git a/man/fread.Rd b/man/fread.Rd index 4448b482bc..00a24dab3a 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -23,6 +23,7 @@ showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS +logicalYN=getOption("datatable.logicalYN", FALSE), keepLeadingZeros = getOption("datatable.keepLeadingZeros", FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" ) @@ -61,6 +62,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. The default for this argument can be changed with \code{options(datatable.fread.datatable=FALSE)}.} \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} + \item{logicalYN}{If TRUE a column containing only Ys and Ns will be read as logical, otherwise as character.} \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.} \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. 
The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } diff --git a/src/data.table.h b/src/data.table.h index f2c1874e19..0ac2d21fa6 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -294,7 +294,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP); SEXP chmatch_R(SEXP, SEXP, SEXP); SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); -SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); diff --git a/src/fread.c b/src/fread.c index 7b89b32946..a3346bba20 100644 --- a/src/fread.c +++ b/src/fread.c @@ -75,8 +75,9 @@ static freadMainArgs args = {0}; // global for use by DTPRINT; static implies = static int mmp_fd = -1; #endif -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; +// See header for more explanation. +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. 
@@ -1154,6 +1155,21 @@ static void parse_bool_lowercase(FieldParseContext *ctx) } } +/* Parse Y | y | N | n as boolean */ +static void parse_bool_yesno(FieldParseContext *ctx) +{ + const char *ch = *(ctx->ch); + int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; + if (ch[0] == 'Y' || ch[0] == 'y') { + *target = 1; + *(ctx->ch) = ch + 1; + } else if (ch[0] == 'N' || ch[0] == 'n') { + *target = 0; + *(ctx->ch) = ch + 1; + } else { + *target = NA_BOOL8; + } +} /* How to register a new parser * (1) Write the parser @@ -1170,6 +1186,7 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &parse_bool_uppercase, (reader_fun_t) &parse_bool_titlecase, (reader_fun_t) &parse_bool_lowercase, + (reader_fun_t) &parse_bool_yesno, (reader_fun_t) &StrtoI32, (reader_fun_t) &StrtoI64, (reader_fun_t) &parse_double_regular, @@ -1326,7 +1343,9 @@ int freadMain(freadMainArgs _args) { strcmp(ch,"True")==0 || strcmp(ch,"False")==0) STOP(_("freadMain: NAstring <<%s>> is recognized as type boolean, this is not permitted."), ch); if ((strcmp(ch,"1")==0 || strcmp(ch,"0")==0) && args.logical01) - STOP(_("freadMain: NAstring <<%s>> and logical01=TRUE, this is not permitted."), ch); + STOP(_("freadMain: NAstring <<%s>> and %s=TRUE, this is not permitted."), ch, "logical01"); + if ((strcmp(ch,"Y")==0 || strcmp(ch,"N")==0) && args.logicalYN) + STOP(_("freadMain: NAstring <<%s>> and %s=TRUE, this is not permitted."), ch, "logicalYN"); char *end; errno = 0; (void)strtod(ch, &end); // careful not to let "" get to here as strtod considers "" numeric @@ -1335,6 +1354,7 @@ int freadMain(freadMainArgs _args) { nastr++; } disabled_parsers[CT_BOOL8_N] = !args.logical01; + disabled_parsers[CT_BOOL8_Y] = !args.logicalYN; disabled_parsers[CT_ISO8601_DATE] = disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS if (verbose) { if (*NAstrings == NULL) { @@ -1353,6 +1373,7 @@ int freadMain(freadMainArgs _args) { if (args.skipString) DTPRINT(_(" skip to string = 
<<%s>>\n"), args.skipString); DTPRINT(_(" show progress = %d\n"), args.showProgress); DTPRINT(_(" 0/1 column will be read as %s\n"), args.logical01? "boolean" : "integer"); + DTPRINT(_(" Y/N column will be read as %s\n"), args.logicalYN? "boolean" : "character"); } if (*NAstrings==NULL || // user sets na.strings=NULL (**NAstrings=='\0' && *(NAstrings+1)==NULL)) { // user sets na.strings="" diff --git a/src/fread.h b/src/fread.h index 060b8006f9..0fa9ca023e 100644 --- a/src/fread.h +++ b/src/fread.h @@ -17,6 +17,12 @@ #endif // Ordered hierarchy of types +// Each of these corresponds to a parser; they must be ordered "preferentially", i.e., if the same +// input could be validly parsed as both types t1 and t2, and we "prefer" type t1, t1 must come +// before t2. Most commonly, we prefer types using less storage. For example, characters '1.34' +// in a file could be double, complex, or string. We prefer double, which uses only 8 bytes. +// Similarly, '1234' could be integer, double, integer64, complex, or string. We prefer integer, +// which uses only 4 bytes. typedef enum { NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule @@ -25,6 +31,7 @@ typedef enum { CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, + CT_BOOL8_Y, // Y/N-as-bool CT_INT32, // int32_t CT_INT64, // int64_t CT_FLOAT64, // double (64-bit IEEE 754 float) @@ -38,8 +45,10 @@ typedef enum { #define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT || (x) == CT_ISO8601_TIME) // types where dec matters -extern int8_t typeSize[NUMTYPE]; +// Used to govern when coercion is allowed. We cannot coerce to a "lower" type, unless it has the same typeName. 
extern const char typeName[NUMTYPE][10]; +extern int8_t typeSize[NUMTYPE]; + extern const long double pow10lookup[301]; extern const uint8_t hexdigits[256]; @@ -149,6 +158,10 @@ typedef struct freadMainArgs // will become integer. bool logical01; + // If true, then column of Ns and Ys will be read as logical, otherwise it + // will become character. + bool logicalYN; + bool keepLeadingZeros; // should datetime with no Z or UTZ-offset be read as UTC? diff --git a/src/freadR.c b/src/freadR.c index 05b8cd00e1..4f94239cab 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,10 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; +// these correspond to typeName, typeSize in fread.c, with a few exceptions noted above on the NUT macro. 
+static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_BOOL8_Y, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp; @@ -66,6 +67,7 @@ SEXP freadR( SEXP verboseArg, SEXP warnings2errorsArg, SEXP logical01Arg, + SEXP logicalYNArg, // extras needed by callbacks from freadMain SEXP selectArg, @@ -128,6 +130,7 @@ SEXP freadR( args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]); args.logical01 = LOGICAL(logical01Arg)[0]; + args.logicalYN = LOGICAL(logicalYNArg)[0]; { SEXP tt = PROTECT(GetOption(sym_old_fread_datetime_character, R_NilValue)); args.oldNoDateTime = oldNoDateTime = isLogical(tt) && LENGTH(tt)==1 && LOGICAL(tt)[0]==TRUE; @@ -640,7 +643,7 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx) } } else if (thisSize == 1) { - if (type[j] > CT_BOOL8_L) STOP(_("Field size is 1 but the field is of type %d\n"), type[j]); + if (type[j] > CT_BOOL8_Y) STOP(_("Field size is 1 but the field is of type %d\n"), type[j]); Rboolean *dest = (Rboolean *)LOGICAL(VECTOR_ELT(DT, resj)) + DTi; const char *src1 = (char*)buff1 + off1; for (int i=0; i