From f4b5debf140babdb5733168d80bfdc634e6aae8b Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Tue, 20 Aug 2024 09:13:46 +0200 Subject: [PATCH] Check string encoding and coerce if necessary in fmatch() / join(). Should fix #566, #579, and #618. --- src/base_radixsort.h | 1 + src/data.table.h | 6 ++++++ src/data.table_utils.c | 26 ++++++++++++++++++++++++++ src/join.c | 9 +++++---- src/match.c | 34 ++++++++++++++++++++++++++++++++-- 5 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/base_radixsort.h b/src/base_radixsort.h index f6589ed7..85d397a0 100644 --- a/src/base_radixsort.h +++ b/src/base_radixsort.h @@ -23,6 +23,7 @@ #define SET_TRLEN(x, v) SET_STDVEC_TRUELENGTH(x, ((int) (v))) #define MYLEV(x) (((SEXPREC_partial *)(x))->sxpinfo.gp) +#define IS_UTF8(x) (MYLEV(x) & 8) #define IS_ASCII(x) (MYLEV(x) & 64) // from data.table.h #define SETTOF(x,v) ((((SEXPREC_partial *)(x))->sxpinfo.type)=(v)) diff --git a/src/data.table.h b/src/data.table.h index 3a1e348a..5e003935 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -17,6 +17,10 @@ #define SEXPPTR(x) ((SEXP *)DATAPTR(x)) // to avoid overhead of looped VECTOR_ELT #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped VECTOR_ELT +// Needed for match.c and join.c +#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) +#define ENC2UTF8(s) (!NEED2UTF8(s) ? 
(s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) + // for use with bit64::integer64 #define NA_INTEGER64 INT64_MIN #define MAX_INTEGER64 INT64_MAX @@ -52,6 +56,8 @@ extern size_t sizes[100]; // max appears to be FUNSXP = 99, see Rinternals.h extern size_t typeorder[100]; // data.table_utils.c +int need2utf8(SEXP x); +SEXP coerceUtf8IfNeeded(SEXP x); SEXP setnames(SEXP x, SEXP nam); bool allNA(SEXP x, bool errorForBadType); SEXP allNAv(SEXP x, SEXP errorForBadType); diff --git a/src/data.table_utils.c b/src/data.table_utils.c index 0357051f..6d160d08 100644 --- a/src/data.table_utils.c +++ b/src/data.table_utils.c @@ -6,6 +6,32 @@ #include "data.table.h" +int need2utf8(SEXP x) { + const int xlen = length(x); + const SEXP *xd = STRING_PTR_RO(x); + // for (int i=0; i