Skip to content

Commit

Permalink
{171428666}: column character encoding
Browse files Browse the repository at this point in the history
This patch introduces a new `ENCODING` keyword that specifies the character
set of a column. Data that is not valid in the declared character set is
rejected. So far only `"utf8"` and `NONE` are supported.

This allows users to conveniently create an indexable utf8 cstring column.
It's implemented as a check constraint using the `utf8_validate()` function.

Signed-off-by: Rivers Zhang <[email protected]>
  • Loading branch information
riverszhang89 committed Jan 31, 2025
1 parent 2dc04c8 commit 00664f0
Show file tree
Hide file tree
Showing 17 changed files with 183 additions and 22 deletions.
32 changes: 19 additions & 13 deletions db/types.c
Original file line number Diff line number Diff line change
Expand Up @@ -3695,6 +3695,21 @@ TYPES_INLINE int CLIENT_BLOB_to_CLIENT_PSTR2(
return -1;
}

/**
 * Checks that the `max`-byte buffer `u` holds a valid UTF-8 string that is
 * NUL-terminated inside the buffer, optionally followed by more NUL padding.
 *
 * @param u    buffer to validate
 * @param max  number of bytes in `u`, including the terminator and padding
 * @return 0 if the buffer is acceptable, -1 otherwise
 *
 * NOTE(review): relies on utf8_validate() storing in `valid_len` the offset of
 * the first NUL byte (or `max` when no NUL was seen) -- confirm against its
 * definition.
 */
static int utf8_validate_permitting_trailing_zeros(const char *u, int max)
{
    int valid_len = 0;

    if (utf8_validate(u, max, &valid_len) != 0)
        return -1;

    /* The string must be terminated inside the buffer; a buffer with no NUL
     * at all used to be rejected by the callers' `valid_len != len - 1`
     * check and must still be rejected here. */
    if (valid_len >= max)
        return -1;

    /* utf8_validate() stops at the 1st NUL character. We want to permit
     * trailing zeros, but nothing else: every remaining byte, including the
     * very last one, has to be NUL. */
    for (; valid_len < max; ++valid_len) {
        if (u[valid_len] != '\0')
            return -1;
    }
    return 0;
}

/**
* Finds out where the input vutf8 string is stored and then determines where it
* should be copied and copies it. Doesn't deal with NULLs.
Expand All @@ -3717,7 +3732,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
blob_buffer_t *inblob,
blob_buffer_t *outblob, int *outdtsz)
{
int valid_len;
if (out_len > 0)
memset(out, 0, out_len);

Expand All @@ -3742,10 +3756,8 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
/* validate input blob */
assert(inblob->length == len);

if (utf8_validate(inblob->data, inblob->length, &valid_len) ||
valid_len != len - 1) {
if (utf8_validate_permitting_trailing_zeros(inblob->data, inblob->length))
return -1;
}

memcpy(outblob, inblob, sizeof(blob_buffer_t));
bzero(inblob, sizeof(blob_buffer_t));
Expand All @@ -3767,8 +3779,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,

/* if the string isn't empty, validate the string and make sure its
* length matches len (minus 1 for the NUL byte) */
if (len > 0 &&
(utf8_validate(in, len, &valid_len) || valid_len != len - 1))
if (len > 0 && utf8_validate_permitting_trailing_zeros(in, len))
return -1;

memcpy(out, in, len);
Expand All @@ -3785,7 +3796,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
* fit in the out buffer, then the string needs to be copied from the in
* buffer to a new out blob */
else if (len <= in_len) {
int valid_len;

if (outblob) {
if (len > gbl_blob_sz_thresh_bytes)
Expand All @@ -3800,8 +3810,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,

/* if the string isn't empty, validate the string and make sure its
* length matches len (minus 1 for the NUL byte) */
if (len > 0 &&
(utf8_validate(in, len, &valid_len) || valid_len != len - 1))
if (len > 0 && utf8_validate_permitting_trailing_zeros(in, len))
return -1;

memcpy(outblob->data, in, len);
Expand All @@ -3821,8 +3830,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
* blob to the out buffer */
else /* len <= out_len */
{
int valid_len;

/* Do not attempt to convert a blob placeholder (i.e., length == -2) */
if (inblob && inblob->length != OSQL_BLOB_FILLER_LENGTH) {
if (!inblob->exists || !inblob->data) {
Expand All @@ -3832,8 +3839,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,

/* if the string isn't empty, validate the string and make sure its
* length matches len (minus 1 for the NUL byte) */
if (len > 0 && (utf8_validate(inblob->data, len, &valid_len) ||
valid_len != len - 1))
if (len > 0 && utf8_validate_permitting_trailing_zeros(inblob->data, len))
return -1;

memcpy(out, inblob->data, len);
Expand Down
Binary file modified docs/images/alter-table-ddl.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/images/column-constraint.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 12 additions & 0 deletions docs/src/sqlitegen/bubble-generator-data.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,12 @@ stack
}
}
{line OPTION DBPAD = signed-number }
{line ENCODING
{or
{line /string-literal}
{line NONE}
}
}
}

table-constraint {
Expand Down Expand Up @@ -828,6 +834,12 @@ stack
}
NOT NULL
}
{line ENCODING
{or
{line /string-literal}
{line NONE}
}
}
}
}
{line OPTIONS ( table-options ) }
Expand Down
7 changes: 5 additions & 2 deletions schemachange/sc_records.c
Original file line number Diff line number Diff line change
Expand Up @@ -545,12 +545,12 @@ static int prepare_and_verify_newdb_record(struct convert_record_data *data,
if (rc < 0) {
logmsg(LOGMSG_DEBUG, "%s:%d internal error during CHECK constraint\n",
__func__, __LINE__);
return ERR_CONSTR;
return ERR_CHECK_CONSTRAINT;
} else if (rc > 0) {
logmsg(LOGMSG_DEBUG, "%s:%d CHECK constraint failed for '%s'\n",
__func__, __LINE__,
data->iq.usedb->check_constraints[rc - 1].consname);
return ERR_CONSTR;
return ERR_CHECK_CONSTRAINT;
}

rc = verify_record_constraint(&data->iq, data->to, data->trans, p_buf_data,
Expand Down Expand Up @@ -1123,6 +1123,9 @@ static int convert_record(struct convert_record_data *data)
} else if (rc == ERR_VERIFY_PI) {
sc_client_error(data->s, "Error verifying partial indexes! rrn %d genid 0x%llx", rrn, genid);
return -2;
} else if (rc == ERR_CHECK_CONSTRAINT) {
sc_client_error(data->s, "Record violates check constraints rrn %d genid 0x%llx", rrn, genid);
return -2;
} else if (rc != 0) {
sc_client_error(data->s,
"Error adding record rcode %d opfailcode %d ixfailnum %d rrn %d genid 0x%llx, stripe %d", rc,
Expand Down
70 changes: 66 additions & 4 deletions sqlite/src/comdb2build.c
Original file line number Diff line number Diff line change
Expand Up @@ -6406,7 +6406,7 @@ void comdb2DeferForeignKey(Parse *pParse, int isDeferred)
return;
}

static void drop_constraint(Parse *pParse, Token *pName, int type)
static void drop_constraint(Parse *pParse, Token *pName, int type, int hush)
{
if (comdb2IsPrepareOnly(pParse))
return;
Expand All @@ -6433,7 +6433,7 @@ static void drop_constraint(Parse *pParse, Token *pName, int type)
if (cons) {
/* Mark it as dropped. */
cons->flags |= CONS_DELETED;
} else {
} else if (!hush) {
pParse->rc = SQLITE_ERROR;
sqlite3ErrorMsg(pParse, "Constraint '%s' not found.", name);
goto cleanup;
Expand All @@ -6454,15 +6454,15 @@ void comdb2DropForeignKey(Parse *pParse, /* Parser context */
Token *pName /* Foreign key name */
)
{
drop_constraint(pParse, pName, CONS_FKEY);
drop_constraint(pParse, pName, CONS_FKEY, 0);
return;
}

void comdb2DropConstraint(Parse *pParse, /* Parser context */
Token *pName /* Foreign key name */
)
{
drop_constraint(pParse, pName, CONS_ALL);
drop_constraint(pParse, pName, CONS_ALL, 0);
return;
}

Expand Down Expand Up @@ -7690,3 +7690,65 @@ void create_default_consumer_sp(Parse *p, char *spname)
comdb2prepareNoRows(v, p, 0, sc, &comdb2SqlSchemaChange, (vdbeFuncArgFree)&free_schema_change_type);

}

/**
 * Implements the `ENCODING <string>` and `ENCODING NONE` column clauses.
 *
 * A character encoding is enforced through a generated CHECK constraint named
 * "$" GEN_CONS_PREFIX "_CHAR_ENC_<column>" whose expression is
 * `utf8_validate(<column>)=0`.
 *
 * @param pParse  parser context
 * @param t       charset name token, or NULL for `ENCODING NONE` (which drops
 *                the generated constraint if there is one)
 * @param alter   non-zero when invoked from ALTER COLUMN (operates on
 *                ctx->alter_column); zero when invoked from a column
 *                definition (operates on the last column of the schema)
 */
void comdb2ChangeCharacterSet(Parse *pParse, Token *t, int alter)
{
    struct comdb2_ddl_context *ctx;
    struct comdb2_column *column = NULL;
    sqlite3 *db = pParse->db;

    char *charset = NULL;
    char expr[MAXCOLNAME + sizeof("utf8_validate()=0")];
    char constraint_name[MAXCOLNAME + sizeof("$" GEN_CONS_PREFIX "_CHAR_ENC_")];
    int nw;

    Token colToken;
    Token funcToken;
    ExprList *arg;
    Expr *func;
    Expr *zero;
    Expr *equality;

    /* Same guard as drop_constraint(): nothing to build when only preparing. */
    if (comdb2IsPrepareOnly(pParse))
        return;

    if (t != NULL) {
        charset = sqlite3NameFromToken(db, t);
        if (charset == NULL)
            return;

        /* so far only utf8 is supported */
        if (strcasecmp(charset, "utf8") != 0 && strcasecmp(charset, "utf-8") != 0) {
            setError(pParse, SQLITE_MISUSE, "unknown charset");
            goto out;
        }
    }

    ctx = pParse->comdb2_ddl_ctx;
    if (ctx != NULL) {
        if (alter)
            column = ctx->alter_column;
        else
            column = (struct comdb2_column *)LISTC_BOT(&ctx->schema->column_list);
    }

    /* No DDL context or no target column: bail out instead of dereferencing
     * NULL. Only report an error if an earlier step has not already done so,
     * to avoid masking the original message. */
    if (column == NULL) {
        if (pParse->rc == SQLITE_OK)
            setError(pParse, SQLITE_MISUSE, "no column to apply character encoding to");
        goto out;
    }

    if (column->type != SQL_TYPE_CSTRING && column->type != SQL_TYPE_VARCHAR && column->type != SQL_TYPE_CHAR) {
        setError(pParse, SQLITE_MISUSE, "invalid column type to use character encoding");
        goto out;
    }

    snprintf(constraint_name, sizeof(constraint_name), "$" GEN_CONS_PREFIX "_CHAR_ENC_%s", column->name);
    sqlite3TokenInit(&pParse->constraintName, constraint_name);

    if (t == NULL) {
        /* ENCODING NONE: drop quietly, the constraint may not exist. */
        drop_constraint(pParse, &pParse->constraintName, CONS_CHECK, 1);
    } else {
        sqlite3TokenInit(&colToken, column->name);
        sqlite3TokenInit(&funcToken, "utf8_validate");

        /* Build the expression tree for: utf8_validate(<column>) = 0 */
        arg = sqlite3ExprListAppend(pParse, NULL, sqlite3ExprAlloc(db, TK_ID, &colToken, 0));
        func = sqlite3ExprFunction(pParse, arg, &funcToken, 0);
        zero = sqlite3ExprAlloc(db, TK_INTEGER, &sqlite3IntTokens[0], 0);

        equality = sqlite3PExpr(pParse, TK_EQ, func, zero);
        nw = snprintf(expr, sizeof(expr), "utf8_validate(%s)=0", column->name);
        comdb2AddCheckConstraint(pParse, equality, expr, expr + nw + 1);
    }

    /* constraintName points into constraint_name, a stack buffer that dies
     * when this function returns. Clear it so no later constraint clause
     * picks up a dangling name. */
    pParse->constraintName.z = NULL;
    pParse->constraintName.n = 0;

out:
    sqlite3DbFree(db, charset);
}
1 change: 1 addition & 0 deletions sqlite/src/comdb2build.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ void comdb2AddIndex(Parse *, Token *, ExprList *, int, Expr *, const char *,
const char *, int, u8, int, ExprList *);
void comdb2AddDbpad(Parse *, int);
void comdb2AddCheckConstraint(Parse *, Expr *, const char *, const char *);
void comdb2ChangeCharacterSet(Parse *pParse, Token *, int);
void comdb2CreateIndex(Parse *, Token *, Token *, SrcList *, ExprList *, int,
Token *, Expr *, const char *, const char *, int, int,
u8, int, ExprList *, int);
Expand Down
31 changes: 31 additions & 0 deletions sqlite/src/func.c
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,36 @@ static void uncompressGzipFunc(
return;
}

/*
** SQL function utf8_validate(X).
**
** Returns 0 if X (a TEXT or BLOB value) passes utf8_validate(). Otherwise
** returns (-N - 1), where N is the valid length reported by utf8_validate(),
** i.e. the index of the first malformed character.
**
** Any other argument type (NULL, INTEGER, FLOAT) is not validated and yields
** -1, the same value as a payload whose very first character is malformed.
*/
int utf8_validate(const char *str, int len, int *valid_len);
static void comdb2Utf8ValidateFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  /* Start at 0 so the result is well defined when no validation ran (the
  ** default: branch) or when utf8_validate() fails without storing a length. */
  int valid_len = 0;
  int rc, len;
  const char *z;
  assert(argc == 1);
  UNUSED_PARAMETER(argc);

  switch( sqlite3_value_type(argv[0]) ){
    case SQLITE_BLOB:
      len = sqlite3_value_bytes(argv[0]);
      z = sqlite3_value_blob(argv[0]);
      rc = utf8_validate(z, len, &valid_len);
      break;
    case SQLITE_TEXT:
      /* Fetch the text pointer first and its byte count second: SQLite
      ** documents this order as the safe one, because obtaining the text may
      ** convert the value and change its length. */
      z = (const char *)sqlite3_value_text(argv[0]);
      if( z==0 ){
        sqlite3_result_error_nomem(context);
        return;
      }
      len = sqlite3_value_bytes(argv[0]) + 1; /* +1 for \0 */
      rc = utf8_validate(z, len, &valid_len);
      break;
    default:
      rc = -1;
      break;
  }
  sqlite3_result_int(context, rc == 0 ? rc : (-valid_len - 1));
}
#endif /* defined(SQLITE_BUILDING_FOR_COMDB2) */

/*
Expand Down Expand Up @@ -3093,6 +3123,7 @@ void sqlite3RegisterBuiltinFunctions(void){
FUNCTION(comdb2_starttime, 0, 0, 0, comdb2StartTimeFunc),
FUNCTION(comdb2_user, 0, 0, 0, comdb2UserFunc),
FUNCTION(comdb2_last_cost, 0, 0, 0, comdb2LastCostFunc),
FUNCTION(utf8_validate, 1, 0, 0, comdb2Utf8ValidateFunc),
FUNCTION(checksum_md5, 1, 0, 0, md5Func),
FUNCTION(compress, 1, 0, 0, compressFunc),
FUNCTION(uncompress, 1, 0, 0, uncompressFunc),
Expand Down
8 changes: 8 additions & 0 deletions sqlite/src/parse.y
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,8 @@ ccons ::= PRIMARY KEY sortorder(Z) onconf(R) autoinc(I).
{sqlite3AddPrimaryKey(pParse,0,R,I,Z);}
%endif !SQLITE_BUILDING_FOR_COMDB2
%ifdef SQLITE_BUILDING_FOR_COMDB2
ccons ::= ENCODING STRING(H). {comdb2ChangeCharacterSet(pParse,&H,0);}
ccons ::= ENCODING NONE. {comdb2ChangeCharacterSet(pParse,NULL,0);}
ccons ::= UNIQUE onconf(R). {
comdb2AddIndex(pParse, 0, 0, R, 0, 0, 0, SQLITE_SO_ASC,
SQLITE_IDXTYPE_UNIQUE, 0, 0);
Expand Down Expand Up @@ -2037,6 +2039,12 @@ alter_table_alter_column_cmd ::= SET NOT NULL. {
alter_table_alter_column_cmd ::= DROP NOT NULL. {
comdb2AlterColumnDropNotNull(pParse);
}
alter_table_alter_column_cmd ::= ENCODING STRING(H). {
comdb2ChangeCharacterSet(pParse,&H,1);
}
alter_table_alter_column_cmd ::= ENCODING NONE. {
comdb2ChangeCharacterSet(pParse,NULL,1);
}
alter_table_alter_column ::= alter_table_alter_column_start
alter_table_alter_column_cmd. {
comdb2AlterColumnEnd(pParse);
Expand Down
1 change: 1 addition & 0 deletions sqlite/tool/mkkeywordhash.c
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ static Keyword aKeywordTable[] = {
{ "DISTINCT", "TK_DISTINCT", ALWAYS },
{ "DO", "TK_DO", UPSERT },
{ "DROP", "TK_DROP", ALWAYS },
{ "ENCODING", "TK_ENCODING", ALWAYS },
{ "END", "TK_END", ALWAYS },
{ "EACH", "TK_EACH", TRIGGER },
{ "ELSE", "TK_ELSE", ALWAYS },
Expand Down
2 changes: 2 additions & 0 deletions tests/auth.test/t09.expected
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
(candidate='EACH')
(candidate='ELSE')
(candidate='ENABLE')
(candidate='ENCODING')
(candidate='END')
(candidate='ESCAPE')
(candidate='EXCEPT')
Expand Down Expand Up @@ -377,6 +378,7 @@
(candidate='unlikely()')
(candidate='upper()')
(candidate='usleep()')
(candidate='utf8_validate()')
(candidate='zeroblob()')
(username='user1')
(username='user2')
Expand Down
5 changes: 3 additions & 2 deletions tests/comdb2sys.test/comdb2sys.expected
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@
(tablename='t3', bytes=73728)
(tablename='t4', bytes=73728)
[select * from comdb2_tablesizes order by tablename] rc 0
(KEYWORDS_COUNT=223)
(KEYWORDS_COUNT=224)
[SELECT COUNT(*) AS KEYWORDS_COUNT FROM comdb2_keywords] rc 0
(RESERVED_KW=66)
(RESERVED_KW=67)
[SELECT COUNT(*) AS RESERVED_KW FROM comdb2_keywords WHERE reserved = 'Y'] rc 0
(NONRESERVED_KW=157)
[SELECT COUNT(*) AS NONRESERVED_KW FROM comdb2_keywords WHERE reserved = 'N'] rc 0
Expand All @@ -104,6 +104,7 @@
(name='DISTINCT', reserved='Y')
(name='DROP', reserved='Y')
(name='ELSE', reserved='Y')
(name='ENCODING', reserved='Y')
(name='ESCAPE', reserved='Y')
(name='EXCEPT', reserved='Y')
(name='EXISTS', reserved='Y')
Expand Down
2 changes: 1 addition & 1 deletion tests/ddl_no_csc2.test/t09_check.expected
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
')
(rows inserted=1)
(rows inserted=1)
[ALTER TABLE t1 ADD CONSTRAINT valid_colors CHECK (color IN ('red', 'green', 'blue'))] failed with rc 240 Record violates foreign constraints rrn xx genid xx
[ALTER TABLE t1 ADD CONSTRAINT valid_colors CHECK (color IN ('red', 'green', 'blue'))] failed with rc 240 Record violates check constraints rrn xx genid xx
(csc2='schema
{
cstring color[11] null = yes
Expand Down
17 changes: 17 additions & 0 deletions tests/ddl_no_csc2.test/t15_encoding.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[CREATE TABLE t15(a INTEGER ENCODING 'ascii')] failed with rc -3 unknown charset
[CREATE TABLE t15(a INTEGER ENCODING 'utf8')] failed with rc -3 invalid column type to use character encoding
[CREATE TABLE t15(a TEXT ENCODING 'utf8')] failed with rc -3 invalid column type to use character encoding
(csc2='schema
{
cstring a[11] null = yes
}
constraints
{
check "$CONSTRAINT_CHAR_ENC_a" = {where utf8_validate(a)=0}
}
')
[INSERT INTO t15 VALUES (CAST(x'616263FF616263' AS TEXT))] failed with rc 403 CHECK constraint violation CHECK constraint failed for '$CONSTRAINT_CHAR_ENC_a' unable to add record rc = 320
(COUNT(*)=0)
(rows inserted=1)
(COUNT(*)=1)
[ALTER TABLE t15 ALTER COLUMN a ENCODING 'utf8'] failed with rc 240 Record violates check constraints rrn xx genid xx
Loading

0 comments on commit 00664f0

Please sign in to comment.