Skip to content

Commit

Permalink
unicode support implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
ony committed Feb 27, 2014
1 parent 3050f43 commit 703c185
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 20 deletions.
9 changes: 9 additions & 0 deletions inc/pjson.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#define __poll_json_h__

#include <string.h>
#include <stdint.h>
#include <wchar.h>

#ifdef __cplusplus
extern "C" {
Expand All @@ -39,6 +41,13 @@ typedef struct {

int state;
const char *ptr; /* current position within chunk */

union {
struct {
uint32_t c; /* may contain surrogate pair */
mbstate_t s;
} str;
};
} pj_parser, *pj_parser_ref;

typedef enum {
Expand Down
2 changes: 1 addition & 1 deletion src/pjson_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ static inline void trace_token(const char *file, int line, pj_token *token)
PRINT_TRACE("%s+%d: token %d (STR) \"%.*s\"", file, line, token->token_type, (int)token->len, token->str);
break;
default:
TRACEF("%s+%d: token %d (%s)", file, line, token->token_type, token_type_name(token->token_type));
PRINT_TRACE("%s+%d: token %d (%s)", file, line, token->token_type, token_type_name(token->token_type));
}
}
#endif
Expand Down
37 changes: 23 additions & 14 deletions src/pjson_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,13 @@ typedef enum {
S_SPACE,
S_VALUE,
S_NUM,
S_N = 10, S_NU, S_NUL,
S_T = 20, S_TR, S_TRU,
S_F = 30, S_FA, S_FAL, S_FALS,
S_STR = 40, S_ESC, S_STR_VALUE, /* str may end up as key */
S_N, S_NU, S_NUL,
S_T, S_TR, S_TRU,
S_F, S_FA, S_FAL, S_FALS,
S_STR, S_ESC,
S_UNICODE, S_UNICODE_FINISH = S_UNICODE + 4, /* 4 hex digits */
S_UNICODE_ESC, /* handle surrogate pairs */
S_STR_VALUE, /* str may end up as key */
} state;

#define F_BUF 0x100
Expand Down Expand Up @@ -61,24 +64,30 @@ static void pj_set_end(pj_parser_ref parser)
}
}

/*
 * Check that `len` more bytes fit between the buffer write position
 * (parser->buf_ptr) and the end of the buffer (parser->buf_end).
 *
 * Nothing is written and buf_ptr is not advanced; this is only a capacity
 * check.
 *
 * On success returns true and leaves parser and token untouched.
 * On failure returns false after reporting the overflow to the caller:
 *   - token->token_type is set to PJ_OVERFLOW,
 *   - token->len is set to the requested length,
 *   - parser->ptr is set to `p` (the input position to resume from).
 */
static bool pj_reserve(pj_parser_ref parser, pj_token *token, size_t len, const char *p)
{
	/*
	 * Compare sizes instead of forming `buf_ptr + len`: a pointer more than
	 * one past the end of the buffer is undefined behaviour and the
	 * comparison could wrap for a large `len`.
	 */
	const size_t room = parser->buf_ptr < parser->buf_end
		? (size_t)(parser->buf_end - parser->buf_ptr)
		: 0;

	if (len > room)
	{
		/* shortfall relative to the space that is still free */
		TRACEF("overflow required %zu more", len - room);
		token->token_type = PJ_OVERFLOW;
		token->len = len;
		parser->ptr = p;
		return false;
	}
	return true;
}

/*
 * Append `len` bytes from `block` to the parser buffer.
 *
 * A zero-length block is a no-op that succeeds. Otherwise the capacity check
 * is delegated to pj_reserve(); when it fails, pj_reserve() has already
 * filled `token` with PJ_OVERFLOW and recorded `p` as the resume position,
 * and this function returns false without copying anything.
 *
 * On success the bytes are copied to parser->buf_ptr, which is advanced by
 * exactly `len` (once), and true is returned.
 */
static bool pj_add_block(pj_parser_ref parser, pj_token *token, const char *block, size_t len, const char *p)
{
	TRACE_FUNC();

	if (len == 0) return true;

	/* ensure that we have enough space */
	if (!pj_reserve(parser, token, len, p)) return false;

	(void) memcpy(parser->buf_ptr, block, len);
	parser->buf_ptr += len;
	return true;
}
Expand Down
134 changes: 131 additions & 3 deletions src/pjson_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,23 @@
#ifndef __pjson_string_h__
#define __pjson_string_h__

#include <errno.h>
#include <string.h>
#include <arpa/inet.h>

#include "pjson.h"
#include "pjson_state.h"
#include "pjson_debug.h"

static bool pj_string_esc(pj_parser_ref parser, pj_token *token, const char *p);
static bool pj_unicode(pj_parser_ref parser, pj_token *token, const char *p);
static bool pj_unicode_esc(pj_parser_ref parser, pj_token *token, const char *p);

static bool pj_string(pj_parser_ref parser, pj_token *token, const char *p)
{
TRACE_FUNC();
assert( parser->str.c == 0 && mbsinit(&parser->str.s) );

const char * const p_end = parser->chunk_end;

for (;;)
Expand Down Expand Up @@ -66,7 +74,7 @@ static bool pj_string(pj_parser_ref parser, pj_token *token, const char *p)

case '\\':
if (!pj_add_chunk(parser, token, p)) return false;
parser->state = S_STR | F_BUF;
parser->state = S_ESC | F_BUF;
return pj_string_esc(parser, token, ++p);

default: ++p;
Expand All @@ -78,6 +86,8 @@ static bool pj_string(pj_parser_ref parser, pj_token *token, const char *p)
static bool pj_string_esc(pj_parser_ref parser, pj_token *token, const char *p)
{
TRACE_FUNC();
assert( parser->str.c == 0 && mbsinit(&parser->str.s) );

const char * const p_end = parser->chunk_end;
if (p == p_end)
{
Expand All @@ -90,7 +100,7 @@ static bool pj_string_esc(pj_parser_ref parser, pj_token *token, const char *p)
TRACEF("char '%c'", *p);
switch (*p)
{
/* guarded chars */
/* guarded chars */
case '"': case '/': case '\\':
return pj_string(parser, token, ++p);

Expand All @@ -114,12 +124,130 @@ static bool pj_string_esc(pj_parser_ref parser, pj_token *token, const char *p)
case 'r':
if (!pj_add_block(parser, token, "\r", 1, p)) return false;
parser->chunk = ++p;
return pj_string(parser, token, ++p);
return pj_string(parser, token, p);

case 'u': return pj_unicode_esc(parser, token, p);

default:
pj_err_tok(parser, token);
return false;
}
}

/*
 * Consume the four hex digits of a JSON "\uXXXX" escape, starting at `p`,
 * and append the decoded character to the parser buffer using wcrtomb()
 * (i.e. in the multibyte encoding of the current LC_CTYPE locale).
 *
 * State kept across chunks:
 *   - the number of digits already consumed is encoded in the parser state
 *     as S_UNICODE + n (n == 4 is S_UNICODE_FINISH);
 *   - parser->str.c holds the code unit being accumulated in its low 16
 *     bits and a pending high surrogate, if any, in its high 16 bits.
 *
 * Surrogate handling: a high surrogate (D800..DBFF) is only remembered and
 * must be followed immediately by another "\u" escape carrying a low
 * surrogate (DC00..DFFF); the pair is then combined into one code point.
 * Any other combination (high after high, low without high, non-low after
 * high, high followed by anything but a backslash) is reported as an error.
 *
 * The write goes straight to parser->buf_ptr; pj_unicode_esc() reserves
 * MB_CUR_MAX bytes before entering here.
 * NOTE(review): other entry points (resume after a partial token) are not
 * visible in this file section; they are assumed to rely on that same
 * reservation.
 *
 * Returns false when input ran out (partial token emitted) or on error
 * (error token emitted); otherwise returns whatever the continuation
 * (pj_unicode_esc() or pj_string()) returns.
 */
static bool pj_unicode(pj_parser_ref parser, pj_token *token, const char *p)
{
	TRACE_FUNC();
	const char * const p_end = parser->chunk_end;
	uint32_t c = parser->str.c & ~(uint32_t)0xffff; /* pending high surrogate << 16 */
	uint16_t c16 = parser->str.c & 0xffff;          /* code unit accumulated so far */
	size_t n = pj_state(parser) - S_UNICODE;        /* hex digits consumed so far */

	for (;;)
	{
		TRACE_PARSER(parser, p);
		if (p == p_end)
		{
			/* out of input: remember the digit count and the partial value */
			parser->state = (S_UNICODE + n) | F_BUF;
			parser->str.c = c | c16;
			pj_part_tok(parser, token, p);
			return false;
		}

		if (n < 4)
		{
			/* accumulate one more hex digit */
			const char ch = *p;
			int digit;
			if (ch >= '0' && ch <= '9')
			{
				digit = ch - '0';
			}
			else if (ch >= 'a' && ch <= 'f')
			{
				digit = 10 + (ch - 'a');
			}
			else if (ch >= 'A' && ch <= 'F')
			{
				digit = 10 + (ch - 'A');
			}
			else
			{
				pj_err_tok(parser, token);
				return false;
			}
			c16 = (uint16_t)(c16 * 0x10 + digit);
			++p; ++n;
			continue;
		}

		/* all four digits are in: classify the code unit */
		const bool pending = (c != 0);
		const bool high = 0xd800 <= c16 && c16 <= 0xdbff;
		const bool low = 0xdc00 <= c16 && c16 <= 0xdfff;

		if (high)
		{
			if (pending) /* two high surrogates in a row */
			{
				pj_err_tok(parser, token);
				return false;
			}
			/* first half of a pair: keep it until the low surrogate arrives */
			parser->str.c = (uint32_t)c16 << 16;
			TRACEF("surrogate %08x", parser->str.c);
		}
		else
		{
			/* a low surrogate must follow, and may only follow, a high one */
			if (pending != low)
			{
				pj_err_tok(parser, token);
				return false;
			}

			wchar_t wc;
			if (pending) /* this is surrogate pair */
			{
				const uint32_t hi10 = (c >> 16) - 0xd800;
				const uint32_t lo10 = (uint32_t)c16 - 0xdc00;
				wc = (wchar_t)(0x010000 + ((hi10 << 10) | lo10));
			}
			else
			{
				/* the code point is a plain number; no byte swapping */
				wc = (wchar_t)c16;
			}

			/* flush wchar */
			size_t encoded = wcrtomb(parser->buf_ptr, wc, &parser->str.s);
			if (encoded == (size_t)-1)
			{
				TRACEF("invalid unicode: (errno=#%d) %s", errno, strerror(errno));
				pj_err_tok(parser, token);
				return false;
			}
			TRACEF("wc = %04x (%C) encoded into %zu bytes", (unsigned)wc, (wint_t)wc, encoded);
			parser->buf_ptr += encoded;
			parser->str.c = 0;
		}

		/* next unicode? (p != p_end was checked at the top of the loop) */
		if (*p == '\\')
		{
			return pj_unicode_esc(parser, token, ++p);
		}
		if (high)
		{
			/* a high surrogate must be followed by another escape */
			pj_err_tok(parser, token);
			return false;
		}
		parser->chunk = p;
		parser->str.s = (mbstate_t) { 0 }; /* reset multibyte state */
		return pj_string(parser, token, p);
	}
}

/*
 * Dispatch on the character that follows a backslash inside a run of
 * "\uXXXX" escapes (`p` points at that character).
 *
 *   - No input left: store S_UNICODE_ESC as the resume state, emit a partial
 *     token and return false.
 *   - 'u': reserve MB_CUR_MAX bytes in the buffer for the character about to
 *     be encoded, then hand over to pj_unicode(). If the reservation fails,
 *     pj_reserve() has already filled the overflow token; the state is set
 *     to S_UNICODE_ESC so the same character is examined again on resume.
 *   - Anything else: this is an ordinary escape. If a high surrogate is
 *     still pending (parser->str.c != 0) it can no longer be completed, so
 *     an error token is emitted, matching what pj_unicode() does when a
 *     high surrogate is followed by a plain character. Otherwise the
 *     unicode state is cleared and pj_string_esc() takes over.
 */
static bool pj_unicode_esc(pj_parser_ref parser, pj_token *token, const char *p)
{
	TRACE_FUNC();
	const char * const p_end = parser->chunk_end;
	if (p == p_end)
	{
		parser->state = S_UNICODE_ESC | F_BUF;
		pj_part_tok(parser, token, p);
		return false;
	}

	if (*p == 'u')
	{
		if (!pj_reserve(parser, token, MB_CUR_MAX, p))
		{
			parser->state = S_UNICODE_ESC | F_BUF;
			return false;
		}
		parser->state = S_UNICODE | F_BUF;
		return pj_unicode(parser, token, ++p);
	}

	if (parser->str.c != 0)
	{
		/* unpaired high surrogate: do not drop it silently */
		pj_err_tok(parser, token);
		return false;
	}

	parser->state = S_ESC | F_BUF;
	/* reset multibyte state (pj_string_esc asserts that it is clean) */
	parser->str.c = 0;
	parser->str.s = (mbstate_t) { 0 };
	return pj_string_esc(parser, token, p);
}

#endif
35 changes: 33 additions & 2 deletions test/str.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,38 @@ TEST(str, utf8_direct)
EXPECT_EQ( PJ_STARVING, tokens[1].token_type );
}

TEST(str, DISABLED_utf8_escape_bmp)
/*
 * Platform sanity check: after selecting the en_US.utf8 locale, wcrtomb()
 * applied to a wide character must yield the same bytes as the u8"" literal
 * for that character.
 *
 * NOTE(review): both character literals below are empty in this copy of the
 * file; the original character appears to have been lost in transit and
 * must be restored before this test can compile.
 */
TEST(str, platform_utf8)
{
setlocale(LC_CTYPE, "en_US.utf8");
char buf[MB_CUR_MAX];
mbstate_t mbs { 0 };

wchar_t c = u'';
string c_mb = u8"";
/* encode the wide character and compare length and bytes with the literal */
size_t n = wcrtomb(buf, c, &mbs);
ASSERT_NE( (size_t)-1, n );
EXPECT_EQ( c_mb.size(), n );
EXPECT_EQ( c_mb, string(buf, n) );
}

/*
 * Platform sanity check for a character outside the BMP: under the
 * en_US.utf8 locale, wcrtomb() must encode U+1D11E to the same bytes as the
 * u8"" literal, and the wide literal itself must hold the full code point
 * 0x1d11e.
 */
TEST(str, platform_utf8_surrogate)
{
	setlocale(LC_CTYPE, "en_US.utf8");

	wchar_t clef = L'𝄞';
	string expected = u8"𝄞";

	char encoded[MB_CUR_MAX];
	mbstate_t shift_state { 0 };
	size_t written = wcrtomb(encoded, clef, &shift_state);

	/* stop here if the conversion itself failed */
	ASSERT_NE( (size_t)-1, written );
	EXPECT_EQ( expected.size(), written );
	EXPECT_EQ( expected, string(encoded, written) );
	EXPECT_EQ( 0x1d11e, clef );
}

TEST(str, utf8_escape_bmp)
{
setlocale(LC_CTYPE, "en_US.utf8");
pj_parser parser;
char buf[256];
pj_init(&parser, buf, sizeof(buf));
Expand All @@ -336,8 +366,9 @@ TEST(str, DISABLED_utf8_escape_bmp)
EXPECT_EQ( PJ_STARVING, tokens[1].token_type );
}

TEST(str, DISABLED_utf8_surrogate_pair)
TEST(str, utf8_surrogate_pair)
{
setlocale(LC_CTYPE, "en_US.utf8");
pj_parser parser;
char buf[256];
pj_init(&parser, buf, sizeof(buf));
Expand Down

0 comments on commit 703c185

Please sign in to comment.