Skip to content

Commit f29716e

Browse files
authored
Add native \p{M} (Unicode Mark) regex support for Qwen3.5 tokenizer (#1063)
1 parent e1cccaa commit f29716e

2 files changed

Lines changed: 90 additions & 0 deletions

File tree

operators/tokenizer/bpe_utils.hpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,51 @@ class PreTokenizerWithRegEx {
642642
return res;
643643
}
644644

645+
// "[^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+"
646+
std::u32string_view Match_Qwen35_Pattern_1() {
647+
648+
if (m_text.empty()) return {};
649+
650+
if ((!IsRN(m_text[0]) && !IsN(m_text[0])) || IsLM(m_text[0])) {
651+
if (IsLM(m_text[0]) || ((m_text.size() > 1) && IsLM(m_text[1]))) {
652+
size_t i = 1;
653+
for (; i < m_text.size(); ++i) {
654+
if (!IsLM(m_text[i])) break;
655+
}
656+
std::u32string_view res = m_text.substr(0, i);
657+
m_text = m_text.substr(i);
658+
return res;
659+
}
660+
}
661+
662+
return {};
663+
}
664+
665+
// " ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*"
666+
std::u32string_view Match_Qwen35_Pattern_2() {
667+
668+
if (m_text.empty()) return {};
669+
670+
auto pos = 0;
671+
if (m_text[0] == U' ') pos = 1;
672+
if (pos < m_text.size() && NotLMNZ(m_text[pos])) {
673+
size_t i = pos + 1;
674+
for (; i < m_text.size(); ++i) {
675+
if (!NotLMNZ(m_text[i])) break;
676+
}
677+
if (i < m_text.size() && IsRN(m_text[i])) {
678+
for (; i < m_text.size(); ++i) {
679+
if (!IsRN(m_text[i])) break;
680+
}
681+
}
682+
std::u32string_view res = m_text.substr(0, i);
683+
m_text = m_text.substr(i);
684+
return res;
685+
}
686+
687+
return {};
688+
}
689+
645690
// "(\p{N})"
646691
std::u32string_view Match_General_Pattern_1() {
647692

@@ -673,6 +718,8 @@ class PreTokenizerWithRegEx {
673718
{R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
674719
{R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
675720
{R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
721+
{R"([^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+)", &PreTokenizerWithRegEx::Match_Qwen35_Pattern_1},
722+
{R"( ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_Qwen35_Pattern_2},
676723
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
677724
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
678725
{R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
@@ -835,6 +882,12 @@ class PreTokenizerWithRegEx {
835882
return (category & ufal::unilib::unicode::Z) != 0;
836883
}
837884

885+
static bool IsLM(char32_t ch) {
886+
auto category = ufal::unilib::unicode::category(ch);
887+
return ((category & ufal::unilib::unicode::L) != 0 ||
888+
(category & ufal::unilib::unicode::M) != 0);
889+
}
890+
838891
static bool NotLNZ(char32_t ch) {
839892
// \r\n\t\f\v
840893
if (ch == U'\r' || ch == U'\n' || ch == U'\t' || ch == U'\f' || ch == U'\v') return false;
@@ -845,6 +898,17 @@ class PreTokenizerWithRegEx {
845898
return true;
846899
}
847900

901+
static bool NotLMNZ(char32_t ch) {
902+
// \r\n\t\f\v
903+
if (ch == U'\r' || ch == U'\n' || ch == U'\t' || ch == U'\f' || ch == U'\v') return false;
904+
auto category = ufal::unilib::unicode::category(ch);
905+
if (category & ufal::unilib::unicode::L) return false;
906+
if (category & ufal::unilib::unicode::M) return false;
907+
if (category & ufal::unilib::unicode::N) return false;
908+
if (category & ufal::unilib::unicode::Z) return false;
909+
return true;
910+
}
911+
848912
private:
849913
std::u32string_view m_text;
850914
char32_t m_last_char = 0;

test/pp_api_test/test_tokenizer_impl.cc

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,32 @@ TEST(OrtxTokenizerTest, RegexTest) {
4141
EXPECT_EQ(res, out_tokens);
4242
}
4343

44+
// Splits a short sample with the Qwen3.5 pre-tokenizer regex and checks the
// resulting pieces. The sample covers the [\p{L}\p{M}] and
// [^\s\p{L}\p{M}\p{N}] matchers, including a letter followed by a combining
// mark (U+0301).
TEST(OrtxTokenizerTest, Qwen35RegexTest) {
  // "cafe" + U+0301 (combining acute accent) must stay inside one token.
  std::u32string input = U"Hello, world! 42\ncaf\u0065\u0301 ok";
  auto splitter = std::make_unique<ort_extensions::bpe::PreTokenizerWithRegEx>();

  auto compile_status = splitter->Compile(
      R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+|\p{N}| ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+)");
  ASSERT_TRUE(compile_status.IsOk()) << compile_status.ToString();

  splitter->Set(input.c_str());

  // Drain the splitter; an empty view signals that no tokens remain.
  std::vector<std::u32string> tokens;
  for (std::u32string_view piece = splitter->GetNextToken(); !piece.empty();
       piece = splitter->GetNextToken()) {
    tokens.push_back(ustring(piece));
  }

  // Digits are split one by one, the lone space before "42" is its own token,
  // and the combining accent stays attached to its word.
  const std::vector<std::u32string> expected = {
      U"Hello", U",", U" world", U"!", U" ", U"4", U"2", U"\n",
      U"caf\u0065\u0301", U" ok"};
  EXPECT_EQ(tokens, expected);
}
69+
4470
TEST(OrtxTokenizerTest, AddedTokensTest) {
4571
auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
4672
auto status = tokenizer->Load("data/added-tokens");

0 commit comments

Comments
 (0)