Add native \p{M} (Unicode Mark) regex support for Qwen3.5 tokenizer (#1063)

apsonawane · web-flow · commit f29716e7f7f6 · 2026-05-15T17:15:02.000Z
diff --git a/operators/tokenizer/bpe_utils.hpp b/operators/tokenizer/bpe_utils.hpp
@@ -642,6 +642,51 @@ class PreTokenizerWithRegEx {
     return res;
   }
 
+  // "[^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+"
+  std::u32string_view Match_Qwen35_Pattern_1() {
+    // Consumes an optional one-character prefix plus a run of IsLM characters.
+    if (m_text.empty()) return {};
+
+    const char32_t head = m_text[0];
+    // The run starts at index 0, or index 0 passes neither IsRN nor IsN and the run starts at 1.
+    bool matched = IsLM(head);
+    if (!matched && !IsRN(head) && !IsN(head)) {
+      matched = (m_text.size() > 1) && IsLM(m_text[1]);
+    }
+    if (!matched) return {};
+
+    size_t end = 1;  // index 0 always belongs to the token once matched
+    while (end < m_text.size() && IsLM(m_text[end])) ++end;
+    std::u32string_view token = m_text.substr(0, end);
+    m_text = m_text.substr(end);  // advance past the consumed token
+    return token;
+  }
+
+  // " ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*"
+  std::u32string_view Match_Qwen35_Pattern_2() {
+    // Consumes, from the front of m_text: an optional single U' ', then one or
+    // more characters accepted by NotLMNZ, then every IsRN character that follows.
+    // Returns the consumed span, or an empty view (m_text untouched) when the
+    // mandatory NotLMNZ character is missing.
+    if (m_text.empty()) return {};
+
+    // size_t so the comparisons against m_text.size() never mix signed and unsigned.
+    const size_t pos = (m_text[0] == U' ') ? 1u : 0u;
+    if (pos >= m_text.size() || !NotLMNZ(m_text[pos])) return {};
+
+    // m_text[pos] already qualifies; extend over the rest of the mandatory run.
+    size_t i = pos + 1;
+    while (i < m_text.size() && NotLMNZ(m_text[i])) ++i;
+
+    // Optional trailing run of IsRN characters.
+    while (i < m_text.size() && IsRN(m_text[i])) ++i;
+
+    // Hand back the consumed span and advance m_text past it.
+    std::u32string_view res = m_text.substr(0, i);
+    m_text = m_text.substr(i);
+    return res;
+  }
+
   // "(\p{N})"
   std::u32string_view Match_General_Pattern_1() {
 
@@ -673,6 +718,8 @@ class PreTokenizerWithRegEx {
         {R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
         {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
+        {R"([^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+)", &PreTokenizerWithRegEx::Match_Qwen35_Pattern_1},
+        {R"( ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_Qwen35_Pattern_2},
         {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
         {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
         {R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
@@ -835,6 +882,12 @@ class PreTokenizerWithRegEx {
     return (category & ufal::unilib::unicode::Z) != 0;
   }
 
+  static bool IsLM(char32_t ch) {
+    // True when ch's category shares a bit with the L mask or with the M mask.
+    const auto category = ufal::unilib::unicode::category(ch);
+    return (category & ufal::unilib::unicode::L) || (category & ufal::unilib::unicode::M);
+  }
+
   static bool NotLNZ(char32_t ch) {
     // \r\n\t\f\v
     if (ch == U'\r' || ch == U'\n' || ch == U'\t' || ch == U'\f' || ch == U'\v') return false;
@@ -845,6 +898,17 @@ class PreTokenizerWithRegEx {
     return true;
   }
 
+  static bool NotLMNZ(char32_t ch) {
+    // Rejects \r \n \t \f \v explicitly, then anything whose category hits L, M, N or Z.
+    switch (ch) {
+      case U'\r': case U'\n': case U'\t': case U'\f': case U'\v':
+        return false;
+    }
+    const auto category = ufal::unilib::unicode::category(ch);
+    return !((category & ufal::unilib::unicode::L) || (category & ufal::unilib::unicode::M) ||
+             (category & ufal::unilib::unicode::N) || (category & ufal::unilib::unicode::Z));
+  }
+
  private:
   std::u32string_view m_text;
   char32_t m_last_char = 0;
diff --git a/test/pp_api_test/test_tokenizer_impl.cc b/test/pp_api_test/test_tokenizer_impl.cc
@@ -41,6 +41,32 @@ TEST(OrtxTokenizerTest, RegexTest) {
   EXPECT_EQ(res, out_tokens);
 }
 
+TEST(OrtxTokenizerTest, Qwen35RegexTest) {
+  // Splits a sample with the Qwen3.5 pre-tokenizer regex, whose [\p{L}\p{M}] and
+  // [^\s\p{L}\p{M}\p{N}] classes are the ones under test. The sample contains "cafe" + U+0301
+  // (combining acute accent) so that a mark character directly follows a letter.
+  const std::u32string input = U"Hello, world! 42\ncaf\u0065\u0301 ok";
+  const std::vector<std::u32string> expected = {
+      U"Hello", U",", U" world", U"!", U" ", U"4", U"2", U"\n",
+      U"caf\u0065\u0301", U" ok"};
+
+  auto splitter = std::make_unique<ort_extensions::bpe::PreTokenizerWithRegEx>();
+  auto status = splitter->Compile(
+      R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+|\p{N}| ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+)");
+  ASSERT_TRUE(status.IsOk()) << status.ToString();
+
+  splitter->Set(input.c_str());
+
+  // Collect tokens until the splitter hands back an empty one.
+  std::vector<std::u32string> tokens;
+  for (std::u32string_view tok = splitter->GetNextToken(); !tok.empty();
+       tok = splitter->GetNextToken()) {
+    tokens.push_back(ustring(tok));
+  }
+
+  EXPECT_EQ(tokens, expected);
+}
+
 TEST(OrtxTokenizerTest, AddedTokensTest) {
   auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
   auto status = tokenizer->Load("data/added-tokens");