@@ -642,6 +642,51 @@ class PreTokenizerWithRegEx {
642642 return res;
643643 }
644644
645+ // "[^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+"
646+ std::u32string_view Match_Qwen35_Pattern_1 () {
647+
648+ if (m_text.empty ()) return {};
649+
650+ if ((!IsRN (m_text[0 ]) && !IsN (m_text[0 ])) || IsLM (m_text[0 ])) {
651+ if (IsLM (m_text[0 ]) || ((m_text.size () > 1 ) && IsLM (m_text[1 ]))) {
652+ size_t i = 1 ;
653+ for (; i < m_text.size (); ++i) {
654+ if (!IsLM (m_text[i])) break ;
655+ }
656+ std::u32string_view res = m_text.substr (0 , i);
657+ m_text = m_text.substr (i);
658+ return res;
659+ }
660+ }
661+
662+ return {};
663+ }
664+
665+ // " ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*"
666+ std::u32string_view Match_Qwen35_Pattern_2 () {
667+
668+ if (m_text.empty ()) return {};
669+
670+ auto pos = 0 ;
671+ if (m_text[0 ] == U' ' ) pos = 1 ;
672+ if (pos < m_text.size () && NotLMNZ (m_text[pos])) {
673+ size_t i = pos + 1 ;
674+ for (; i < m_text.size (); ++i) {
675+ if (!NotLMNZ (m_text[i])) break ;
676+ }
677+ if (i < m_text.size () && IsRN (m_text[i])) {
678+ for (; i < m_text.size (); ++i) {
679+ if (!IsRN (m_text[i])) break ;
680+ }
681+ }
682+ std::u32string_view res = m_text.substr (0 , i);
683+ m_text = m_text.substr (i);
684+ return res;
685+ }
686+
687+ return {};
688+ }
689+
645690 // "(\p{N})"
646691 std::u32string_view Match_General_Pattern_1 () {
647692
@@ -673,6 +718,8 @@ class PreTokenizerWithRegEx {
673718 {R"( (?i:'s|'t|'re|'ve|'m|'ll|'d))" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
674719 {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
675720 {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
721+ {R"( [^\r\n\p{L}\p{N}]?[\p{L}\p{M}]+)" , &PreTokenizerWithRegEx::Match_Qwen35_Pattern_1},
722+ {R"( ?[^\s\p{L}\p{M}\p{N}]+[\r\n]*)" , &PreTokenizerWithRegEx::Match_Qwen35_Pattern_2},
676723 {R"( [^\r\n\p{L}\p{N}]?\p{L}+)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
677724 {R"( 's|'t|'re|'ve|'m|'ll|'d)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
678725 {R"( ?[^\s\p{L}\p{N}]+)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
@@ -835,6 +882,12 @@ class PreTokenizerWithRegEx {
835882 return (category & ufal::unilib::unicode::Z) != 0 ;
836883 }
837884
885+ static bool IsLM (char32_t ch) {
886+ auto category = ufal::unilib::unicode::category (ch);
887+ return ((category & ufal::unilib::unicode::L) != 0 ||
888+ (category & ufal::unilib::unicode::M) != 0 );
889+ }
890+
838891 static bool NotLNZ (char32_t ch) {
839892 // \r\n\t\f\v
840893 if (ch == U' \r ' || ch == U' \n ' || ch == U' \t ' || ch == U' \f ' || ch == U' \v ' ) return false ;
@@ -845,6 +898,17 @@ class PreTokenizerWithRegEx {
845898 return true ;
846899 }
847900
901+ static bool NotLMNZ (char32_t ch) {
902+ // \r\n\t\f\v
903+ if (ch == U' \r ' || ch == U' \n ' || ch == U' \t ' || ch == U' \f ' || ch == U' \v ' ) return false ;
904+ auto category = ufal::unilib::unicode::category (ch);
905+ if (category & ufal::unilib::unicode::L) return false ;
906+ if (category & ufal::unilib::unicode::M) return false ;
907+ if (category & ufal::unilib::unicode::N) return false ;
908+ if (category & ufal::unilib::unicode::Z) return false ;
909+ return true ;
910+ }
911+
848912 private:
849913 std::u32string_view m_text;
850914 char32_t m_last_char = 0 ;
0 commit comments