Move lists of special single tokens to separate files

Thomas Proisl · Thomas Proisl · commit 14c010c404c6 · 2024-08-05T08:16:48.000+02:00
diff --git a/src/somajo/data/single_token_abbreviations_de.txt b/src/somajo/data/single_token_abbreviations_de.txt
@@ -40,15 +40,3 @@ T.V.
 Uni-Kl.
 USt-IdNr.
 Zeitschr.titel
-
-# These should be moved to another file:
-.Net
-/rant
-/s
-E/E
-tl;dr
-zl;ng
-
-# SAP Versions
-S/4
-R/3
diff --git a/src/somajo/data/single_token_abbreviations_en.txt b/src/somajo/data/single_token_abbreviations_en.txt
@@ -9,6 +9,3 @@ a.m.
 p.m.
 P.S.
 T.V.
-
-# These should be moved to another file:
-tl;dr
diff --git a/src/somajo/data/single_tokens_de.txt b/src/somajo/data/single_tokens_de.txt
@@ -0,0 +1,19 @@
+# A list of tokens that should not be split.
+#
+# Lines starting with “#” are treated as comments and will be ignored.
+
+.Net
+/rant
+/s
+E/E
+tl;dr
+zl;ng
+
+# SAP Versions
+S/4
+R/3
+
+# mobile telephony
+3G
+4G
+5G
diff --git a/src/somajo/data/single_tokens_en.txt b/src/somajo/data/single_tokens_en.txt
@@ -0,0 +1,10 @@
+# A list of tokens that should not be split.
+#
+# Lines starting with “#” are treated as comments and will be ignored.
+
+tl;dr
+
+# mobile telephony
+3G
+4G
+5G
diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py
@@ -162,6 +162,10 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
                                          \#x[0-9a-f]+         # hexadecimal entities
                                       );""", re.VERBOSE | re.IGNORECASE)
 
+        # high priority single tokens
+        single_token_list = utils.read_abbreviation_file(f"single_tokens_{self.language[:2]}.txt")
+        self.single_tokens = re.compile(r"(?<![\w.])(?:" + r'|'.join([re.escape(_) for _ in single_token_list]) + r')(?!\p{L})', re.IGNORECASE)
+
         # EMOTICONS
         emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
                         "(-:", ")=", ")o:", ")x", ":'C", ":/", ":<",
@@ -698,6 +702,9 @@ def _tokenize(self, token_dll):
         # XML entities
         self._split_all_matches(self.entity, token_dll, "XML_entity")
 
+        # high priority single tokens
+        self._split_all_matches(self.single_tokens, token_dll)
+
         # emoticons
         self._split_all_matches(self.heart_emoticon, token_dll, "emoticon")
         self._split_all_matches(self.emoticon, token_dll, "emoticon")

-Original file line number
+Diff line change
 p.m.
 P.S.
 T.V.
+-
 -# These should be moved to another file:
 -tl;dr