Skip to content

Commit 14c010c

Browse files
author
Thomas Proisl
committed
Move lists of special single tokens to separate files
1 parent a247cc6 commit 14c010c

File tree

5 files changed

+36
-15
lines changed

5 files changed

+36
-15
lines changed

src/somajo/data/single_token_abbreviations_de.txt

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,3 @@ T.V.
4040
Uni-Kl.
4141
USt-IdNr.
4242
Zeitschr.titel
43-
44-
# These should be moved to another file:
45-
.Net
46-
/rant
47-
/s
48-
E/E
49-
tl;dr
50-
zl;ng
51-
52-
# SAP Versions
53-
S/4
54-
R/3

src/somajo/data/single_token_abbreviations_en.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,3 @@ a.m.
99
p.m.
1010
P.S.
1111
T.V.
12-
13-
# These should be moved to another file:
14-
tl;dr

src/somajo/data/single_tokens_de.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# A list of tokens that should not be split.
2+
#
3+
# Lines starting with “#” are treated as comments and will be ignored.
4+
5+
.Net
6+
/rant
7+
/s
8+
E/E
9+
tl;dr
10+
zl;ng
11+
12+
# SAP Versions
13+
S/4
14+
R/3
15+
16+
# mobile telephony
17+
3G
18+
4G
19+
5G

src/somajo/data/single_tokens_en.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# A list of tokens that should not be split.
2+
#
3+
# Lines starting with “#” are treated as comments and will be ignored.
4+
5+
tl;dr
6+
7+
# mobile telephony
8+
3G
9+
4G
10+
5G

src/somajo/tokenizer.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
162162
\#x[0-9a-f]+ # hexadecimal entities
163163
);""", re.VERBOSE | re.IGNORECASE)
164164

165+
# high priority single tokens
166+
single_token_list = utils.read_abbreviation_file(f"single_tokens_{self.language[:2]}.txt")
167+
self.single_tokens = re.compile(r"(?<![\w.])(?:" + r'|'.join([re.escape(_) for _ in single_token_list]) + r')(?!\p{L})', re.IGNORECASE)
168+
165169
# EMOTICONS
166170
emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
167171
"(-:", ")=", ")o:", ")x", ":'C", ":/", ":<",
@@ -698,6 +702,9 @@ def _tokenize(self, token_dll):
698702
# XML entities
699703
self._split_all_matches(self.entity, token_dll, "XML_entity")
700704

705+
# high priority single tokens
706+
self._split_all_matches(self.single_tokens, token_dll)
707+
701708
# emoticons
702709
self._split_all_matches(self.heart_emoticon, token_dll, "emoticon")
703710
self._split_all_matches(self.emoticon, token_dll, "emoticon")

0 commit comments

Comments
 (0)