Skip to content

Commit 3b47d82

Browse files
committed
feat(mapper): add custom tokenizer support to RemoveRepeatSentencesMapper
The built-in regex sentence splitter treats every period followed by a non-quote character as a sentence boundary, which incorrectly splits text containing decimal numbers (e.g. "2.5 kg"), abbreviations, and version numbers. When these fragments are independently deduplicated, the resulting text is corrupted.

Add a `tokenizer` parameter that accepts a custom sentence tokenizer to override the default regex splitter. The tokenizer can be:

- A Python callable (for API usage), e.g. `nltk.sent_tokenize`
- A lambda string (for YAML configs), e.g. `"lambda text: __import__('nltk').sent_tokenize(text)"`
- None (default) to preserve existing behavior

Lambda strings are validated using `ast.parse`, following the same pattern as `PythonLambdaMapper`.

Made-with: Cursor
1 parent fa4d7b2 commit 3b47d82

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

data_juicer/ops/mapper/remove_repeat_sentences_mapper.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import ast
2+
from collections.abc import Callable
3+
14
import regex as re
25

36
from ..base_op import OPERATORS, Mapper
@@ -11,6 +14,21 @@ def split_sentence(text):
1114
return text.split("\n")
1215

1316

17+
def _wrap_tokenizer(fn: Callable[[str], list[str]]) -> Callable[[str], list[str]]:
18+
"""Wrap a tokenizer to match split_sentence's whitespace convention.
19+
20+
split_sentence preserves leading whitespace on each fragment (e.g. ["Hello.", " Goodbye."]), so the downstream
21+
``new_sent += sentence`` concatenation produces correct spacing. Custom tokenizers will typically return clean
22+
tokens without leading whitespace, so this wrapper prepends a space to every sentence after the first.
23+
"""
24+
25+
def wrapped(line: str) -> list[str]:
26+
sentences = fn(line)
27+
return [(" " + s if i > 0 else s) for i, s in enumerate(sentences)]
28+
29+
return wrapped
30+
31+
1432
@OPERATORS.register_module("remove_repeat_sentences_mapper")
1533
class RemoveRepeatSentencesMapper(Mapper):
1634
"""Mapper to remove repeat sentences in text samples.
@@ -30,6 +48,7 @@ def __init__(
3048
lowercase: bool = False,
3149
ignore_special_character: bool = True,
3250
min_repeat_sentence_length: int = 2,
51+
tokenizer: Callable[[str], list[str]] | str | None = None,
3352
*args,
3453
**kwargs,
3554
):
@@ -45,6 +64,11 @@ def __init__(
4564
length will not be deduplicated. If ignore_special_character is
4665
set to True, then special characters are not included in this
4766
length.
67+
:param tokenizer: Custom sentence tokenizer. Can be a callable
68+
that takes a string and returns a list of sentence strings,
69+
or a lambda string for YAML configs (e.g.
70+
``"lambda text: __import__('nltk').sent_tokenize(text)"``).
71+
If None, uses the built-in regex-based splitter.
4872
:param args: extra args
4973
:param kwargs: extra args
5074
"""
@@ -54,6 +78,29 @@ def __init__(
5478
self.min_repeat_sentence_length = min_repeat_sentence_length
5579
self.remove_regex = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]") if ignore_special_character else None
5680

81+
if tokenizer is None:
82+
self._tokenize = split_sentence
83+
elif callable(tokenizer):
84+
self._tokenize = _wrap_tokenizer(tokenizer)
85+
elif isinstance(tokenizer, str):
86+
self._tokenize = _wrap_tokenizer(self._create_tokenizer(tokenizer))
87+
else:
88+
raise ValueError(f"tokenizer must be None, a callable, or a lambda string, " f"got {type(tokenizer)}")
89+
90+
@staticmethod
91+
def _create_tokenizer(tokenizer_str: str) -> Callable[[str], list[str]]:
92+
"""Parse and validate a tokenizer lambda string."""
93+
try:
94+
node = ast.parse(tokenizer_str, mode="eval")
95+
if not isinstance(node.body, ast.Lambda):
96+
raise ValueError("Input string must be a valid lambda function.")
97+
if len(node.body.args.args) != 1:
98+
raise ValueError("Lambda function must have exactly one argument.")
99+
compiled_code = compile(node, "<string>", "eval")
100+
return eval(compiled_code, {"__builtins__": __builtins__})
101+
except Exception as e:
102+
raise ValueError(f"Invalid tokenizer lambda: {e}")
103+
57104
def process_batched(self, samples):
58105
for idx, text in enumerate(samples[self.text_key]):
59106
lines = [e for e in text.split("\n")]
@@ -62,7 +109,7 @@ def process_batched(self, samples):
62109
for line in lines:
63110
new_sent = ""
64111
if line:
65-
sentences = split_sentence(line)
112+
sentences = self._tokenize(line)
66113
for sentence in sentences:
67114
copy = sentence.strip()
68115
if self.lowercase:

tests/ops/mapper/test_remove_repeat_sentences_mapper.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,47 @@ def test_text2(self):
6767
min_repeat_sentence_length=5)
6868
self._run_helper(samples, op)
6969

70+
def test_custom_tokenizer_callable(self):
71+
72+
from nltk.tokenize import sent_tokenize
73+
74+
samples = [{
75+
'text':
76+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? The quick brown fox jumps over the lazy dog. Speaking of weather, today is delightful.',
77+
'target':
78+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? Speaking of weather, today is delightful.'
79+
}]
80+
81+
op = RemoveRepeatSentencesMapper(tokenizer=sent_tokenize)
82+
self._run_helper(samples, op)
83+
84+
def test_custom_tokenizer_lambda_str(self):
85+
86+
samples = [{
87+
'text':
88+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? The quick brown fox jumps over the lazy dog. Speaking of weather, today is delightful.',
89+
'target':
90+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? Speaking of weather, today is delightful.'
91+
}]
92+
93+
op = RemoveRepeatSentencesMapper(
94+
tokenizer="lambda text: __import__('nltk').sent_tokenize(text)")
95+
self._run_helper(samples, op)
96+
97+
def test_custom_tokenizer_preserves_decimals(self):
98+
99+
from nltk.tokenize import sent_tokenize
100+
101+
samples = [{
102+
'text':
103+
'The package weighs 2.5 kg and ships tomorrow. Delivery takes 3 days. The package weighs 2.5 kg and ships tomorrow. Please confirm the order.',
104+
'target':
105+
'The package weighs 2.5 kg and ships tomorrow. Delivery takes 3 days. Please confirm the order.'
106+
}]
107+
108+
op = RemoveRepeatSentencesMapper(tokenizer=sent_tokenize)
109+
self._run_helper(samples, op)
110+
70111

71112
if __name__ == '__main__':
72113
unittest.main()

0 commit comments

Comments (0)