Skip to content

Commit 3b47d82

Browse files
committed
feat(mapper): add custom tokenizer support to RemoveRepeatSentencesMapper
The built-in regex sentence splitter treats every period followed by a non-quote character as a sentence boundary, which incorrectly splits text containing decimal numbers (e.g. "2.5 kg"), abbreviations, and version numbers. When these fragments are independently deduplicated, the resulting text is corrupted.

Add a `tokenizer` parameter that accepts a custom sentence tokenizer to override the default regex splitter. The tokenizer can be:

- A Python callable (for API usage), e.g. `nltk.sent_tokenize`
- A lambda string (for YAML configs), e.g. `"lambda text: __import__('nltk').sent_tokenize(text)"`
- None (default) to preserve existing behavior

Lambda strings are validated using `ast.parse`, following the same pattern as `PythonLambdaMapper`.

Made-with: Cursor
1 parent fa4d7b2 commit 3b47d82

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

data_juicer/ops/mapper/remove_repeat_sentences_mapper.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import ast
2+
from collections.abc import Callable
3+
14
import regex as re
25

36
from ..base_op import OPERATORS, Mapper
@@ -11,6 +14,21 @@ def split_sentence(text):
1114
return text.split("\n")
1215

1316

17+
def _wrap_tokenizer(fn: Callable[[str], list[str]]) -> Callable[[str], list[str]]:
18+
"""Wrap a tokenizer to match split_sentence's whitespace convention.
19+
20+
split_sentence preserves leading whitespace on each fragment (e.g. ["Hello.", " Goodbye."]), so the downstream
21+
``new_sent += sentence`` concatenation produces correct spacing. Custom tokenizers will typically return clean
22+
tokens without leading whitespace, so this wrapper prepends a space to every sentence after the first.
23+
"""
24+
25+
def wrapped(line: str) -> list[str]:
26+
sentences = fn(line)
27+
return [(" " + s if i > 0 else s) for i, s in enumerate(sentences)]
28+
29+
return wrapped
30+
31+
1432
@OPERATORS.register_module("remove_repeat_sentences_mapper")
1533
class RemoveRepeatSentencesMapper(Mapper):
1634
"""Mapper to remove repeat sentences in text samples.
@@ -30,6 +48,7 @@ def __init__(
3048
lowercase: bool = False,
3149
ignore_special_character: bool = True,
3250
min_repeat_sentence_length: int = 2,
51+
tokenizer: Callable[[str], list[str]] | str | None = None,
3352
*args,
3453
**kwargs,
3554
):
@@ -45,6 +64,11 @@ def __init__(
4564
length will not be deduplicated. If ignore_special_character is
4665
set to True, then special characters are not included in this
4766
length.
67+
:param tokenizer: Custom sentence tokenizer. Can be a callable
68+
that takes a string and returns a list of sentence strings,
69+
or a lambda string for YAML configs (e.g.
70+
``"lambda text: __import__('nltk').sent_tokenize(text)"``).
71+
If None, uses the built-in regex-based splitter.
4872
:param args: extra args
4973
:param kwargs: extra args
5074
"""
@@ -54,6 +78,29 @@ def __init__(
5478
self.min_repeat_sentence_length = min_repeat_sentence_length
5579
self.remove_regex = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]") if ignore_special_character else None
5680

81+
if tokenizer is None:
82+
self._tokenize = split_sentence
83+
elif callable(tokenizer):
84+
self._tokenize = _wrap_tokenizer(tokenizer)
85+
elif isinstance(tokenizer, str):
86+
self._tokenize = _wrap_tokenizer(self._create_tokenizer(tokenizer))
87+
else:
88+
raise ValueError(f"tokenizer must be None, a callable, or a lambda string, " f"got {type(tokenizer)}")
89+
90+
@staticmethod
91+
def _create_tokenizer(tokenizer_str: str) -> Callable[[str], list[str]]:
92+
"""Parse and validate a tokenizer lambda string."""
93+
try:
94+
node = ast.parse(tokenizer_str, mode="eval")
95+
if not isinstance(node.body, ast.Lambda):
96+
raise ValueError("Input string must be a valid lambda function.")
97+
if len(node.body.args.args) != 1:
98+
raise ValueError("Lambda function must have exactly one argument.")
99+
compiled_code = compile(node, "<string>", "eval")
100+
return eval(compiled_code, {"__builtins__": __builtins__})
101+
except Exception as e:
102+
raise ValueError(f"Invalid tokenizer lambda: {e}")
103+
57104
def process_batched(self, samples):
58105
for idx, text in enumerate(samples[self.text_key]):
59106
lines = [e for e in text.split("\n")]
@@ -62,7 +109,7 @@ def process_batched(self, samples):
62109
for line in lines:
63110
new_sent = ""
64111
if line:
65-
sentences = split_sentence(line)
112+
sentences = self._tokenize(line)
66113
for sentence in sentences:
67114
copy = sentence.strip()
68115
if self.lowercase:

tests/ops/mapper/test_remove_repeat_sentences_mapper.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,47 @@ def test_text2(self):
6767
min_repeat_sentence_length=5)
6868
self._run_helper(samples, op)
6969

70+
def test_custom_tokenizer_callable(self):
71+
72+
from nltk.tokenize import sent_tokenize
73+
74+
samples = [{
75+
'text':
76+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? The quick brown fox jumps over the lazy dog. Speaking of weather, today is delightful.',
77+
'target':
78+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? Speaking of weather, today is delightful.'
79+
}]
80+
81+
op = RemoveRepeatSentencesMapper(tokenizer=sent_tokenize)
82+
self._run_helper(samples, op)
83+
84+
def test_custom_tokenizer_lambda_str(self):
85+
86+
samples = [{
87+
'text':
88+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? The quick brown fox jumps over the lazy dog. Speaking of weather, today is delightful.',
89+
'target':
90+
'The quick brown fox jumps over the lazy dog. Isn\'t it amazing? Speaking of weather, today is delightful.'
91+
}]
92+
93+
op = RemoveRepeatSentencesMapper(
94+
tokenizer="lambda text: __import__('nltk').sent_tokenize(text)")
95+
self._run_helper(samples, op)
96+
97+
def test_custom_tokenizer_preserves_decimals(self):
98+
99+
from nltk.tokenize import sent_tokenize
100+
101+
samples = [{
102+
'text':
103+
'The package weighs 2.5 kg and ships tomorrow. Delivery takes 3 days. The package weighs 2.5 kg and ships tomorrow. Please confirm the order.',
104+
'target':
105+
'The package weighs 2.5 kg and ships tomorrow. Delivery takes 3 days. Please confirm the order.'
106+
}]
107+
108+
op = RemoveRepeatSentencesMapper(tokenizer=sent_tokenize)
109+
self._run_helper(samples, op)
110+
70111

71112
if __name__ == '__main__':
72113
unittest.main()

0 commit comments

Comments (0)