Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions data_juicer/ops/filter/token_num_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = "token_num_filter"

Expand All @@ -19,6 +18,8 @@ class TokenNumFilter(Filter):
the token count is not already computed, it will be calculated using the specified
tokenizer."""

_batched_op = True

def __init__(
self,
hf_tokenizer: str = "EleutherAI/pythia-6.9b-deduped",
Expand Down Expand Up @@ -48,15 +49,28 @@ def __init__(
model_type="huggingface", pretrained_model_name_or_path=hf_tokenizer, return_model=False
)

def compute_stats_single(self, sample):
# check if it's computed already
if StatsKeys.num_token in sample[Fields.stats]:
return sample
def compute_stats_batched(self, samples, *args, **kwargs):
    """Compute token counts for a batch of samples.

    Samples whose stats already contain ``num_token`` are left untouched;
    the remaining texts are tokenized together in one batched tokenizer
    call, which is why the tokenizer model is only loaded when at least
    one sample still needs a count.

    :param samples: batched samples dict holding the text column and the
        per-sample stats dicts under ``Fields.stats``.
    :return: the same ``samples`` dict with ``num_token`` filled in for
        every sample's stats.
    """
    batch_texts = samples[self.text_key]
    batch_stats = samples[Fields.stats]

    # Gather (position, text) pairs that still lack a token count.
    pending = [
        (pos, batch_texts[pos])
        for pos, stat in enumerate(batch_stats)
        if StatsKeys.num_token not in stat
    ]

    if pending:
        tokenizer = get_model(self.model_key)
        pending_texts = [text for _, text in pending]
        # Batched tokenization; special tokens excluded so the count
        # reflects the raw text only.
        encoded = tokenizer(pending_texts, add_special_tokens=False)
        for out_idx, (pos, _) in enumerate(pending):
            batch_stats[pos][StatsKeys.num_token] = len(encoded["input_ids"][out_idx])

    return samples

def process_batched(self, samples):
    """Decide, per sample, whether its token count lies within bounds.

    :param samples: batched samples dict whose ``Fields.stats`` entries
        already contain ``num_token`` (filled by ``compute_stats_batched``).
    :return: list of booleans, one per sample, True to keep the sample.
    """
    decisions = []
    for stat in samples[Fields.stats]:
        keep = self.get_keep_boolean(stat[StatsKeys.num_token], self.min_num, self.max_num)
        decisions.append(keep)
    return decisions