Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions data_juicer/ops/filter/token_num_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = "token_num_filter"

Expand All @@ -19,6 +18,8 @@ class TokenNumFilter(Filter):
the token count is not already computed, it will be calculated using the specified
tokenizer."""

_batched_op = True

def __init__(
self,
hf_tokenizer: str = "EleutherAI/pythia-6.9b-deduped",
Expand Down Expand Up @@ -48,15 +49,28 @@ def __init__(
model_type="huggingface", pretrained_model_name_or_path=hf_tokenizer, return_model=False
)

def compute_stats_single(self, sample):
# check if it's computed already
if StatsKeys.num_token in sample[Fields.stats]:
return sample
def compute_stats_batched(self, samples, *args, **kwargs):
    """Compute token counts for a batch of samples.

    Samples whose stats already contain ``num_token`` are left untouched;
    the remaining texts are tokenized together in one batched tokenizer
    call, which is why the tokenizer model is only loaded when at least
    one sample still needs a count.

    :param samples: batched samples dict holding the text column and the
        per-sample stats dicts under ``Fields.stats``.
    :return: the same ``samples`` dict with ``num_token`` filled in for
        every sample's stats.
    """
    batch_texts = samples[self.text_key]
    batch_stats = samples[Fields.stats]

    # Gather (position, text) pairs that still lack a token count.
    pending = [
        (pos, batch_texts[pos])
        for pos, stat in enumerate(batch_stats)
        if StatsKeys.num_token not in stat
    ]

    if pending:
        tokenizer = get_model(self.model_key)
        pending_texts = [text for _, text in pending]
        # Batched tokenization; special tokens excluded so the count
        # reflects the raw text only.
        encoded = tokenizer(pending_texts, add_special_tokens=False)
        for out_idx, (pos, _) in enumerate(pending):
            batch_stats[pos][StatsKeys.num_token] = len(encoded["input_ids"][out_idx])

    return samples

def process_batched(self, samples):
    """Decide, per sample, whether its token count lies within bounds.

    :param samples: batched samples dict whose ``Fields.stats`` entries
        already contain ``num_token`` (filled by ``compute_stats_batched``).
    :return: list of booleans, one per sample, True to keep the sample.
    """
    decisions = []
    for stat in samples[Fields.stats]:
        keep = self.get_keep_boolean(stat[StatsKeys.num_token], self.min_num, self.max_num)
        decisions.append(keep)
    return decisions