Skip to content

Commit 3519bff

Browse files
authored
Merge pull request #42 from github/aneubeck/count_till_limit
fix count_till_limit function
2 parents 6e03fd0 + c0a3cb7 commit 3519bff

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

crates/bpe-openai/src/lib.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,11 @@ impl Tokenizer {
9292
/// Otherwise, it returns none. This function can be faster than [`Self::count`] when the
9393
/// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
9494
pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
95-
self.split(text)
96-
.try_fold(token_limit, |token_limit, piece| {
97-
self.bpe
98-
.count_till_limit(piece.as_bytes(), token_limit)
99-
.map(|piece_count| token_limit - piece_count)
100-
})
95+
self.split(text).try_fold(0, |consumed, piece| {
96+
self.bpe
97+
.count_till_limit(piece.as_bytes(), token_limit - consumed)
98+
.map(|piece_count| consumed + piece_count)
99+
})
101100
}
102101

103102
/// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
@@ -231,4 +230,12 @@ mod tests {
231230
}
232231
}
233232
}
233+
234+
#[test]
235+
fn test_count_till_limit() {
236+
assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
237+
assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
238+
assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
239+
assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
240+
}
234241
}

0 commit comments

Comments
 (0)