@@ -92,12 +92,11 @@ impl Tokenizer {
9292 /// Otherwise, it returns `None`. This function can be faster than [`Self::count`] when the
9393 /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
9494 pub fn count_till_limit ( & self , text : & str , token_limit : usize ) -> Option < usize > {
95- self . split ( text)
96- . try_fold ( token_limit, |token_limit, piece| {
97- self . bpe
98- . count_till_limit ( piece. as_bytes ( ) , token_limit)
99- . map ( |piece_count| token_limit - piece_count)
100- } )
95+ self . split ( text) . try_fold ( 0 , |consumed, piece| {
96+ self . bpe
97+ . count_till_limit ( piece. as_bytes ( ) , token_limit - consumed)
98+ . map ( |piece_count| consumed + piece_count)
99+ } )
101100 }
102101
103102 /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
@@ -231,4 +230,12 @@ mod tests {
231230 }
232231 }
233232 }
233+
234+ #[ test]
235+ fn test_count_till_limit ( ) {
236+ assert_eq ! ( cl100k_base( ) . count_till_limit( "abc" , 3 ) , Some ( 1 ) ) ;
237+ assert_eq ! ( cl100k_base( ) . count_till_limit( "abcabc" , 3 ) , Some ( 2 ) ) ;
238+ assert_eq ! ( cl100k_base( ) . count_till_limit( "abcabcabc" , 3 ) , Some ( 3 ) ) ;
239+ assert_eq ! ( cl100k_base( ) . count_till_limit( "abcabcabcabc" , 3 ) , None ) ;
240+ }
234241}
0 commit comments