Skip to content

Commit 3519bff

Browse files
authored
Merge pull request #42 from github/aneubeck/count_till_limit
fix count_till_limit function
2 parents 6e03fd0 + c0a3cb7 commit 3519bff

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

crates/bpe-openai/src/lib.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,11 @@ impl Tokenizer {
9292
/// Otherwise, it returns none. This function can be faster than [`Self::count`] when the
9393
/// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
9494
pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
95-
self.split(text)
96-
.try_fold(token_limit, |token_limit, piece| {
97-
self.bpe
98-
.count_till_limit(piece.as_bytes(), token_limit)
99-
.map(|piece_count| token_limit - piece_count)
100-
})
95+
self.split(text).try_fold(0, |consumed, piece| {
96+
self.bpe
97+
.count_till_limit(piece.as_bytes(), token_limit - consumed)
98+
.map(|piece_count| consumed + piece_count)
99+
})
101100
}
102101

103102
/// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
@@ -231,4 +230,12 @@ mod tests {
231230
}
232231
}
233232
}
233+
234+
#[test]
235+
fn test_count_till_limit() {
236+
assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
237+
assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
238+
assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
239+
assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
240+
}
234241
}

0 commit comments

Comments
 (0)