Bump version, sync codebase

hauntsaninja · hauntsaninja · commit 095924e02c85 · 2023-05-07T13:24:03.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.4.0]
+- Add `decode_batch` and `decode_bytes_batch`
+- Improve error messages and handling
+
 ## [v0.3.3]
 - `tiktoken` will now make a best effort attempt to replace surrogate pairs with the corresponding
    Unicode character and will replace lone surrogates with the Unicode replacement character.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.3"
+version = "0.4.0"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.3.3"
+version = "0.4.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/scripts/redact.py b/scripts/redact.py
@@ -20,7 +20,7 @@ def redact_file(path: Path, dry_run: bool) -> None:
         return
 
     pattern = "|".join(
-        re.escape(x)
+        r" *" + re.escape(x)
         for x in [
             "# ===== redact-beg =====\n",
             "# ===== redact-end =====\n",
diff --git a/tiktoken/core.py b/tiktoken/core.py
@@ -276,6 +276,19 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
         """
         return [self.decode_single_token_bytes(token) for token in tokens]
 
+    def decode_batch(
+        self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
+    ) -> list[str]:
+        """Decodes a batch (list of lists of tokens) into a list of strings."""
+        decoder = functools.partial(self.decode, errors=errors)
+        with ThreadPoolExecutor(num_threads) as e:
+            return list(e.map(decoder, batch))
+
+    def decode_bytes_batch(self, batch: list[list[int]], *, num_threads: int = 8) -> list[bytes]:
+        """Decodes a batch (list of lists of tokens) into a list of bytes."""
+        with ThreadPoolExecutor(num_threads) as e:
+            return list(e.map(self.decode_bytes, batch))
+
     # ====================
     # Miscellaneous
     # ====================
@@ -327,6 +340,7 @@ def _encode_bytes(self, text: bytes) -> list[int]:
         return self._core_bpe._encode_bytes(text)
 
 
+
 @functools.lru_cache(maxsize=128)
 def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
     inner = "|".join(regex.escape(token) for token in tokens)
diff --git a/tiktoken/load.py b/tiktoken/load.py
@@ -14,10 +14,10 @@ def read_file(blobpath: str) -> bytes:
     if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
         try:
             import blobfile
-        except ImportError:
+        except ImportError as e:
             raise ImportError(
                 "blobfile is not installed. Please install it by running `pip install blobfile`."
-            )
+            ) from e
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
     # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
@@ -102,10 +102,10 @@ def decode_data_gym(value: str) -> bytes:
 def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
     try:
         import blobfile
-    except ImportError:
+    except ImportError as e:
         raise ImportError(
             "blobfile is not installed. Please install it by running `pip install blobfile`."
-        )
+        ) from e
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
             f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
diff --git a/tiktoken/model.py b/tiktoken/model.py
@@ -69,7 +69,7 @@ def encoding_for_model(model_name: str) -> Encoding:
     if encoding_name is None:
         raise KeyError(
             f"Could not automatically map {model_name} to a tokeniser. "
-            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
     return get_encoding(encoding_name)