Skip to content

Commit 095924e

Browse files
committed
Bump version, sync codebase
1 parent f19feec commit 095924e

File tree

7 files changed

+26
-8
lines changed

7 files changed

+26
-8
lines changed

CHANGELOG.md

+4
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,10 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.4.0]
6+
- Add `decode_batch` and `decode_bytes_batch`
7+
- Improve error messages and handling
8+
59
## [v0.3.3]
610
- `tiktoken` will now make a best effort attempt to replace surrogate pairs with the corresponding
711
Unicode character and will replace lone surrogates with the Unicode replacement character.

Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.3.3"
3+
version = "0.4.0"
44
edition = "2021"
55
rust-version = "1.57.0"
66

pyproject.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.3.3"
3+
version = "0.4.0"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

scripts/redact.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ def redact_file(path: Path, dry_run: bool) -> None:
2020
return
2121

2222
pattern = "|".join(
23-
re.escape(x)
23+
r" *" + re.escape(x)
2424
for x in [
2525
"# ===== redact-beg =====\n",
2626
"# ===== redact-end =====\n",

tiktoken/core.py

+14
Original file line number | Diff line number | Diff line change
@@ -276,6 +276,19 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
276276
"""
277277
return [self.decode_single_token_bytes(token) for token in tokens]
278278

279+
def decode_batch(
280+
self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
281+
) -> list[str]:
282+
"""Decodes a batch (list of lists of tokens) into a list of strings."""
283+
decoder = functools.partial(self.decode, errors=errors)
284+
with ThreadPoolExecutor(num_threads) as e:
285+
return list(e.map(decoder, batch))
286+
287+
def decode_bytes_batch(self, batch: list[list[int]], *, num_threads: int = 8) -> list[bytes]:
288+
"""Decodes a batch (list of lists of tokens) into a list of bytes."""
289+
with ThreadPoolExecutor(num_threads) as e:
290+
return list(e.map(self.decode_bytes, batch))
291+
279292
# ====================
280293
# Miscellaneous
281294
# ====================
@@ -327,6 +340,7 @@ def _encode_bytes(self, text: bytes) -> list[int]:
327340
return self._core_bpe._encode_bytes(text)
328341

329342

343+
330344
@functools.lru_cache(maxsize=128)
331345
def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
332346
inner = "|".join(regex.escape(token) for token in tokens)

tiktoken/load.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,10 @@ def read_file(blobpath: str) -> bytes:
1414
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
1515
try:
1616
import blobfile
17-
except ImportError:
17+
except ImportError as e:
1818
raise ImportError(
1919
"blobfile is not installed. Please install it by running `pip install blobfile`."
20-
)
20+
) from e
2121
with blobfile.BlobFile(blobpath, "rb") as f:
2222
return f.read()
2323
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
@@ -102,10 +102,10 @@ def decode_data_gym(value: str) -> bytes:
102102
def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
103103
try:
104104
import blobfile
105-
except ImportError:
105+
except ImportError as e:
106106
raise ImportError(
107107
"blobfile is not installed. Please install it by running `pip install blobfile`."
108-
)
108+
) from e
109109
with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
110110
for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
111111
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")

tiktoken/model.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@ def encoding_for_model(model_name: str) -> Encoding:
6969
if encoding_name is None:
7070
raise KeyError(
7171
f"Could not automatically map {model_name} to a tokeniser. "
72-
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
72+
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
7373
) from None
7474

7575
return get_encoding(encoding_name)

0 commit comments

Comments
 (0)