Skip to content

Commit 39f29ce

Browse files
committed
Sync codebase
1 parent 52fceb8 commit 39f29ce

File tree

5 files changed

+27
-11
lines changed

5 files changed

+27
-11
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.5.1]
6+
- Add `encoding_name_for_model`; undo some renames of variables that are implementation details
7+
58
## [v0.5.0]
69
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
710
- Ensure `encoding_for_model` knows about several new models

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.5.0"
3+
version = "0.5.1"
44
edition = "2021"
55
rust-version = "1.57.0"
66

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.5.0"
3+
version = "0.5.1"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

tiktoken/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
# This is the public API of tiktoken
12
from .core import Encoding as Encoding
23
from .model import encoding_for_model as encoding_for_model
4+
from .model import encoding_name_for_model as encoding_name_for_model
35
from .registry import get_encoding as get_encoding
46
from .registry import list_encoding_names as list_encoding_names

tiktoken/model.py

+20-9
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from .registry import get_encoding
55

66
# TODO: these will likely be replaced by an API endpoint
7-
_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
7+
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
88
# chat
99
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
1010
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
@@ -16,7 +16,7 @@
1616
"ft:babbage-002": "cl100k_base",
1717
}
1818

19-
_MODEL_TO_ENCODING: dict[str, str] = {
19+
MODEL_TO_ENCODING: dict[str, str] = {
2020
# chat
2121
"gpt-4": "cl100k_base",
2222
"gpt-3.5-turbo": "cl100k_base",
@@ -64,23 +64,34 @@
6464
}
6565

6666

67-
def encoding_for_model(model_name: str) -> Encoding:
68-
"""Returns the encoding used by a model."""
67+
def encoding_name_for_model(model_name: str) -> str:
68+
"""Returns the name of the encoding used by a model.
69+
70+
Raises a KeyError if the model name is not recognised.
71+
"""
6972
encoding_name = None
70-
if model_name in _MODEL_TO_ENCODING:
71-
encoding_name = _MODEL_TO_ENCODING[model_name]
73+
if model_name in MODEL_TO_ENCODING:
74+
encoding_name = MODEL_TO_ENCODING[model_name]
7275
else:
7376
# Check if the model matches a known prefix
7477
# Prefix matching avoids needing library updates for every model version release
7578
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
76-
for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
79+
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
7780
if model_name.startswith(model_prefix):
78-
return get_encoding(model_encoding_name)
81+
return model_encoding_name
7982

8083
if encoding_name is None:
8184
raise KeyError(
8285
f"Could not automatically map {model_name} to a tokeniser. "
8386
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
8487
) from None
8588

86-
return get_encoding(encoding_name)
89+
return encoding_name
90+
91+
92+
def encoding_for_model(model_name: str) -> Encoding:
93+
"""Returns the encoding used by a model.
94+
95+
Raises a KeyError if the model name is not recognised.
96+
"""
97+
return get_encoding(encoding_name_for_model(model_name))

0 commit comments

Comments
 (0)