Sync codebase

hauntsaninja · hauntsaninja · commit 39f29cecdb6f · 2023-09-12T17:40:01.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.5.1]
+- Add `encoding_name_for_model`; undo some renames of variables that are implementation details
+
 ## [v0.5.0]
 - Add `tiktoken._educational` submodule to better document how byte pair encoding works
 - Ensure `encoding_for_model` knows about several new models
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py
@@ -1,4 +1,6 @@
+# This is the public API of tiktoken
 from .core import Encoding as Encoding
 from .model import encoding_for_model as encoding_for_model
+from .model import encoding_name_for_model as encoding_name_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
diff --git a/tiktoken/model.py b/tiktoken/model.py
@@ -4,7 +4,7 @@
 from .registry import get_encoding
 
 # TODO: these will likely be replaced by an API endpoint
-_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
@@ -16,7 +16,7 @@
     "ft:babbage-002": "cl100k_base",
 }
 
-_MODEL_TO_ENCODING: dict[str, str] = {
+MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
@@ -64,23 +64,34 @@
 }
 
 
-def encoding_for_model(model_name: str) -> Encoding:
-    """Returns the encoding used by a model."""
+def encoding_name_for_model(model_name: str) -> str:
+    """Returns the name of the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
     encoding_name = None
-    if model_name in _MODEL_TO_ENCODING:
-        encoding_name = _MODEL_TO_ENCODING[model_name]
+    if model_name in MODEL_TO_ENCODING:
+        encoding_name = MODEL_TO_ENCODING[model_name]
     else:
         # Check if the model matches a known prefix
         # Prefix matching avoids needing library updates for every model version release
         # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
-        for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
+        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
             if model_name.startswith(model_prefix):
-                return get_encoding(model_encoding_name)
+                return model_encoding_name
 
     if encoding_name is None:
         raise KeyError(
             f"Could not automatically map {model_name} to a tokeniser. "
             "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
-    return get_encoding(encoding_name)
+    return encoding_name
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    """Returns the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
+    return get_encoding(encoding_name_for_model(model_name))