Skip to content

Commit def176a

Browse files
committed
Invalidate cache of vocab by checksum
1 parent b6348d7 commit def176a

File tree

4 files changed

+110
-9
lines changed

4 files changed

+110
-9
lines changed

src/EncoderProvider.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,27 @@ final class EncoderProvider implements ResetInterface
2222
/**
 * Built-in encodings, keyed by encoding name.
 *
 * Each entry holds:
 *  - `vocab`: URL of the `.tiktoken` vocabulary file;
 *  - `hash`:  64-character hex checksum for that vocabulary file
 *             (NOTE(review): presumably the SHA-256 digest handed to the vocab
 *             loader for cache validation; confirm at the call site);
 *  - `pat`:   PCRE pattern (with the `u` modifier) belonging to the encoding.
 *
 * `p50k_base` and `p50k_edit` reference the same vocabulary URL and therefore
 * carry the same hash.
 */
public const ENCODINGS = [
2323
'r50k_base' => [
2424
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken',
25+
'hash' => '306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930',
2526
'pat' => '/\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/u',
2627
],
2728
'p50k_base' => [
2829
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken',
30+
'hash' => '94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069',
2931
'pat' => '/\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/u',
3032
],
3133
'p50k_edit' => [
3234
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken',
35+
'hash' => '94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069',
3336
'pat' => '/\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/u',
3437
],
3538
'cl100k_base' => [
3639
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken',
40+
'hash' => '223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7',
3741
'pat' => '/(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/u',
3842
],
3943
'o200k_base' => [
4044
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken',
45+
'hash' => '446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d',
4146
'pat' => '/[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n\/]*|\s*[\r\n]+|\s+(?!\S)|\s+/u',
4247
],
4348
];

src/Vocab/Loader/DefaultVocabLoader.php

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@
1212
use function fclose;
1313
use function file_exists;
1414
use function fopen;
15+
use function hash_equals;
16+
use function hash_file;
1517
use function is_dir;
1618
use function is_writable;
1719
use function mkdir;
18-
use function preg_match;
1920
use function sha1;
2021
use function sprintf;
2122
use function stream_copy_to_stream;
@@ -28,16 +29,12 @@ public function __construct(private string|null $cacheDir = null)
2829
{
2930
}
3031

31-
public function load(string $uri): Vocab
32+
public function load(string $uri, string|null $checksum = null): Vocab
3233
{
33-
if ($this->cacheDir !== null && preg_match('@^https?://@i', $uri)) {
34-
$cacheFile = $this->cacheDir . DIRECTORY_SEPARATOR . sha1($uri);
35-
} else {
36-
$cacheFile = null;
37-
}
34+
$cacheFile = $this->cacheDir !== null ? $this->cacheDir . DIRECTORY_SEPARATOR . sha1($uri) : null;
3835

3936
if ($cacheFile !== null) {
40-
if (file_exists($cacheFile)) {
37+
if (file_exists($cacheFile) && $this->checkHash($cacheFile, $checksum)) {
4138
return Vocab::fromFile($cacheFile);
4239
}
4340

@@ -83,4 +80,19 @@ public function load(string $uri): Vocab
8380
fclose($stream);
8481
}
8582
}
83+
84+
/**
 * Checks a file against an expected SHA-256 checksum.
 *
 * A null checksum disables verification, so the file is accepted as-is.
 * Letter case of the expected checksum is ignored: hash_file() always
 * returns lower-case hex, while published digests are often upper case,
 * and a case-only difference must not invalidate a good cache entry.
 *
 * @param string      $filename     Path of the file to verify.
 * @param string|null $expectedHash Expected SHA-256 digest as a hex string,
 *                                  or null to skip the check.
 *
 * @return bool True when no checksum was given or the digests match; false
 *              when the file cannot be hashed or the digests differ.
 */
private function checkHash(string $filename, string|null $expectedHash): bool
{
    if ($expectedHash === null) {
        return true;
    }

    $actualHash = hash_file('sha256', $filename);

    if ($actualHash === false) {
        return false;
    }

    // hash_equals() takes the known (trusted) value first and the value under test second.
    return hash_equals(\strtolower($expectedHash), $actualHash);
}
8698
}

src/Vocab/VocabLoader.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@
77
/** Contract for objects that build a Vocab from a URI. */
interface VocabLoader
88
{
99
/**
 * Loads the vocabulary found at the given URI.
 *
 * @param non-empty-string $uri      Location of the vocabulary.
 * @param string|null      $checksum Optional expected checksum of the vocabulary;
 *                                   defaults to null. NOTE(review): how it is
 *                                   enforced is left to the implementation;
 *                                   DefaultVocabLoader compares it with the
 *                                   SHA-256 of its cached copy and skips the
 *                                   check when it is null.
 */
10-
public function load(string $uri): Vocab;
10+
public function load(string $uri, string|null $checksum = null): Vocab;
1111
}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Yethee\Tiktoken\Tests\Vocab\Loader;
6+
7+
use PHPUnit\Framework\TestCase;
8+
use RecursiveDirectoryIterator;
9+
use RecursiveIteratorIterator;
10+
use Yethee\Tiktoken\Vocab\Loader\DefaultVocabLoader;
11+
12+
use function copy;
13+
use function dirname;
14+
use function file_put_contents;
15+
use function hash;
16+
use function rmdir;
17+
use function sys_get_temp_dir;
18+
use function unlink;
19+
20+
/**
 * Exercises the on-disk cache of DefaultVocabLoader, including
 * checksum-based invalidation of stale cache entries.
 */
final class DefaultVocabLoaderTest extends TestCase
{
    /** Scratch directory handed to the loader as its cache; recreated for every test. */
    private string $cacheDir;

    /** A cache entry whose checksum matches is used to build the vocab. */
    public function testLoadFromCache(): void
    {
        $loader = new DefaultVocabLoader($this->cacheDir);

        $vocabUrl = 'http://localhost/cl100k_base.tiktoken';
        // The loader names cache entries after the SHA-1 of the URI.
        $cacheFile = $this->cacheDir . '/' . hash('sha1', $vocabUrl);

        copy(dirname(__DIR__, 2) . '/Fixtures/cl100k_base.tiktoken', $cacheFile);
        self::assertFileEquals(dirname(__DIR__, 2) . '/Fixtures/cl100k_base.tiktoken', $cacheFile);

        $vocab = $loader->load($vocabUrl, '223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7');

        self::assertSame(100256, $vocab->count());
    }

    /** A cache entry whose checksum differs is replaced by the content behind the URI. */
    public function testInvalidateCacheWhenChecksumMismatch(): void
    {
        $loader = new DefaultVocabLoader($this->cacheDir);

        $vocabUrl = dirname(__DIR__, 2) . '/Fixtures/p50k_base.tiktoken';
        $cacheFile = $this->cacheDir . '/' . hash('sha1', $vocabUrl);

        // Seed the cache with content that cannot match the checksum below.
        file_put_contents($cacheFile, 'outdated content');
        self::assertFileExists($cacheFile);

        $vocab = $loader->load($vocabUrl, '94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069');

        self::assertSame(50280, $vocab->count());

        self::assertFileExists($cacheFile);
        self::assertFileEquals($vocabUrl, $cacheFile);
    }

    protected function setUp(): void
    {
        $this->cacheDir = sys_get_temp_dir() . '/tiktoken-test';

        // Start from an empty directory: drop leftovers of an aborted run, then
        // create it, because the tests write cache files before the loader runs.
        self::removeDir($this->cacheDir);

        if (! \mkdir($this->cacheDir, 0777, true) && ! \is_dir($this->cacheDir)) {
            self::fail('Unable to create cache directory: ' . $this->cacheDir);
        }
    }

    protected function tearDown(): void
    {
        self::removeDir($this->cacheDir);
    }

    /**
     * Recursively deletes a directory and everything below it.
     *
     * Does nothing when the path is not an existing directory, so it is safe
     * to call before the directory has ever been created.
     */
    private static function removeDir(string $path): void
    {
        // RecursiveDirectoryIterator throws when the directory is missing.
        if (! \is_dir($path)) {
            return;
        }

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($path, RecursiveDirectoryIterator::SKIP_DOTS),
            // Children first, so every directory is already empty when it is removed.
            RecursiveIteratorIterator::CHILD_FIRST,
        );

        foreach ($iterator as $entry) {
            if ($entry->isFile()) {
                unlink($entry->getPathname());
            } else {
                rmdir($entry->getPathname());
            }
        }

        // The iterator only yields the contents; remove the root itself as well.
        rmdir($path);
    }
}

0 commit comments

Comments
 (0)