@@ -2,24 +2,23 @@ use std::cmp::Reverse;
2
2
use std:: collections:: BinaryHeap ;
3
3
use std:: hash:: { Hash , Hasher } ;
4
4
use std:: ops:: Range ;
5
+ use std:: sync:: LazyLock ;
5
6
6
7
use aneubeck_daachorse:: { DoubleArrayAhoCorasick , DoubleArrayAhoCorasickBuilder } ;
7
8
use fnv:: { FnvHashMap , FnvHasher } ;
8
9
use itertools:: Itertools ;
9
- use once_cell:: sync:: Lazy ;
10
10
use serde:: de:: Visitor ;
11
11
use serde:: { Deserialize , Deserializer , Serialize , Serializer } ;
12
- use tiktoken_rs:: CoreBPE ;
13
12
14
13
use crate :: backtrack_encoder:: BacktrackEncoder ;
15
14
use crate :: bitfield:: BitField ;
16
15
17
- static BPE_CL100K : Lazy < BytePairEncoding > = Lazy :: new ( || {
16
+ static BPE_CL100K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
18
17
let bytes = include_bytes ! ( "data/bpe_cl100k.dict" ) ;
19
18
rmp_serde:: from_slice ( bytes) . expect ( "" )
20
19
} ) ;
21
20
22
- static BPE_O200K : Lazy < BytePairEncoding > = Lazy :: new ( || {
21
+ static BPE_O200K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
23
22
let bytes = include_bytes ! ( "data/bpe_o200k.dict" ) ;
24
23
rmp_serde:: from_slice ( bytes) . expect ( "" )
25
24
} ) ;
@@ -194,7 +193,8 @@ impl BytePairEncoding {
194
193
}
195
194
196
195
/// Construct a BytePairEncoding instance from a tiktoken dictionary.
197
- pub fn from_tiktoken ( tiktoken_bpe : & CoreBPE , num_tokens : usize ) -> Self {
196
+ #[ cfg( feature = "tiktoken-rs" ) ]
197
+ pub fn from_tiktoken ( tiktoken_bpe : & tiktoken_rs:: CoreBPE , num_tokens : usize ) -> Self {
198
198
Self :: from_dictionary ( ( 0 ..num_tokens) . map ( |i| tiktoken_bpe. _decode_native ( & [ i] ) ) )
199
199
}
200
200
@@ -492,6 +492,7 @@ impl BytePairEncoding {
492
492
}
493
493
}
494
494
495
+ #[ cfg( feature = "rand" ) ]
495
496
pub fn create_test_bytes ( bpe : & BytePairEncoding , tokens : usize ) -> Vec < u8 > {
496
497
use rand:: { thread_rng, Rng } ;
497
498
let mut text = vec ! [ ] ;
@@ -576,7 +577,7 @@ mod data {
576
577
#[ test]
577
578
#[ ignore = "run manually to find a suitable hash factor" ]
578
579
fn find_hash_factor ( ) {
579
- let bpes: & mut [ ( CoreBPE , usize ) ] = & mut [
580
+ let bpes = & mut [
580
581
( cl100k_base ( ) . unwrap ( ) , BPE_CL100K_LEN ) ,
581
582
( o200k_base ( ) . unwrap ( ) , BPE_O200K_LEN ) ,
582
583
] ;
@@ -609,7 +610,7 @@ mod data {
609
610
}
610
611
611
612
#[ track_caller]
612
- fn serialize_tokens ( dict : & CoreBPE , num_tokens : usize , name : & str ) {
613
+ fn serialize_tokens ( dict : & tiktoken_rs :: CoreBPE , num_tokens : usize , name : & str ) {
613
614
let path = PathBuf :: from ( file ! ( ) ) ;
614
615
let dir = path. parent ( ) . unwrap ( ) ;
615
616
let data_file = dir. join ( format ! ( "data/bpe_{name}.dict" ) ) ;
0 commit comments