Skip to content

Commit f0c9def

Browse files
Merge pull request #18 from github/tiktoken-as-feature
Reduce required crate dependencies
2 parents 3ebe786 + 0c2ec6c commit f0c9def

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

crates/bpe/Cargo.toml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,24 @@ edition = "2021"
77
crate-type = ["lib", "staticlib"]
88
bench = false
99

10+
[[bench]]
11+
name = "counting"
12+
path = "benches/counting.rs"
13+
harness = false
14+
15+
[features]
16+
rand = ["dep:rand"]
17+
tiktoken-rs = ["dep:tiktoken-rs"]
18+
1019
[dependencies]
1120
aneubeck-daachorse = "1.1.1"
1221
fnv = "1.0"
1322
itertools = "0.12"
14-
once_cell = "1"
15-
rand = "0.8"
23+
rand = { version = "0.8", optional = true }
1624
rmp-serde = "1"
1725
serde = { version = "1", features = ["derive"] }
18-
tiktoken-rs = "0.5"
26+
tiktoken-rs = { version = "0.5", optional = true }
1927

2028
[dev-dependencies]
29+
bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
2130
criterion = "0.5"
22-
23-
[[bench]]
24-
name = "counting"
25-
path = "benches/counting.rs"
26-
harness = false

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,23 @@ use std::cmp::Reverse;
22
use std::collections::BinaryHeap;
33
use std::hash::{Hash, Hasher};
44
use std::ops::Range;
5+
use std::sync::LazyLock;
56

67
use aneubeck_daachorse::{DoubleArrayAhoCorasick, DoubleArrayAhoCorasickBuilder};
78
use fnv::{FnvHashMap, FnvHasher};
89
use itertools::Itertools;
9-
use once_cell::sync::Lazy;
1010
use serde::de::Visitor;
1111
use serde::{Deserialize, Deserializer, Serialize, Serializer};
12-
use tiktoken_rs::CoreBPE;
1312

1413
use crate::backtrack_encoder::BacktrackEncoder;
1514
use crate::bitfield::BitField;
1615

17-
static BPE_CL100K: Lazy<BytePairEncoding> = Lazy::new(|| {
16+
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
1817
let bytes = include_bytes!("data/bpe_cl100k.dict");
1918
rmp_serde::from_slice(bytes).expect("")
2019
});
2120

22-
static BPE_O200K: Lazy<BytePairEncoding> = Lazy::new(|| {
21+
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
2322
let bytes = include_bytes!("data/bpe_o200k.dict");
2423
rmp_serde::from_slice(bytes).expect("")
2524
});
@@ -194,7 +193,8 @@ impl BytePairEncoding {
194193
}
195194

196195
/// Construct a BytePairEncoding instance from a tiktoken dictionary.
197-
pub fn from_tiktoken(tiktoken_bpe: &CoreBPE, num_tokens: usize) -> Self {
196+
#[cfg(feature = "tiktoken-rs")]
197+
pub fn from_tiktoken(tiktoken_bpe: &tiktoken_rs::CoreBPE, num_tokens: usize) -> Self {
198198
Self::from_dictionary((0..num_tokens).map(|i| tiktoken_bpe._decode_native(&[i])))
199199
}
200200

@@ -492,6 +492,7 @@ impl BytePairEncoding {
492492
}
493493
}
494494

495+
#[cfg(feature = "rand")]
495496
pub fn create_test_bytes(bpe: &BytePairEncoding, tokens: usize) -> Vec<u8> {
496497
use rand::{thread_rng, Rng};
497498
let mut text = vec![];
@@ -576,7 +577,7 @@ mod data {
576577
#[test]
577578
#[ignore = "run manually to find a suitable hash factor"]
578579
fn find_hash_factor() {
579-
let bpes: &mut [(CoreBPE, usize)] = &mut [
580+
let bpes = &mut [
580581
(cl100k_base().unwrap(), BPE_CL100K_LEN),
581582
(o200k_base().unwrap(), BPE_O200K_LEN),
582583
];
@@ -609,7 +610,7 @@ mod data {
609610
}
610611

611612
#[track_caller]
612-
fn serialize_tokens(dict: &CoreBPE, num_tokens: usize, name: &str) {
613+
fn serialize_tokens(dict: &tiktoken_rs::CoreBPE, num_tokens: usize, name: &str) {
613614
let path = PathBuf::from(file!());
614615
let dir = path.parent().unwrap();
615616
let data_file = dir.join(format!("data/bpe_{name}.dict"));

0 commit comments

Comments
 (0)