Add Byte Pair Encoding (BPE) class for subword tokenization #3056

Merged: 7 commits, Mar 23, 2025
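
The diffs below wire a new bpe_tokenizer into dlib/tokenizer.h and add a train/serialize/encode/decode round-trip test for it. As orientation, here is a minimal usage sketch assembled only from the calls that test exercises; the class name and header come straight from the diff, while the dlib namespace qualification, the meaning of train()'s arguments, and the <text> markers mentioned in the comments are inferred from the test rather than from any documentation.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <dlib/tokenizer.h>   // after this PR, also pulls in tokenizer/bpe_tokenizer.h

int main()
{
    // Train a tokenizer on a small corpus. The arguments mirror the new test's
    // call test.train(training_text, 300, true); 300 is presumably the target
    // vocabulary size and the bool a verbosity flag (not documented in this diff).
    dlib::bpe_tokenizer tok;
    tok.train("Byte Pair Encoding iteratively merges the most frequent pairs of characters ...", 300, true);

    // Like other dlib objects, the tokenizer round-trips through serialize()/deserialize(),
    // exactly as the new test does before encoding anything.
    std::ostringstream out;
    dlib::serialize(tok, out);

    dlib::bpe_tokenizer restored;
    std::istringstream in(out.str());
    dlib::deserialize(restored, in);

    // encode() maps text to subword ids and decode() maps them back. The test's
    // postprocess_decoded_text() suggests decode() wraps its output in
    // <text>...</text> markers, which callers may want to strip.
    const std::vector<int> ids = restored.encode("This is a test of the tokenization process!");
    std::cout << restored.decode(ids) << "\n";
    return 0;
}
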
12 changes: 6 additions & 6 deletions dlib/test/queue.cpp
@@ -408,17 +408,17 @@ namespace


 dlog << LINFO << "testing sort_1a_c";
-queue_sort_test<queue<int, mm>::sort_1a_c> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1a_c>();
 dlog << LINFO << "testing sort_1a";
-queue_sort_test<queue<int, mm>::sort_1a>();
+queue_sort_test<dlib::queue<int, mm>::sort_1a>();
 dlog << LINFO << "testing sort_1b";
-queue_sort_test<queue<int, mm>::sort_1b> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1b>();
 dlog << LINFO << "testing sort_1b_c";
-queue_sort_test<queue<int, mm>::sort_1b_c>();
+queue_sort_test<dlib::queue<int, mm>::sort_1b_c>();
 dlog << LINFO << "testing sort_1c";
-queue_sort_test<queue<int, mm>::sort_1c> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1c>();
 dlog << LINFO << "testing sort_1c_c";
-queue_sort_test<queue<int, mm>::sort_1c_c>();
+queue_sort_test<dlib::queue<int, mm>::sort_1c_c>();
}
} a;

2 changes: 1 addition & 1 deletion dlib/test/static_set.cpp
@@ -39,7 +39,7 @@ namespace

srand(static_cast<unsigned int>(time(0)));

-typedef queue<int>::kernel_2a_c queue_of_int;
+typedef dlib::queue<int>::kernel_2a_c queue_of_int;
typedef dlib::set<int>::kernel_1a_c set_of_int;

queue_of_int q, qb, qc;
62 changes: 58 additions & 4 deletions dlib/test/tokenizer.cpp
@@ -1,9 +1,10 @@
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.


#include <string>
#include <sstream>
#include <regex>

#include <dlib/tokenizer.h>
#include "tester.h"
@@ -350,9 +351,62 @@ namespace

}

string postprocess_decoded_text(const string& decoded) {
string result = decoded;
result = regex_replace(result, std::regex("<text>"), "");
result = regex_replace(result, std::regex("</text>"), "\n");
if (!result.empty() && result.back() == '\n') result.pop_back();
return result;
}

template <
typename bpe_tok
>
void bpe_tokenizer_test(
)
/*!
requires
- bpe_tok is an implementation of bpe_tokenizer.h
ensures
- runs tests on bpe_tok for compliance with the specs
!*/
{
print_spinner();

bpe_tok test;

std::string training_text = R"(
Byte Pair Encoding (BPE) is a subword tokenization algorithm widely used in Natural Language Processing (NLP).
It iteratively merges the most frequent pairs of bytes or characters to form a vocabulary of subword units.
This approach is particularly useful for handling out-of-vocabulary words and reducing the size of the vocabulary
while maintaining the ability to represent any text. BPE was introduced in the paper "Neural Machine Translation
of Rare Words with Subword Units" by Sennrich et al. in 2016. The algorithm is simple yet effective and has been
adopted in many state-of-the-art NLP models, including GPT and BERT.
)";

test.train(training_text, 300, true);

std::ostringstream out_stream;
serialize(test, out_stream);

bpe_tok loaded_test;
std::istringstream in_stream(out_stream.str());
deserialize(loaded_test, in_stream);

std::vector<std::string> test_strings = {
u8"This is a test of the tokenisation process...\nimplemented in the Dlib library!", // English
u8"Ceci est un test du processus de\ntokenisation implémenté dans\nla bibliothèque Dlib!", // French
u8"Dette er en test af tokeniseringsprocessen implementeret i Dlib-biblioteket!", // Danish
u8"这是对Dlib库中实现的标记化过程的测试!" // Chinese
};

for (const auto& text : test_strings) {
std::vector<int> encoded = loaded_test.encode(text);
std::string decoded = postprocess_decoded_text(loaded_test.decode(encoded));

DLIB_TEST_MSG(text == decoded, "decoded: " << decoded);
}
}

class tokenizer_tester : public tester
{
@@ -370,9 +424,9 @@ namespace
tokenizer_kernel_test<tokenizer::kernel_1a> ();
dlog << LINFO << "testing kernel_1a_c";
tokenizer_kernel_test<tokenizer::kernel_1a_c>();
dlog << LINFO << "testing bpe_tokenizer";
bpe_tokenizer_test<bpe_tokenizer>();
}
} a;

}


}
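
The training text embedded in the new test summarizes the BPE rule the class implements: repeatedly merge the most frequent adjacent pair of symbols into a single new symbol until the vocabulary reaches a target size. The toy sketch below only makes that merge rule concrete; it is not dlib's implementation, the merge_most_frequent_pair helper is a hypothetical name, and the real bpe_tokenizer additionally builds a persistent vocabulary, handles UTF-8 input, and serializes itself, as the test above exercises.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// One round of the BPE merge step: count adjacent token pairs, then replace
// every occurrence of the most frequent pair with a single merged token.
// A full BPE trainer repeats this until the vocabulary hits its target size.
static void merge_most_frequent_pair(std::vector<std::string>& tokens)
{
    if (tokens.size() < 2)
        return;

    // Count how often each adjacent pair occurs.
    std::map<std::pair<std::string, std::string>, int> counts;
    for (size_t i = 0; i + 1 < tokens.size(); ++i)
        ++counts[{tokens[i], tokens[i + 1]}];

    // Find the most frequent pair.
    auto best = counts.begin();
    for (auto it = counts.begin(); it != counts.end(); ++it)
        if (it->second > best->second)
            best = it;

    // Rewrite the token sequence with that pair fused into one token.
    std::vector<std::string> merged;
    for (size_t i = 0; i < tokens.size(); )
    {
        if (i + 1 < tokens.size() && tokens[i] == best->first.first && tokens[i + 1] == best->first.second)
        {
            merged.push_back(tokens[i] + tokens[i + 1]);
            i += 2;
        }
        else
        {
            merged.push_back(tokens[i]);
            ++i;
        }
    }
    tokens.swap(merged);
}

int main()
{
    // Start from single characters, as a byte/character-level BPE would.
    std::vector<std::string> tokens = {"l", "o", "w", " ", "l", "o", "w", "e", "r"};
    for (int round = 0; round < 3; ++round)
        merge_most_frequent_pair(tokens);
    for (const auto& t : tokens)
        std::cout << "[" << t << "] ";
    std::cout << "\n";
    return 0;
}

Running it on the character sequence for "low lower" merges "l"+"o" into "lo" on the first round and "lo"+"w" into "low" on the second, which is exactly the kind of subword unit a BPE vocabulary accumulates during training.
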
2 changes: 1 addition & 1 deletion dlib/tokenizer.h
@@ -5,7 +5,7 @@

#include "tokenizer/tokenizer_kernel_1.h"
#include "tokenizer/tokenizer_kernel_c.h"

#include "tokenizer/bpe_tokenizer.h"

namespace dlib
{