Add Byte Pair Encoding (BPE) class for subword tokenization #3056

Merged: 7 commits, Mar 23, 2025
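
The diffs below wire a new bpe_tokenizer into dlib/tokenizer.h and add a train/serialize/encode/decode round-trip test for it. As orientation, here is a minimal usage sketch assembled only from the calls that test exercises; the class name and header come straight from the diff, while the dlib namespace qualification, the meaning of train()'s arguments, and the <text> markers mentioned in the comments are inferred from the test rather than from any documentation.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <dlib/tokenizer.h>   // after this PR, also pulls in tokenizer/bpe_tokenizer.h

int main()
{
    // Train a tokenizer on a small corpus. The arguments mirror the new test's
    // call test.train(training_text, 300, true); 300 is presumably the target
    // vocabulary size and the bool a verbosity flag (not documented in this diff).
    dlib::bpe_tokenizer tok;
    tok.train("Byte Pair Encoding iteratively merges the most frequent pairs of characters ...", 300, true);

    // Like other dlib objects, the tokenizer round-trips through serialize()/deserialize(),
    // exactly as the new test does before encoding anything.
    std::ostringstream out;
    dlib::serialize(tok, out);

    dlib::bpe_tokenizer restored;
    std::istringstream in(out.str());
    dlib::deserialize(restored, in);

    // encode() maps text to subword ids and decode() maps them back. The test's
    // postprocess_decoded_text() suggests decode() wraps its output in
    // <text>...</text> markers, which callers may want to strip.
    const std::vector<int> ids = restored.encode("This is a test of the tokenization process!");
    std::cout << restored.decode(ids) << "\n";
    return 0;
}
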
12 changes: 6 additions & 6 deletions dlib/test/queue.cpp
@@ -408,17 +408,17 @@ namespace


 dlog << LINFO << "testing sort_1a_c";
-queue_sort_test<queue<int, mm>::sort_1a_c> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1a_c>();
 dlog << LINFO << "testing sort_1a";
-queue_sort_test<queue<int, mm>::sort_1a>();
+queue_sort_test<dlib::queue<int, mm>::sort_1a>();
 dlog << LINFO << "testing sort_1b";
-queue_sort_test<queue<int, mm>::sort_1b> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1b>();
 dlog << LINFO << "testing sort_1b_c";
-queue_sort_test<queue<int, mm>::sort_1b_c>();
+queue_sort_test<dlib::queue<int, mm>::sort_1b_c>();
 dlog << LINFO << "testing sort_1c";
-queue_sort_test<queue<int, mm>::sort_1c> ();
+queue_sort_test<dlib::queue<int, mm>::sort_1c>();
 dlog << LINFO << "testing sort_1c_c";
-queue_sort_test<queue<int, mm>::sort_1c_c>();
+queue_sort_test<dlib::queue<int, mm>::sort_1c_c>();
}
} a;

2 changes: 1 addition & 1 deletion dlib/test/static_set.cpp
@@ -39,7 +39,7 @@ namespace

srand(static_cast<unsigned int>(time(0)));

-typedef queue<int>::kernel_2a_c queue_of_int;
+typedef dlib::queue<int>::kernel_2a_c queue_of_int;
typedef dlib::set<int>::kernel_1a_c set_of_int;

queue_of_int q, qb, qc;
62 changes: 58 additions & 4 deletions dlib/test/tokenizer.cpp
@@ -1,9 +1,10 @@
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.


#include <string>
#include <sstream>
#include <regex>

#include <dlib/tokenizer.h>
#include "tester.h"
@@ -350,9 +351,62 @@ namespace

}

string postprocess_decoded_text(const string& decoded) {
string result = decoded;
result = regex_replace(result, std::regex("<text>"), "");
result = regex_replace(result, std::regex("</text>"), "\n");
if (!result.empty() && result.back() == '\n') result.pop_back();
return result;
}

template <
typename bpe_tok
>
void bpe_tokenizer_test(
)
/*!
requires
- bpe_tok is an implementation of bpe_tokenizer.h
ensures
- runs tests on bpe_tok for compliance with the specs
!*/
{
print_spinner();

bpe_tok test;

std::string training_text = R"(
Byte Pair Encoding (BPE) is a subword tokenization algorithm widely used in Natural Language Processing (NLP).
It iteratively merges the most frequent pairs of bytes or characters to form a vocabulary of subword units.
This approach is particularly useful for handling out-of-vocabulary words and reducing the size of the vocabulary
while maintaining the ability to represent any text. BPE was introduced in the paper "Neural Machine Translation
of Rare Words with Subword Units" by Sennrich et al. in 2016. The algorithm is simple yet effective and has been
adopted in many state-of-the-art NLP models, including GPT and BERT.
)";

test.train(training_text, 300, true);

std::ostringstream out_stream;
serialize(test, out_stream);

bpe_tok loaded_test;
std::istringstream in_stream(out_stream.str());
deserialize(loaded_test, in_stream);

std::vector<std::string> test_strings = {
u8"This is a test of the tokenisation process...\nimplemented in the Dlib library!", // English
u8"Ceci est un test du processus de\ntokenisation implémenté dans\nla bibliothèque Dlib!", // French
u8"Dette er en test af tokeniseringsprocessen implementeret i Dlib-biblioteket!", // Danish
u8"这是对Dlib库中实现的标记化过程的测试!" // Chinese
};

for (const auto& text : test_strings) {
std::vector<int> encoded = loaded_test.encode(text);
std::string decoded = postprocess_decoded_text(loaded_test.decode(encoded));

DLIB_TEST_MSG(text == decoded, "decoded: " << decoded);
}
}

class tokenizer_tester : public tester
{
@@ -370,9 +424,9 @@ namespace
tokenizer_kernel_test<tokenizer::kernel_1a> ();
dlog << LINFO << "testing kernel_1a_c";
tokenizer_kernel_test<tokenizer::kernel_1a_c>();
dlog << LINFO << "testing bpe_tokenizer";
bpe_tokenizer_test<bpe_tokenizer>();
}
} a;

}


}
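
The training text embedded in the new test summarizes the BPE rule the class implements: repeatedly merge the most frequent adjacent pair of symbols into a single new symbol until the vocabulary reaches a target size. The toy sketch below only makes that merge rule concrete; it is not dlib's implementation, the merge_most_frequent_pair helper is a hypothetical name, and the real bpe_tokenizer additionally builds a persistent vocabulary, handles UTF-8 input, and serializes itself, as the test above exercises.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// One round of the BPE merge step: count adjacent token pairs, then replace
// every occurrence of the most frequent pair with a single merged token.
// A full BPE trainer repeats this until the vocabulary hits its target size.
static void merge_most_frequent_pair(std::vector<std::string>& tokens)
{
    if (tokens.size() < 2)
        return;

    // Count how often each adjacent pair occurs.
    std::map<std::pair<std::string, std::string>, int> counts;
    for (size_t i = 0; i + 1 < tokens.size(); ++i)
        ++counts[{tokens[i], tokens[i + 1]}];

    // Find the most frequent pair.
    auto best = counts.begin();
    for (auto it = counts.begin(); it != counts.end(); ++it)
        if (it->second > best->second)
            best = it;

    // Rewrite the token sequence with that pair fused into one token.
    std::vector<std::string> merged;
    for (size_t i = 0; i < tokens.size(); )
    {
        if (i + 1 < tokens.size() && tokens[i] == best->first.first && tokens[i + 1] == best->first.second)
        {
            merged.push_back(tokens[i] + tokens[i + 1]);
            i += 2;
        }
        else
        {
            merged.push_back(tokens[i]);
            ++i;
        }
    }
    tokens.swap(merged);
}

int main()
{
    // Start from single characters, as a byte/character-level BPE would.
    std::vector<std::string> tokens = {"l", "o", "w", " ", "l", "o", "w", "e", "r"};
    for (int round = 0; round < 3; ++round)
        merge_most_frequent_pair(tokens);
    for (const auto& t : tokens)
        std::cout << "[" << t << "] ";
    std::cout << "\n";
    return 0;
}

Running it on the character sequence for "low lower" merges "l"+"o" into "lo" on the first round and "lo"+"w" into "low" on the second, which is exactly the kind of subword unit a BPE vocabulary accumulates during training.
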
2 changes: 1 addition & 1 deletion dlib/tokenizer.h
@@ -5,7 +5,7 @@

#include "tokenizer/tokenizer_kernel_1.h"
#include "tokenizer/tokenizer_kernel_c.h"

#include "tokenizer/bpe_tokenizer.h"

namespace dlib
{