Skip to content

Commit

Permalink
Add new BPE_Tokenizer class to Dlib
Browse files Browse the repository at this point in the history
- Implement BPE (Byte Pair Encoding) tokenization
- Add training and encoding methods
- Include unit tests
  • Loading branch information
Cydral committed Feb 15, 2025
1 parent ef3d636 commit 5ddf55e
Show file tree
Hide file tree
Showing 5 changed files with 539 additions and 14 deletions.
12 changes: 6 additions & 6 deletions dlib/test/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,17 +408,17 @@ namespace


dlog << LINFO << "testing sort_1a_c";
queue_sort_test<queue<int, mm>::sort_1a_c> ();
queue_sort_test<dlib::queue<int, mm>::sort_1a_c>();
dlog << LINFO << "testing sort_1a";
queue_sort_test<queue<int, mm>::sort_1a>();
queue_sort_test<dlib::queue<int, mm>::sort_1a>();
dlog << LINFO << "testing sort_1b";
queue_sort_test<queue<int, mm>::sort_1b> ();
queue_sort_test<dlib::queue<int, mm>::sort_1b>();
dlog << LINFO << "testing sort_1b_c";
queue_sort_test<queue<int, mm>::sort_1b_c>();
queue_sort_test<dlib::queue<int, mm>::sort_1b_c>();
dlog << LINFO << "testing sort_1c";
queue_sort_test<queue<int, mm>::sort_1c> ();
queue_sort_test<dlib::queue<int, mm>::sort_1c>();
dlog << LINFO << "testing sort_1c_c";
queue_sort_test<queue<int, mm>::sort_1c_c>();
queue_sort_test<dlib::queue<int, mm>::sort_1c_c>();
}
} a;

Expand Down
2 changes: 1 addition & 1 deletion dlib/test/static_set.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace

srand(static_cast<unsigned int>(time(0)));

typedef queue<int>::kernel_2a_c queue_of_int;
typedef dlib::queue<int>::kernel_2a_c queue_of_int;
typedef dlib::set<int>::kernel_1a_c set_of_int;

queue_of_int q, qb, qc;
Expand Down
62 changes: 56 additions & 6 deletions dlib/test/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.


Expand Down Expand Up @@ -350,9 +350,59 @@ namespace

}

template <
    typename bpe_tok
    >
void bpe_tokenizer_test(
)
/*!
    requires
        - bpe_tok is an implementation of bpe_tokenizer.h
    ensures
        - trains a bpe_tok on a short English sample, writes it to
          "bpe_tokenizer_model.dat" with serialize(), reads it back into a
          second bpe_tok with deserialize(), and then encodes and decodes a
          handful of multilingual strings with the reloaded object
        - the original text, the token ids and the decoded text of every
          sample are printed to std::cout; nothing is asserted here, so the
          output has to be inspected by the reader
!*/
{
    print_spinner();

    // Name of the scratch file used for the serialize/deserialize round trip.
    const char* const model_path = "bpe_tokenizer_model.dat";

    std::string corpus = R"(
Byte Pair Encoding (BPE) is a subword tokenization algorithm widely used in Natural Language Processing (NLP).
It iteratively merges the most frequent pairs of bytes or characters to form a vocabulary of subword units.
This approach is particularly useful for handling out-of-vocabulary words and reducing the size of the vocabulary
while maintaining the ability to represent any text. BPE was introduced in the paper "Neural Machine Translation
of Rare Words with Subword Units" by Sennrich et al. in 2016. The algorithm is simple yet effective and has been
adopted in many state-of-the-art NLP models, including GPT and BERT.
)";

    // NOTE(review): 300 and true are presumably the target vocabulary size and
    // a verbosity flag -- confirm against the train() declaration.
    bpe_tok trained;
    trained.train(corpus, 300, true);

    // Save the trained tokenizer; the stream is closed when it leaves scope.
    {
        std::ofstream sink(model_path, std::ios::binary);
        serialize(trained, sink);
    }

    // Load it back into a fresh object so that the samples below exercise the
    // deserialized state rather than the freshly trained one.
    bpe_tok restored;
    {
        std::ifstream source(model_path, std::ios::binary);
        deserialize(restored, source);
    }

    std::vector<std::string> samples = {
        u8"\nThis is a test of the tokenisation process...\nimplemented in the Dlib library!\n", // English
        u8"Ceci est un test du processus de\ntokenisation implémenté dans\nla bibliothèque Dlib!", // French
        u8"Dette er en test af tokeniseringsprocessen implementeret i Dlib-biblioteket!", // Danish
        u8"这是对Dlib库中实现的标记化过程的测试!" // Chinese
    };

    for (auto sample = samples.begin(); sample != samples.end(); ++sample)
    {
        std::vector<int> ids = restored.encode(*sample);
        std::string round_trip = restored.decode(ids);

        std::cout << "Original: " << *sample << "\n" << "Encoded: ";
        for (auto id = ids.begin(); id != ids.end(); ++id)
        {
            std::cout << *id << " ";
        }
        std::cout << "\nDecoded: " << round_trip;
        std::cout << "\n----------------------------------------\n";
    }
}

class tokenizer_tester : public tester
{
Expand All @@ -370,9 +420,9 @@ namespace
tokenizer_kernel_test<tokenizer::kernel_1a> ();
dlog << LINFO << "testing kernel_1a_c";
tokenizer_kernel_test<tokenizer::kernel_1a_c>();
dlog << LINFO << "testing bpe_tokenizer";
bpe_tokenizer_test<bpe_tokenizer>();
}
} a;

}


}
2 changes: 1 addition & 1 deletion dlib/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#include "tokenizer/tokenizer_kernel_1.h"
#include "tokenizer/tokenizer_kernel_c.h"

#include "tokenizer/bpe_tokenizer.h"

namespace dlib
{
Expand Down
Loading

0 comments on commit 5ddf55e

Please sign in to comment.