From 084546bcc23b95c310bfc1284d4315ed6c68d33e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 26 Aug 2023 19:21:22 +0300
Subject: [PATCH] tests : add option to tokenize text files

---
 tests/test-tokenizer-0.cpp | 44 +++++++++++++++++++++++++++++++++++++-
 tests/test-tokenizer-0.py  | 18 ++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index f7c4dcf77987d0..5c589420852067 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
 
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
     llama_model * model;
     llama_context * ctx;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }
 
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
 
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
index 722ba81118f759..a21e9ed70c5b89 100644
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -6,6 +6,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()
 
 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@
 for x in res:
     print("%7d," % x, end='')
 print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)