From 9b354ae0029f37a169d8ce0b7d68405c6edbaec6 Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 19 Feb 2025 14:47:04 -0800 Subject: [PATCH] Tokenizer test (#21) Summary: X-link: https://github.com/pytorch/executorch/pull/8586 Test Plan: ## OSS Build ``` cmake . -DCMAKE_INSTALL_PREFIX=cmake-out -DTOKENIZERS_BUILD_TEST=ON -Bcmake-out cmake --build cmake-out -j9 --target install ``` Test ``` (executorch) [lfq@devvm20128.prn0 /data/users/lfq/tokenizers/cmake-out (lfq.tokenizer-test)]$ ctest Test project /data/users/lfq/tokenizers/cmake-out Start 1: test_base64 1/5 Test #1: test_base64 ...................... Passed 0.00 sec Start 2: test_llama2c_tokenizer 2/5 Test #2: test_llama2c_tokenizer ........... Passed 0.00 sec Start 3: test_pre_tokenizer 3/5 Test #3: test_pre_tokenizer ............... Passed 0.73 sec Start 4: test_sentencepiece 4/5 Test #4: test_sentencepiece ............... Passed 0.04 sec Start 5: test_tiktoken 5/5 Test #5: test_tiktoken .................... 
Passed 3.32 sec 100% tests passed, 0 tests failed out of 5 Total Test time (real) = 4.10 sec ``` ## Internal ``` buck2 test fbsource//xplat/pytorch/tokenizers/test: buck2 test fbcode//pytorch/tokenizers/test: ``` Reviewed By: larryliu0820 Differential Revision: D69860352 Pulled By: lucylq --- targets.bzl | 6 +++ test/TARGETS | 8 +++ test/targets.bzl | 87 +++++++++++++++++++++++++++++++++ test/test_llama2c_tokenizer.cpp | 8 +-- test/test_sentencepiece.cpp | 9 ---- test/test_tiktoken.cpp | 8 --- 6 files changed, 102 insertions(+), 24 deletions(-) create mode 100644 test/TARGETS create mode 100644 test/targets.bzl diff --git a/targets.bzl b/targets.bzl index 7504dfc..824c611 100644 --- a/targets.bzl +++ b/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): ]), visibility = [ "@EXECUTORCH_CLIENTS", + "//pytorch/tokenizers/...", ], header_namespace = "", ) @@ -29,12 +30,14 @@ def define_common_targets(): ], visibility = [ "@EXECUTORCH_CLIENTS", + "//pytorch/tokenizers/...", ], compiler_flags = [ "-D_USE_INTERNAL_STRING_VIEW", ], external_deps = [ "sentencepiece", + "abseil-cpp", ], ) @@ -49,6 +52,7 @@ def define_common_targets(): ], visibility = [ "@EXECUTORCH_CLIENTS", + "//pytorch/tokenizers/...", ], compiler_flags = [ "-D_USE_INTERNAL_STRING_VIEW", @@ -84,6 +88,7 @@ def define_common_targets(): ], visibility = [ "@EXECUTORCH_CLIENTS", + "//pytorch/tokenizers/...", ], compiler_flags = [ "-D_USE_INTERNAL_STRING_VIEW", @@ -104,5 +109,6 @@ def define_common_targets(): ], visibility = [ "@EXECUTORCH_CLIENTS", + "//pytorch/tokenizers/...", ], ) diff --git a/test/TARGETS b/test/TARGETS new file mode 100644 index 0000000..2341af9 --- /dev/null +++ b/test/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/test/targets.bzl b/test/targets.bzl new file mode 100644 index 0000000..b250a1c --- /dev/null +++ b/test/targets.bzl @@ -0,0 +1,87 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", + "CXX", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + runtime.cxx_test( + name = "test_base64", + srcs = [ + "test_base64.cpp", + ], + deps = [ + "//pytorch/tokenizers:headers", + ], + ) + + runtime.cxx_test( + name = "test_llama2c_tokenizer", + srcs = [ + "test_llama2c_tokenizer.cpp", + ], + deps = [ + "//pytorch/tokenizers:llama2c_tokenizer", + ], + env = { + "RESOURCES_PATH": "$(location :resources)/resources", + }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. + ) + + runtime.cxx_test( + name = "test_pre_tokenizer", + srcs = [ + "test_pre_tokenizer.cpp", + ], + deps = [ + "//pytorch/tokenizers:headers", + "//pytorch/tokenizers:hf_tokenizer", + ], + ) + + runtime.cxx_test( + name = "test_sentencepiece", + srcs = [ + "test_sentencepiece.cpp", + ], + deps = ["//pytorch/tokenizers:sentencepiece"], + external_deps = [ + "sentencepiece", + "abseil-cpp", + ], + env = { + "RESOURCES_PATH": "$(location :resources)/resources", + }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. + ) + + runtime.cxx_test( + name = "test_tiktoken", + srcs = [ + "test_tiktoken.cpp", + ], + deps = [ + "//pytorch/tokenizers:tiktoken", + ], + env = { + "RESOURCES_PATH": "$(location :resources)/resources", + }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. 
+ external_deps = [ + "re2", + ], + ) + + runtime.filegroup( + name = "resources", + srcs = native.glob([ + "resources/**", + ]), + ) diff --git a/test/test_llama2c_tokenizer.cpp b/test/test_llama2c_tokenizer.cpp index 4e158e7..211f9db 100644 --- a/test/test_llama2c_tokenizer.cpp +++ b/test/test_llama2c_tokenizer.cpp @@ -1,10 +1,4 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. #ifdef TOKENIZERS_FB_BUCK #include diff --git a/test/test_sentencepiece.cpp b/test/test_sentencepiece.cpp index 8c5e1e9..b8ffd8a 100644 --- a/test/test_sentencepiece.cpp +++ b/test/test_sentencepiece.cpp @@ -7,9 +7,6 @@ */ // @lint-ignore-every LICENSELINT -#ifdef TOKENIZERS_FB_BUCK -#include -#endif #include #include @@ -17,14 +14,8 @@ namespace tokenizers { namespace { static inline std::string _get_resource_path(const std::string& name) { -#ifdef TOKENIZERS_FB_BUCK - return facebook::xplat::testing::getPathForTestResource( - "test/resources/" + name); -#else return std::getenv("RESOURCES_PATH") + std::string("/") + name; -#endif } - } // namespace TEST(SPTokenizerTest, TestEncodeWithoutLoad) { diff --git a/test/test_tiktoken.cpp b/test/test_tiktoken.cpp index 86af4fe..a7c094e 100644 --- a/test/test_tiktoken.cpp +++ b/test/test_tiktoken.cpp @@ -7,9 +7,6 @@ */ // @lint-ignore-every LICENSELINT -#ifdef TOKENIZERS_FB_BUCK -#include -#endif #include #include @@ -45,12 +42,7 @@ static inline std::unique_ptr> _get_special_tokens() { } static inline std::string _get_resource_path(const std::string& name) { -#ifdef TOKENIZERS_FB_BUCK - return facebook::xplat::testing::getPathForTestResource( - "test/resources/" + name); -#else return std::getenv("RESOURCES_PATH") + std::string("/") + name; -#endif } } // namespace