From 037c803786bc53501d9b3b1996a281c0bf7cdbeb Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 30 May 2024 06:44:36 +0200
Subject: [PATCH] map stop_criteria in pybind; fix genai env manager on Win;
 fix failing multibatch tests;

---
 .../openvino/genai/generation_config.hpp |  16 +-
 src/cpp/src/generation_config.cpp        |   6 +-
 src/cpp/src/greedy_decoding.cpp          |   3 +-
 src/cpp/src/group_beam_searcher.cpp      |   8 +-
 src/cpp/src/llm_pipeline.cpp             |  61 +-----
 src/cpp/src/streamer_base.cpp            |  88 ++++++++
 src/cpp/src/tokenizer.cpp                |  72 +++++--
 src/cpp/src/utils.cpp                    |   1 -
 src/python/CMakeLists.txt                |   4 +-
 src/python/openvino_genai/__init__.py    |  12 +-
 src/python/py_generate_pipeline.cpp      | 189 +++++++++++++-----
 tests/python_tests/conftest.py           |   8 +
 tests/python_tests/generate_api_check.py |  25 ---
 tests/python_tests/list_test_models.py   |   6 +-
 tests/python_tests/test_generate_api.py  |  51 +++--
 15 files changed, 364 insertions(+), 186 deletions(-)
 create mode 100644 src/cpp/src/streamer_base.cpp
 create mode 100644 tests/python_tests/conftest.py
 delete mode 100644 tests/python_tests/generate_api_check.py

diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 82a450b619..fe78e0270b 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -16,11 +16,11 @@ namespace genai {
 /**
  * @brief controls the stopping condition for grouped beam search. The following values are possible:
- *        "early" stops as soon as there are `num_beams` complete candidates.
-          "heuristic" stops when is it unlikely to find better candidates.
-          "never" stops when there cannot be better candidates.
+ *        "EARLY" stops as soon as there are `num_beams` complete candidates.
+          "HEURISTIC" stops when it is unlikely to find better candidates.
+          "NEVER" stops when there cannot be better candidates.
  */
-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria { EARLY, HEURISTIC, NEVER };
 
 /**
  * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
  * @param num_return_sequences the number of sequences to return for grouped beam search decoding.
  * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
  * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
- *        "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
- *        heuristic is applied and the generation stops when is it very unlikely to find better candidates;
- *        "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+ *        "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+ *        heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  *
  * Random sampling parameters:
  * @param temperature the value used to modulate token probabilities for random sampling.
@@ -78,7 +78,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { float length_penalty = 1.0f; size_t num_return_sequences = 1; size_t no_repeat_ngram_size = std::numeric_limits::max(); - StopCriteria stop_criteria = StopCriteria::heuristic; + StopCriteria stop_criteria = StopCriteria::HEURISTIC; // Multinomial float temperature = 1.0f; diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 5569a759b0..f07ad13ea4 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -44,11 +44,11 @@ GenerationConfig::GenerationConfig(std::string json_path) { if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { - stop_criteria = StopCriteria::never; + stop_criteria = StopCriteria::NEVER; } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { - stop_criteria = StopCriteria::early; + stop_criteria = StopCriteria::EARLY; } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { - stop_criteria = StopCriteria::heuristic; + stop_criteria = StopCriteria::HEURISTIC; } } diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index be3e04d337..48cedf09f0 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -63,8 +63,6 @@ EncodedResults greedy_decoding( auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + running_batch_size, 0); - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); ov::Shape logits_shape = logits.get_shape(); @@ -88,6 +86,7 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); for (size_t i = 0; i < max_tokens - 1; ++i) { utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 5c852298d0..0288b255d9 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -91,7 +91,7 @@ struct Parameters { size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -128,15 +128,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case ov::genai::StopCriteria::early: + case ov::genai::StopCriteria::EARLY: done = true; return; - case ov::genai::StopCriteria::heuristic: { + case ov::genai::StopCriteria::HEURISTIC: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case ov::genai::StopCriteria::never: { + case ov::genai::StopCriteria::NEVER: { size_t length = parameters.length_penalty > 0.0 ? 
parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index cdd1d4f67f..10b7da499a 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -15,33 +15,6 @@ #include "utils.hpp" #include "text_callback_streamer.hpp" -#ifdef _WIN32 -# include -# define MAX_ABS_PATH _MAX_PATH -# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) -#else -# include -# include -# define MAX_ABS_PATH PATH_MAX -# define get_absolute_path(result, path) realpath(path.c_str(), result) -namespace { -std::string get_absolute_file_path(const std::string& path) { - std::string absolutePath; - absolutePath.resize(MAX_ABS_PATH); - std::ignore = get_absolute_path(&absolutePath[0], path); - if (!absolutePath.empty()) { - // on Linux if file does not exist or no access, function will return NULL, but - // `absolutePath` will contain resolved path - absolutePath.resize(absolutePath.find('\0')); - return std::string(absolutePath); - } - std::stringstream ss; - ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); - throw std::runtime_error(ss.str()); -} -} -#endif - namespace { const std::string STREAMER_ARG_NAME = "streamer"; @@ -86,30 +59,6 @@ std::string from_tokenizer_json_if_exists(const std::string& path) { return res; } - - -std::string get_ov_genai_library_path() { -#ifdef _WIN32 - CHAR genai_library_path[MAX_PATH]; - HMODULE hm = NULL; - if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(get_ov_genai_library_path), - &hm)) { - std::stringstream ss; - ss << "GetModuleHandle returned " << GetLastError(); - throw std::runtime_error(ss.str()); - } - GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); - return std::string(genai_library_path); -#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) - Dl_info info; - dladdr(reinterpret_cast(get_ov_genai_library_path), &info); - return get_absolute_file_path(info.dli_fname).c_str(); -#else -# error "Unsupported OS" -#endif // _WIN32 -} - ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { ov::genai::StreamerVariant streamer = std::monostate(); @@ -194,6 +143,8 @@ class LLMPipeline::LLMPipelineImpl { ) { GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + auto old_pad_token_id = m_tokenizer.get_pad_token_id(); + m_tokenizer.set_pad_token_id(config.pad_token_id); EncodedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { @@ -214,7 +165,6 @@ class LLMPipeline::LLMPipelineImpl { auto input_ids = res.input_ids; auto attention_mask = res.attention_mask; - // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. 
// Need to remove both of that tokens manually to get exact token by token alignment with HF @@ -243,6 +193,7 @@ class LLMPipeline::LLMPipelineImpl { encoded_input = TokenizedInputs{input_ids, attention_mask}; } + m_tokenizer.set_pad_token_id(old_pad_token_id); auto encoded_results = generate(encoded_input, config, streamer); return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; } @@ -285,8 +236,6 @@ class LLMPipeline::LLMPipelineImpl { OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding"); } - // auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids); - if (config.is_greedy_decoding()) { result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr, is_chat_conversation); } else if (config.is_beam_search()) { @@ -431,14 +380,12 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const ov::AnyMap& config ): m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer(path, device), m_generation_config{from_config_json_if_exists(path)}, m_chat_template{from_tokenizer_json_if_exists(path)} { - ov::genai::utils::GenAIEnvManager env_manager(get_ov_genai_library_path()); - m_tokenizer = Tokenizer(path, device); } - ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; } diff --git a/src/cpp/src/streamer_base.cpp b/src/cpp/src/streamer_base.cpp new file mode 100644 index 0000000000..37bb20cfc2 --- /dev/null +++ b/src/cpp/src/streamer_base.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +// class LambdaStreamer::LambdaStreamerImpl { +// public: +// LambdaStreamerImpl(Tokenizer tokenizer, std::function func): m_tokenizer(tokenizer), m_func(func) {} +// LambdaStreamerImpl(std::function func): m_func(func) {} + +// Tokenizer m_tokenizer; +// std::function m_func; +// bool m_print_eos_token = false; +// std::vector m_tokens_cache; +// size_t print_len = 0; + +// bool put(int64_t token) { +// std::stringstream res; +// // do nothing if token is met and if print_eos_token=false +// if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id()) +// return m_func(res.str()); + +// m_tokens_cache.push_back(token); +// std::string text = m_tokenizer.decode(m_tokens_cache); +// if (!text.empty() && '\n' == text.back()) { +// // Flush the cache after the new line symbol +// res << std::string_view{text.data() + print_len, text.size() - print_len}; +// m_tokens_cache.clear(); +// print_len = 0; +// return m_func(res.str()); +// } +// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { +// // Don't print incomplete text +// return m_func(res.str()); +// } +// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// print_len = text.size(); +// return m_func(res.str()); +// } + +// bool end() { +// std::stringstream res; +// std::string text = m_tokenizer.decode(m_tokens_cache); +// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// m_tokens_cache.clear(); +// print_len = 0; +// return m_func(res.str()); +// } + +// }; + +// LambdaStreamer::LambdaStreamer(Tokenizer tokenizer, std::function func) {} + +// 
LambdaStreamer::LambdaStreamer(std::function func) { +// m_pimpl = std::make_shared(func); +// } + +// void LambdaStreamer::put(int64_t token) { m_pimpl -> put(token);} + +// void LambdaStreamer::end() { m_pimpl->end();} + +} // namespace genai +} // namespace ov + + + +// class LambdaStreamer: public StreamerBase { +// public: +// // LambdaStreamer(Tokenizer tokenizer, std::function func); +// LambdaStreamer(std::function func); + +// void put(int64_t token) override; +// void end() override; + +// bool operator==(const LambdaStreamer& other) const { +// // For simplicity, we assume lambdas are not comparable. +// // If you need to compare actual logic, you may need to use type erasure or another method. +// return false; // This can be changed based on your specific needs. +// } +// private: + +// class LambdaStreamerImpl; +// std::shared_ptr m_pimpl; +// }; \ No newline at end of file diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 05e0c0d5db..4b6cd4e8ac 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -4,12 +4,11 @@ #include #include "openvino/genai/tokenizer.hpp" #include "utils.hpp" -#include namespace { // todo: remove when openvino-tokenizers will support left padding -ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { +ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token_id) { const size_t batch_size = input_ids.get_shape()[0]; const size_t sequence_length = input_ids.get_shape()[1]; int64_t* inputs_data = input_ids.data(); @@ -19,14 +18,14 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attenti const size_t batch_offset = batch * sequence_length; // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != pad_token) + if (inputs_data[batch_offset + sequence_length - 1] != pad_token_id) continue; size_t pad_tokens_number = 0; for (int i = sequence_length - 1; i >= 0; i--) { const size_t token_offset = batch_offset + i; - if (inputs_data[token_offset] == pad_token) + if (inputs_data[token_offset] == pad_token_id) continue; if (pad_tokens_number == 0) @@ -40,19 +39,67 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attenti return {input_ids, attention_mask}; } -std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { #ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; -#elif __linux__ - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; -#elif __APPLE__ - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; +# include +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include +# include +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) + +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} 
#endif - return path.parent_path() / tokenizers; + +std::string get_ov_genai_library_path() { + #ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(get_ov_genai_library_path), + &hm)) { + std::stringstream ss; + ss << "GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); + #elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast(get_ov_genai_library_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); + #else + # error "Unsupported OS" + #endif // _WIN32 } +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { + #ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; + #elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; + #elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; + #endif + return path.parent_path() / tokenizers; } +} // namespace + namespace ov { namespace genai { @@ -73,7 +120,7 @@ class Tokenizer::TokenizerImpl { const char* ov_tokenizers_path = getenv(ov::genai::utils::get_tokenizers_env_name()); if (ov_tokenizers_path) { - core.add_extension(with_openvino_tokenizers(ov_tokenizers_path)); + core.add_extension(ov_tokenizers_path); } else { OPENVINO_THROW("openvino_tokenizers path is not set"); } @@ -158,6 +205,7 @@ class Tokenizer::TokenizerImpl { }; Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { + ov::genai::utils::GenAIEnvManager env_manager(with_openvino_tokenizers(get_ov_genai_library_path())); m_pimpl = std::make_shared(tokenizers_path, device); } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dc123d04c4..7dac6571dc 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -62,7 +62,6 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti for (size_t i = 0; i < seq_length; i++) { const size_t element_offset = batch * seq_length + i; position_ids_data[element_offset] = sum; - // sum += 1; if (attention_mask_data[element_offset] == 1) { sum += 1; } diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 7802561837..e53ba6ca02 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -15,9 +15,7 @@ if(NOT pybind11_POPULATED) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) endif() -# to be able to use utils.hpp in pybind -include_directories(${CMAKE_SOURCE_DIR}/src/cpp/src/) -pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp ${CMAKE_SOURCE_DIR}/src/cpp/src/utils.cpp) +pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE openvino::genai nlohmann_json::nlohmann_json) set_target_properties(py_generate_pipeline PROPERTIES LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index f23e447d5f..1e3f0b393c 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -9,6 +9,14 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) -from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, 
DecodedResults, EncodedResults, StreamerBase +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults, StreamerBase, StopCriteria -__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults', 'StreamerBase'] +__all__ = [ + 'LLMPipeline', + 'Tokenizer', + 'GenerationConfig', + 'DecodedResults', + 'EncodedResults', + 'StreamerBase', + 'StopCriteria' +] diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 7099b11548..3b9b80897f 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -6,7 +6,6 @@ #include #include #include "openvino/genai/llm_pipeline.hpp" -#include "utils.hpp" #ifdef _WIN32 # include @@ -35,6 +34,47 @@ std::string get_absolute_file_path(const std::string& path) { } #endif +namespace { + +// dublicates GenAIEnvManager from ov::genai::utils, since +// it was problematic getting access to that on Win + +const char* get_tokenizers_env_name() { return "OPENVINO_TOKENIZERS_PATH_GENAI"; } + +class GenAIEnvManager { +public: + GenAIEnvManager(const std::string& path) { + #ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ::get_tokenizers_env_name()); + if (value == nullptr) + _putenv_s(::get_tokenizers_env_name(), path.c_str()); + #else + if (!getenv(::get_tokenizers_env_name())) + setenv(::get_tokenizers_env_name(), path.c_str(), 1); + #endif + else + was_already_set = true; + } + + ~GenAIEnvManager() { + if (!was_already_set){ + #ifdef _WIN32 + _putenv_s(::get_tokenizers_env_name(), ""); + #else + unsetenv(::get_tokenizers_env_name()); + #endif + } + } + +private: + bool was_already_set; +}; + +} + + namespace py = pybind11; using ov::genai::LLMPipeline; using ov::genai::Tokenizer; @@ -44,24 +84,9 @@ using ov::genai::DecodedResults; using ov::genai::StopCriteria; using ov::genai::StreamerBase; using ov::genai::StreamerVariant; +using ov::genai::OptionalGenerationConfig; namespace { -void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ - if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; - else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; - else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; - else OPENVINO_THROW(stop_criteria_str + " is incorrect value of stop_criteria. " - "Allowed values are: \"early\", \"never\", \"heuristic\". 
"); -} - -std::string stop_criteria_to_str(const GenerationConfig& config) { - switch (config.stop_criteria) { - case StopCriteria::early: return "early"; - case StopCriteria::heuristic: return "heuristic"; - case StopCriteria::never: return "never"; - default: throw std::runtime_error("Incorrect stop_criteria"); - } -} void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); @@ -73,7 +98,7 @@ void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwarg if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); if (kwargs.contains("num_return_sequences")) config.num_return_sequences = kwargs["num_return_sequences"].cast(); if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); - if (kwargs.contains("stop_criteria")) str_to_stop_criteria(config, kwargs["stop_criteria"].cast()); + if (kwargs.contains("stop_criteria")) config.stop_criteria = kwargs["stop_criteria"].cast(); if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); @@ -152,6 +177,7 @@ std::string ov_tokenizers_module_path() { } return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } + class EmptyStreamer: public StreamerBase { // It's impossible to create an instance of pure virtual class. Define EmptyStreamer instead. void put(int64_t token) override { @@ -166,35 +192,101 @@ class EmptyStreamer: public StreamerBase { PYBIND11_OVERRIDE_PURE(void, StreamerBase, end); } }; + +ov::InferRequest& get_request_from_pyobj(py::object obj) { + py::str obj_type = py::str(obj.get_type()); + // todo: InferRequest is not accessible from the outside. + // obj_type is openvino._pyopenvino.InferRequest, + // which is a pybind binding to InferRequestWrapper (InferRequest is in a m_request field of the latest) + // and the definition of InferRequestWrapper is not accessible from the outside. + + if (py::isinstance(obj)) { + // Directly return the casted object without copying + return obj.cast(); + } else { + throw std::invalid_argument("Provided object is not castable to ov::InferRequest"); + } } +} // namespace + + PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline") - .def(py::init(), - py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", - py::arg("plugin_config") = ov::AnyMap{}) .def(py::init([](const std::string& model_path, - const std::string& device, - const ov::AnyMap& plugin_config) { - ov::genai::utils::GenAIEnvManager env_manager(ov_tokenizers_module_path()); - return std::make_unique(model_path, device, plugin_config);}), + const std::string& device) { + ::GenAIEnvManager env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, device);}), py::arg("model_path"), "path to the model path", py::arg("device") = "CPU", "device on which inference will be done", - py::arg("plugin_config") = ov::AnyMap(), - "LLMPipeline class constructor.\n" - " model_path (str): Path to the model file.\n" - " device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.\n" - " plugin_config (ov::AnyMap): Plugin configuration settings. 
Default is an empty.")
-
-    .def("__call__", py::overload_cast(&call_with_kwargs))
-    .def("__call__", py::overload_cast(&call_with_config))
-
+        R"(
+            LLMPipeline class constructor.
+            model_path (str): Path to the model file.
+            device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
+        )")
+
+        .def(py::init(),
+        py::arg("model_path"),
+        py::arg("tokenizer"),
+        py::arg("device") = "CPU",
+        R"(
+            LLMPipeline class constructor for a manually created openvino_genai.Tokenizer.
+            model_path (str): Path to the model file.
+            tokenizer (openvino_genai.Tokenizer): tokenizer object.
+            device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
+        )")
+
+        .def(py::init([](py::object infer_request,
+                         const Tokenizer& tokenizer,
+                         OptionalGenerationConfig config) {
+            ::GenAIEnvManager env_manager(ov_tokenizers_module_path());
+            return std::make_unique(get_request_from_pyobj(infer_request), tokenizer, config);
+        }),
+        py::arg("infer_request"), "infer_request",
+        py::arg("tokenizer"), "openvino_genai.Tokenizer object",
+        py::arg("config"), "openvino_genai.GenerationConfig object")
+        .def("generate", py::overload_cast(&call_with_kwargs),
+        R"(
+            max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
+                `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
+            max_new_tokens: the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
+            pad_token_id: token_id of <pad> (padding)
+            bos_token_id: token_id of <bos> (beginning of sentence)
+            eos_token_id: token_id of <eos> (end of sentence)
+            bos_token: <bos> token string representation
+            eos_token: <eos> token string representation
+
+            Beam search specific parameters:
+            num_beams: number of beams for beam search. 1 disables beam search.
+            num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time.
+            length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+                the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+                likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+                `length_penalty` < 0.0 encourages shorter sequences.
+            num_return_sequences: the number of sequences to return for grouped beam search decoding.
+            no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
+            stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
+                "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+                heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+                "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+
+            Random sampling parameters:
+            temperature: the value used to modulate token probabilities for random sampling.
+            top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+            top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+            do_sample: whether or not to use multinomial random sampling.
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+        )")
         .def("generate", py::overload_cast&, const py::kwargs&>(&call_with_kwargs))
         .def("generate", py::overload_cast&, const GenerationConfig&, const StreamerVariant&>(&call_with_config))
-        .def("generate", py::overload_cast(&call_with_kwargs))
         .def("generate", py::overload_cast(&call_with_config))
+
+        .def("__call__", py::overload_cast(&call_with_kwargs))
+        .def("__call__", py::overload_cast(&call_with_config))
 
         // todo: if input_ids is a ov::Tensor/numpy tensor
 
@@ -206,19 +298,24 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         .def("apply_chat_template", &LLMPipeline::apply_chat_template);
 
     // Binding for Tokenizer
-    py::class_(m, "Tokenizer")
+    py::class_(m, "Tokenizer",
+        R"(openvino_genai.Tokenizer object is used to initialize the tokenizer if it's located in a different path
+        than the main model.)")
         .def(py::init<>())
         .def(py::init(),
-             py::arg("tokenizers_path"),
-             py::arg("device") = "CPU")
+            py::arg("tokenizers_path"),
+            py::arg("device") = "CPU");
 
-        // todo: implement encode/decode when for numpy inputs and outputs
-        .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt")
-        // TODO: common.h(1106...) template argument deduction/substitution failed:
-        // .def("encode", py::overload_cast&>(&Tokenizer::encode), "Encode multiple prompts")
-        .def("decode", py::overload_cast>(&Tokenizer::decode), "Decode a list of tokens")
-        .def("decode", py::overload_cast(&Tokenizer::decode), "Decode a tensor of tokens")
-        .def("decode", py::overload_cast>>(&Tokenizer::decode), "Decode multiple lines of tokens");
+    // Binding for StopCriteria
+    py::enum_(m, "StopCriteria",
+        R"(StopCriteria controls the stopping condition for grouped beam search. The following values are possible:
+           "EARLY" stops as soon as there are `num_beams` complete candidates.
+           "HEURISTIC" stops when it is unlikely to find better candidates.
+ "NEVER" stops when there cannot be better candidates.)") + .value("EARLY", StopCriteria::EARLY) + .value("HEURISTIC", StopCriteria::HEURISTIC) + .value("NEVER", StopCriteria::NEVER) + .export_values(); // Binding for GenerationConfig py::class_(m, "GenerationConfig") @@ -233,7 +330,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("length_penalty", &GenerationConfig::length_penalty) .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) - .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) + .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) .def_readwrite("temperature", &GenerationConfig::temperature) .def_readwrite("top_p", &GenerationConfig::top_p) .def_readwrite("top_k", &GenerationConfig::top_k) diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py new file mode 100644 index 0000000000..b990d96428 --- /dev/null +++ b/tests/python_tests/conftest.py @@ -0,0 +1,8 @@ +def pytest_make_parametrize_id(config, val, argname): + if argname in ['prompt', 'promtps']: + return f'{val}' + if argname in 'stop_criteria': + return str(val) + if isinstance(val, (int, float, str)): + return f'{argname}={val}' + return None diff --git a/tests/python_tests/generate_api_check.py b/tests/python_tests/generate_api_check.py deleted file mode 100644 index ad0851fea2..0000000000 --- a/tests/python_tests/generate_api_check.py +++ /dev/null @@ -1,25 +0,0 @@ -import openvino_genai as ov_genai -model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' -path = '/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0' -device = 'CPU' -pipe = ov_genai.LLMPipeline(path, device) - -from transformers import AutoTokenizer, AutoModelForCausalLM -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_pretrained(model_id) - -prompt = 'table is made of' -generation_config = {'max_new_tokens': 10} - -encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) -hf_encoded_output = model.generate(encoded_prompt, **generation_config) -hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) - - - -import os -build_dir = os.getenv('GENAI_BUILD_DIR', 'build') -ov_tokenizers_path = f'{build_dir}/openvino_tokenizers/src/' -# pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path) - -ov_output = pipe.generate(prompt, **generation_config) diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py index a67f47c6df..a9454fc211 100644 --- a/tests/python_tests/list_test_models.py +++ b/tests/python_tests/list_test_models.py @@ -1,15 +1,15 @@ def models_list(): model_ids = [ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), - # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # no free disk space lefton CI machine - # ("microsoft/phi-1_5", "phi-1_5/"), + ("microsoft/phi-1_5", "phi-1_5/"), + # ("google/gemma-2b-it", "gemma-2b-it"), # ("google/gemma-7b-it", "gemma-7b-it"), # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"), # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"), # ("openlm-research/open_llama_3b", "open_llama_3b"), # ("openlm-research/open_llama_7b", "open_llama_7b"), - # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # Please check that openvino_tokenizer.xml and openvino_detokenizer.xml exist + # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # ("databricks/dolly-v2-12b", "dolly-v2-12b"), 
] import os diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 5e2029ea84..d000891c3e 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -2,11 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai +from openvino_genai import StopCriteria import pytest from list_test_models import models_list from typing import Union, List, Dict -@pytest.fixture(scope="module", params=models_list()) + +@pytest.fixture(scope="module", params=models_list(), + ids=lambda param: param[0].split('/', 1)[1] if '/' in param[0] else param[0]) def model_fixture(request): model_id, path = request.param from transformers import AutoTokenizer, AutoModelForCausalLM @@ -19,6 +22,7 @@ def model_fixture(request): del model gc.collect() + def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, prompts: Union[str, List[str]]): model_id, path, tokenizer, model = model_fixture device = 'CPU' @@ -63,11 +67,11 @@ def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, p ov_outputs.sort() for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): if hf_output != ov_output: - print(f'Prompt {i}:') print(f'hf_output: {hf_output}') print(f'ov_output: {ov_output}') assert hf_output == ov_output + def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): device = 'CPU' model_id, path, tokenizer, model = model_fixture @@ -81,9 +85,6 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): config['do_sample'] = False generation_config_hf = config.copy() - # in OpenVINO GenAI this parameter is called stop_criteria, - # while in HF it's called early_stopping. - # HF values True, False and "never" correspond to OV GenAI values "early", "heuristic" and "never" if generation_config_hf.get('stop_criteria'): generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] @@ -99,7 +100,6 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): ov_output = ov_output[0] if hf_output != ov_output: - print(f'Prompt {i}:') print(f'hf_output: {hf_output}') print(f'ov_output: {ov_output}') @@ -107,10 +107,18 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): def stop_criteria_map(): - return {"never": "never", "early": True, "heuristic": False} + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + return { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + test_cases = [ - (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt + (dict(max_new_tokens=20), 'table is made of'), # generation_config, prompt (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), @@ -121,23 +129,25 @@ def stop_criteria_map(): def test_decoding(model_fixture, generation_config, prompt): run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + test_configs = [ - dict(max_new_tokens=20, do_sample=False), - dict(num_beam_groups=3, num_beams=15, max_new_tokens=20, diversity_penalty=1.0) + dict(max_new_tokens=20), + dict( max_new_tokens=20, num_beam_groups=3, num_beams=15,diversity_penalty=1.0) ] -batched_prompts = [['table is made of', 'They sky is blue because', 'Difference between Jupiter and Marks is that'] - ,['hello', 'Here is the longest nowel ever: ']] +batched_prompts = [['table is made of', 'They sky is blue because', 'Difference between Jupiter and Marks is that'], + ['hello', 'Here is the longest nowel ever: ']] @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) def test_multibatch(model_fixture, generation_config, prompts): + generation_config['pad_token_id'] = 2 run_hf_ov_genai_comparison_batched(model_fixture, generation_config, prompts) -prompts = ['The Sun is yellow because', 'Alan Turing was a', 'table is made of'] +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Marks is that', 'table is made of'] @pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) @pytest.mark.parametrize("group_size", [5, 3, 10]) @pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5]) +@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) @pytest.mark.parametrize("prompt", prompts) def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): @@ -151,12 +161,12 @@ def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) -@pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"]) +@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) @pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("max_new_tokens", [20, 40, 300]) +@pytest.mark.parametrize("max_new_tokens", [10, 80]) def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): - # todo: for long sentences early stop_criteria fails - if (stop_criteria == 'early' and max_new_tokens >= 300): + # todo: for long sentences EARLY stop_criteria fails + if (stop_criteria == StopCriteria.EARLY and max_new_tokens >= 300): pytest.skip() generation_config = dict( num_beam_groups=2, @@ -222,7 +232,8 @@ def __init__(self, tokenizer): super().__init__() self.tokenizer = tokenizer def put(self, token_id): - print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy 
to implement
+        # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement
+        print(token_id) # print only the token id because self.tokenizer.decode([token_id]) is not implemented yet
     def end(self):
         print('end')
 
@@ -262,7 +273,7 @@ def test_operator_wit_callback_one_string(model_fixture, callback):
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 def test_operator_wit_callback_batch_fail(model_fixture, callback):
     pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
-    with pytest.raises(RuntimeError):
+    with pytest.raises(Exception):
        pipe(['1', '2'], openvino_genai.GenerationConfig(), callback)
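
For context, a minimal sketch of how the StopCriteria enum bound above is meant to be used from Python. The model directory below is a placeholder for any exported OpenVINO model with its tokenizer, and the keyword arguments mirror the GenerationConfig fields described in the generate() docstring; this is an illustrative sketch, not code from the patch.

import openvino_genai as ov_genai
from openvino_genai import StopCriteria

pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')  # placeholder model path

# stop_criteria now takes the enum (EARLY / HEURISTIC / NEVER) instead of the
# former "early" / "heuristic" / "never" strings.
outputs = pipe.generate(
    'Alan Turing was a',
    num_beam_groups=2,
    num_beams=8,
    num_return_sequences=8,
    diversity_penalty=1.0,
    max_new_tokens=20,
    stop_criteria=StopCriteria.EARLY,
)
print(outputs)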
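
Likewise, a hedged sketch of the batched call that the updated test_multibatch exercises. Pinning pad_token_id in the generation config mirrors the test; the value 2 is an assumption matching the TinyLlama tokenizer used there, and batched prompts are left-padded with that id before generation.

import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')  # placeholder model path

prompts = ['table is made of', 'The Sun is yellow because']
# The pad token id is taken from the generation config, so it is set explicitly,
# exactly as test_multibatch does.
outputs = pipe.generate(prompts, max_new_tokens=20, pad_token_id=2)
for text in sorted(outputs):
    print(text)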