
Commit

map stop_criteria in pybind;
fix genai env manager on Win;
fix failing multibatch tests;
pavel-esir committed May 30, 2024
1 parent 9389930 commit 037c803
Showing 15 changed files with 364 additions and 186 deletions.
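The pybind mapping named in the commit title is among the 15 changed files but is not visible in this excerpt. A minimal sketch of what exposing the renamed enum with pybind11 could look like (the module name py_generate_pipeline and the binding layout are assumptions, not the commit's actual code):

    // Hypothetical binding sketch; the real binding file is not part of this excerpt.
    #include <pybind11/pybind11.h>
    #include "openvino/genai/generation_config.hpp"

    namespace py = pybind11;

    PYBIND11_MODULE(py_generate_pipeline, m) {
        py::enum_<ov::genai::StopCriteria>(m, "StopCriteria")
            .value("EARLY", ov::genai::StopCriteria::EARLY)
            .value("HEURISTIC", ov::genai::StopCriteria::HEURISTIC)
            .value("NEVER", ov::genai::StopCriteria::NEVER)
            .export_values();
    }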
16 changes: 8 additions & 8 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -16,11 +16,11 @@ namespace genai {

/**
* @brief controls the stopping condition for grouped beam search. The following values are possible:
* "early" stops as soon as there are `num_beams` complete candidates.
"heuristic" stops when is it unlikely to find better candidates.
"never" stops when there cannot be better candidates.
* "EARLY" stops as soon as there are `num_beams` complete candidates.
"HEURISTIC" stops when is it unlikely to find better candidates.
"NEVER" stops when there cannot be better candidates.
*/
enum class StopCriteria { early, heuristic, never };
enum class StopCriteria { EARLY, HEURISTIC, NEVER };

/**
* @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
@@ -50,9 +50,9 @@ enum class StopCriteria { early, heuristic, never };
* @param num_return_sequences the number of sequences to return for grouped beam search decoding.
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
* @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
* "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
* heuristic is applied and the generation stops when is it very unlikely to find better candidates;
* "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
* "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where an
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
*
* Random sampling parameters:
* @param temperature the value used to modulate token probabilities for random sampling.
@@ -78,7 +78,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
float length_penalty = 1.0f;
size_t num_return_sequences = 1;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
StopCriteria stop_criteria = StopCriteria::heuristic;
StopCriteria stop_criteria = StopCriteria::HEURISTIC;

// Multinomial
float temperature = 1.0f;
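For context, a minimal usage sketch of the renamed values through the public GenerationConfig shown above (pipeline construction, the model directory, and the prompt are illustrative, not taken from this commit):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        ov::genai::LLMPipeline pipe("model_dir", "CPU");       // hypothetical model directory
        ov::genai::GenerationConfig config = pipe.get_generation_config();
        config.max_new_tokens = 64;
        config.num_beams = 4;                                   // grouped beam search
        config.stop_criteria = ov::genai::StopCriteria::EARLY;  // stop once num_beams candidates are complete
        ov::genai::DecodedResults results = pipe.generate("The Sun is yellow because", config);
        for (const std::string& text : results.texts)          // decoded candidate sequences
            std::cout << text << std::endl;
        return 0;
    }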
6 changes: 3 additions & 3 deletions src/cpp/src/generation_config.cpp
@@ -44,11 +44,11 @@ GenerationConfig::GenerationConfig(std::string json_path) {
if (data.contains("early_stopping")) {
auto field_type = data["early_stopping"].type();
if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") {
stop_criteria = StopCriteria::never;
stop_criteria = StopCriteria::NEVER;
} else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) {
stop_criteria = StopCriteria::early;
stop_criteria = StopCriteria::EARLY;
} else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) {
stop_criteria = StopCriteria::heuristic;
stop_criteria = StopCriteria::HEURISTIC;
}
}

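The block above mirrors Hugging Face's early_stopping field. A short sketch of the three cases, using the JSON-path constructor shown in this hunk (the file path is illustrative):

    // "early_stopping": true    -> StopCriteria::EARLY
    // "early_stopping": false   -> StopCriteria::HEURISTIC
    // "early_stopping": "never" -> StopCriteria::NEVER
    ov::genai::GenerationConfig config("model_dir/generation_config.json");  // hypothetical path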
3 changes: 1 addition & 2 deletions src/cpp/src/greedy_decoding.cpp
@@ -63,8 +63,6 @@ EncodedResults greedy_decoding(
auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>();
std::iota(beam_data, beam_data + running_batch_size, 0);

size_t max_tokens = generation_config.get_max_new_tokens(prompt_len);

m_model_runner.infer();
auto logits = m_model_runner.get_tensor("logits");
ov::Shape logits_shape = logits.get_shape();
@@ -88,6 +86,7 @@
if (!generation_config.ignore_eos && all_are_eos)
return results;

size_t max_tokens = generation_config.get_max_new_tokens(prompt_len);
for (size_t i = 0; i < max_tokens - 1; ++i) {
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
8 changes: 4 additions & 4 deletions src/cpp/src/group_beam_searcher.cpp
@@ -91,7 +91,7 @@ struct Parameters {
size_t group_size = 5;
float diversity_penalty = 1.0;
size_t max_new_tokens = 20;
ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic;
ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC;
float length_penalty = 1.0;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();

@@ -128,15 +128,15 @@ struct Group {
float best_sum_logprobs = ongoing.front().score;
float worst_score = min_heap.front().score;
switch (parameters.stop_criteria) {
case ov::genai::StopCriteria::early:
case ov::genai::StopCriteria::EARLY:
done = true;
return;
case ov::genai::StopCriteria::heuristic: {
case ov::genai::StopCriteria::HEURISTIC: {
float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
done = worst_score >= highest_attainable_score;
return;
}
case ov::genai::StopCriteria::never: {
case ov::genai::StopCriteria::NEVER: {
size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
done = worst_score >= highest_attainable_score;
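To make the two bounds above concrete, a small standalone numeric sketch (values are made up for illustration; this is not code from the commit):

    #include <cmath>
    #include <cstdio>

    int main() {
        float best_sum_logprobs = -6.0f;  // score of the best ongoing beam (log-probabilities are negative)
        float length_penalty = 1.0f;
        size_t cur_len = 12;              // current sequence length
        size_t max_new_tokens = 20;

        // HEURISTIC: optimistic bound assuming the best beam stops at the current length
        float heuristic_bound = best_sum_logprobs / std::pow(float(cur_len), length_penalty);    // -0.5
        // NEVER (with length_penalty > 0): bound assumes the best beam keeps growing to max_new_tokens
        float never_bound = best_sum_logprobs / std::pow(float(max_new_tokens), length_penalty); // -0.3
        std::printf("heuristic bound %.2f, never bound %.2f\n", heuristic_bound, never_bound);
        // The NEVER bound is higher, so worst_score >= bound is harder to satisfy
        // and the group keeps searching longer before it is marked done.
        return 0;
    }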
61 changes: 4 additions & 57 deletions src/cpp/src/llm_pipeline.cpp
@@ -15,33 +15,6 @@
#include "utils.hpp"
#include "text_callback_streamer.hpp"

#ifdef _WIN32
# include <windows.h>
# define MAX_ABS_PATH _MAX_PATH
# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH)
#else
# include <dlfcn.h>
# include <limits.h>
# define MAX_ABS_PATH PATH_MAX
# define get_absolute_path(result, path) realpath(path.c_str(), result)
namespace {
std::string get_absolute_file_path(const std::string& path) {
std::string absolutePath;
absolutePath.resize(MAX_ABS_PATH);
std::ignore = get_absolute_path(&absolutePath[0], path);
if (!absolutePath.empty()) {
// on Linux if file does not exist or no access, function will return NULL, but
// `absolutePath` will contain resolved path
absolutePath.resize(absolutePath.find('\0'));
return std::string(absolutePath);
}
std::stringstream ss;
ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno);
throw std::runtime_error(ss.str());
}
}
#endif

namespace {

const std::string STREAMER_ARG_NAME = "streamer";
@@ -86,30 +59,6 @@ std::string from_tokenizer_json_if_exists(const std::string& path) {
return res;
}



std::string get_ov_genai_library_path() {
#ifdef _WIN32
CHAR genai_library_path[MAX_PATH];
HMODULE hm = NULL;
if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
reinterpret_cast<LPSTR>(get_ov_genai_library_path),
&hm)) {
std::stringstream ss;
ss << "GetModuleHandle returned " << GetLastError();
throw std::runtime_error(ss.str());
}
GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path));
return std::string(genai_library_path);
#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__)
Dl_info info;
dladdr(reinterpret_cast<void*>(get_ov_genai_library_path), &info);
return get_absolute_file_path(info.dli_fname).c_str();
#else
# error "Unsupported OS"
#endif // _WIN32
}

ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) {
ov::genai::StreamerVariant streamer = std::monostate();

@@ -194,6 +143,8 @@ class LLMPipeline::LLMPipelineImpl {
) {
GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;

auto old_pad_token_id = m_tokenizer.get_pad_token_id();
m_tokenizer.set_pad_token_id(config.pad_token_id);

EncodedInputs encoded_input;
if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
@@ -214,7 +165,6 @@ class LLMPipeline::LLMPipelineImpl {
auto input_ids = res.input_ids;
auto attention_mask = res.attention_mask;


// todo: W/A If a sentence begins with a special token (<bos>, <s>, etc.) openvino_tokenizer inserts 2 extra special tokens <bos> and "▁",
// but HF does not do that. Moreover openvino_tokenizer always inserts <bos> but in chat scenario HF does not do that because skip_special_tokens=True.
// Need to remove both of that tokens manually to get exact token by token alignment with HF
@@ -243,6 +193,7 @@ class LLMPipeline::LLMPipelineImpl {

encoded_input = TokenizedInputs{input_ids, attention_mask};
}
m_tokenizer.set_pad_token_id(old_pad_token_id);
auto encoded_results = generate(encoded_input, config, streamer);
return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
}
@@ -285,8 +236,6 @@ class LLMPipeline::LLMPipelineImpl {
OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding");
}

// auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids);

if (config.is_greedy_decoding()) {
result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr, is_chat_conversation);
} else if (config.is_beam_search()) {
@@ -431,14 +380,12 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(
const ov::AnyMap& config
):
m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()},
m_tokenizer(path, device),
m_generation_config{from_config_json_if_exists(path)},
m_chat_template{from_tokenizer_json_if_exists(path)}
{
ov::genai::utils::GenAIEnvManager env_manager(get_ov_genai_library_path());
m_tokenizer = Tokenizer(path, device);
}


ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
return m_pimpl->m_generation_config;
}
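Note on the pad token handling added in this file: the old id is saved and restored manually around tokenization. A sketch of an exception-safe alternative using an RAII guard (the guard type is hypothetical; the Tokenizer getter/setter are the ones used in the hunk above):

    // Hypothetical helper: restores the previous pad token id even if tokenization throws.
    class PadTokenGuard {
    public:
        PadTokenGuard(ov::genai::Tokenizer& tokenizer, int64_t pad_token_id)
            : m_tokenizer(tokenizer), m_old_id(tokenizer.get_pad_token_id()) {
            m_tokenizer.set_pad_token_id(pad_token_id);
        }
        ~PadTokenGuard() { m_tokenizer.set_pad_token_id(m_old_id); }
    private:
        ov::genai::Tokenizer& m_tokenizer;
        int64_t m_old_id;
    };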
88 changes: 88 additions & 0 deletions src/cpp/src/streamer_base.cpp
@@ -0,0 +1,88 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

// class LambdaStreamer::LambdaStreamerImpl {
// public:
// LambdaStreamerImpl(Tokenizer tokenizer, std::function<bool(std::string)> func): m_tokenizer(tokenizer), m_func(func) {}
// LambdaStreamerImpl(std::function<bool(std::string)> func): m_func(func) {}

// Tokenizer m_tokenizer;
// std::function<bool(std::string)> m_func;
// bool m_print_eos_token = false;
// std::vector<int64_t> m_tokens_cache;
// size_t print_len = 0;

// bool put(int64_t token) {
// std::stringstream res;
// // do nothing if <eos> token is met and if print_eos_token=false
// if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id())
// return m_func(res.str());

// m_tokens_cache.push_back(token);
// std::string text = m_tokenizer.decode(m_tokens_cache);
// if (!text.empty() && '\n' == text.back()) {
// // Flush the cache after the new line symbol
// res << std::string_view{text.data() + print_len, text.size() - print_len};
// m_tokens_cache.clear();
// print_len = 0;
// return m_func(res.str());
// }
// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
// // Don't print incomplete text
// return m_func(res.str());
// }
// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
// print_len = text.size();
// return m_func(res.str());
// }

// bool end() {
// std::stringstream res;
// std::string text = m_tokenizer.decode(m_tokens_cache);
// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
// m_tokens_cache.clear();
// print_len = 0;
// return m_func(res.str());
// }

// };

// LambdaStreamer::LambdaStreamer(Tokenizer tokenizer, std::function<bool(std::string)> func) {}

// LambdaStreamer::LambdaStreamer(std::function<bool(std::string)> func) {
// m_pimpl = std::make_shared<LambdaStreamer::LambdaStreamerImpl>(func);
// }

// void LambdaStreamer::put(int64_t token) { m_pimpl -> put(token);}

// void LambdaStreamer::end() { m_pimpl->end();}

} // namespace genai
} // namespace ov



// class LambdaStreamer: public StreamerBase {
// public:
// // LambdaStreamer(Tokenizer tokenizer, std::function<bool(std::string)> func);
// LambdaStreamer(std::function<bool(std::string)> func);

// void put(int64_t token) override;
// void end() override;

// bool operator==(const LambdaStreamer& other) const {
// // For simplicity, we assume lambdas are not comparable.
// // If you need to compare actual logic, you may need to use type erasure or another method.
// return false; // This can be changed based on your specific needs.
// }
// private:

// class LambdaStreamerImpl;
// std::shared_ptr<LambdaStreamerImpl> m_pimpl;
// };
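The LambdaStreamer above is committed as commented-out scaffolding only. For reference, a minimal sketch of a callback streamer implementing the put/end interface used in this file (assuming StreamerBase only requires those two overrides, as the commented code suggests; decoding is simplified to one token at a time, so multi-token characters may print incompletely, unlike LambdaStreamerImpl's token cache):

    #include <functional>
    #include <string>
    #include <vector>
    #include "openvino/genai/streamer_base.hpp"
    #include "openvino/genai/tokenizer.hpp"

    class CallbackStreamer : public ov::genai::StreamerBase {
    public:
        CallbackStreamer(ov::genai::Tokenizer tokenizer, std::function<bool(std::string)> func)
            : m_tokenizer(tokenizer), m_func(func) {}

        // Forward the decoded text of each generated token to the user callback.
        void put(int64_t token) override {
            m_func(m_tokenizer.decode(std::vector<int64_t>{token}));
        }

        // Signal the end of generation.
        void end() override {
            m_func("\n");
        }

    private:
        ov::genai::Tokenizer m_tokenizer;
        std::function<bool(std::string)> m_func;
    };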