From 037c803786bc53501d9b3b1996a281c0bf7cdbeb Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 30 May 2024 06:44:36 +0200
Subject: [PATCH] map stop_criteria in pybind; fix genai env manager on Win;
 fix failing multibatch tests;

---
 .../openvino/genai/generation_config.hpp |  16 +-
 src/cpp/src/generation_config.cpp        |   6 +-
 src/cpp/src/greedy_decoding.cpp          |   3 +-
 src/cpp/src/group_beam_searcher.cpp      |   8 +-
 src/cpp/src/llm_pipeline.cpp             |  61 +-----
 src/cpp/src/streamer_base.cpp            |  88 ++++++++
 src/cpp/src/tokenizer.cpp                |  72 +++++--
 src/cpp/src/utils.cpp                    |   1 -
 src/python/CMakeLists.txt                |   4 +-
 src/python/openvino_genai/__init__.py    |  12 +-
 src/python/py_generate_pipeline.cpp      | 189 +++++++++++++-----
 tests/python_tests/conftest.py           |   8 +
 tests/python_tests/generate_api_check.py |  25 ---
 tests/python_tests/list_test_models.py   |   6 +-
 tests/python_tests/test_generate_api.py  |  51 +++--
 15 files changed, 364 insertions(+), 186 deletions(-)
 create mode 100644 src/cpp/src/streamer_base.cpp
 create mode 100644 tests/python_tests/conftest.py
 delete mode 100644 tests/python_tests/generate_api_check.py

diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 82a450b619..fe78e0270b 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -16,11 +16,11 @@ namespace genai {
 /**
  * @brief controls the stopping condition for grouped beam search. The following values are possible:
- *        "early" stops as soon as there are `num_beams` complete candidates.
-          "heuristic" stops when is it unlikely to find better candidates.
-          "never" stops when there cannot be better candidates.
+ *        "EARLY" stops as soon as there are `num_beams` complete candidates.
+          "HEURISTIC" stops when it is unlikely to find better candidates.
+          "NEVER" stops when there cannot be better candidates.
  */
-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria { EARLY, HEURISTIC, NEVER };
 
 /**
  * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
  * @param num_return_sequences the number of sequences to return for grouped beam search decoding.
  * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
  * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
- *        "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
- *        heuristic is applied and the generation stops when is it very unlikely to find better candidates;
- *        "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+ *        "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+ *        heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  *
  * Random sampling parameters:
  * @param temperature the value used to modulate token probabilities for random sampling.
@@ -78,7 +78,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { float length_penalty = 1.0f; size_t num_return_sequences = 1; size_t no_repeat_ngram_size = std::numeric_limits::max(); - StopCriteria stop_criteria = StopCriteria::heuristic; + StopCriteria stop_criteria = StopCriteria::HEURISTIC; // Multinomial float temperature = 1.0f; diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 5569a759b0..f07ad13ea4 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -44,11 +44,11 @@ GenerationConfig::GenerationConfig(std::string json_path) { if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { - stop_criteria = StopCriteria::never; + stop_criteria = StopCriteria::NEVER; } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { - stop_criteria = StopCriteria::early; + stop_criteria = StopCriteria::EARLY; } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { - stop_criteria = StopCriteria::heuristic; + stop_criteria = StopCriteria::HEURISTIC; } } diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index be3e04d337..48cedf09f0 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -63,8 +63,6 @@ EncodedResults greedy_decoding( auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + running_batch_size, 0); - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); ov::Shape logits_shape = logits.get_shape(); @@ -88,6 +86,7 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); for (size_t i = 0; i < max_tokens - 1; ++i) { utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 5c852298d0..0288b255d9 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -91,7 +91,7 @@ struct Parameters { size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -128,15 +128,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case ov::genai::StopCriteria::early: + case ov::genai::StopCriteria::EARLY: done = true; return; - case ov::genai::StopCriteria::heuristic: { + case ov::genai::StopCriteria::HEURISTIC: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case ov::genai::StopCriteria::never: { + case ov::genai::StopCriteria::NEVER: { size_t length = parameters.length_penalty > 0.0 ? 
parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index cdd1d4f67f..10b7da499a 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -15,33 +15,6 @@ #include "utils.hpp" #include "text_callback_streamer.hpp" -#ifdef _WIN32 -# include -# define MAX_ABS_PATH _MAX_PATH -# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) -#else -# include -# include -# define MAX_ABS_PATH PATH_MAX -# define get_absolute_path(result, path) realpath(path.c_str(), result) -namespace { -std::string get_absolute_file_path(const std::string& path) { - std::string absolutePath; - absolutePath.resize(MAX_ABS_PATH); - std::ignore = get_absolute_path(&absolutePath[0], path); - if (!absolutePath.empty()) { - // on Linux if file does not exist or no access, function will return NULL, but - // `absolutePath` will contain resolved path - absolutePath.resize(absolutePath.find('\0')); - return std::string(absolutePath); - } - std::stringstream ss; - ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); - throw std::runtime_error(ss.str()); -} -} -#endif - namespace { const std::string STREAMER_ARG_NAME = "streamer"; @@ -86,30 +59,6 @@ std::string from_tokenizer_json_if_exists(const std::string& path) { return res; } - - -std::string get_ov_genai_library_path() { -#ifdef _WIN32 - CHAR genai_library_path[MAX_PATH]; - HMODULE hm = NULL; - if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(get_ov_genai_library_path), - &hm)) { - std::stringstream ss; - ss << "GetModuleHandle returned " << GetLastError(); - throw std::runtime_error(ss.str()); - } - GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); - return std::string(genai_library_path); -#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) - Dl_info info; - dladdr(reinterpret_cast(get_ov_genai_library_path), &info); - return get_absolute_file_path(info.dli_fname).c_str(); -#else -# error "Unsupported OS" -#endif // _WIN32 -} - ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { ov::genai::StreamerVariant streamer = std::monostate(); @@ -194,6 +143,8 @@ class LLMPipeline::LLMPipelineImpl { ) { GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + auto old_pad_token_id = m_tokenizer.get_pad_token_id(); + m_tokenizer.set_pad_token_id(config.pad_token_id); EncodedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { @@ -214,7 +165,6 @@ class LLMPipeline::LLMPipelineImpl { auto input_ids = res.input_ids; auto attention_mask = res.attention_mask; - // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. 
// Need to remove both of that tokens manually to get exact token by token alignment with HF @@ -243,6 +193,7 @@ class LLMPipeline::LLMPipelineImpl { encoded_input = TokenizedInputs{input_ids, attention_mask}; } + m_tokenizer.set_pad_token_id(old_pad_token_id); auto encoded_results = generate(encoded_input, config, streamer); return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; } @@ -285,8 +236,6 @@ class LLMPipeline::LLMPipelineImpl { OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding"); } - // auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids); - if (config.is_greedy_decoding()) { result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr, is_chat_conversation); } else if (config.is_beam_search()) { @@ -431,14 +380,12 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const ov::AnyMap& config ): m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer(path, device), m_generation_config{from_config_json_if_exists(path)}, m_chat_template{from_tokenizer_json_if_exists(path)} { - ov::genai::utils::GenAIEnvManager env_manager(get_ov_genai_library_path()); - m_tokenizer = Tokenizer(path, device); } - ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; } diff --git a/src/cpp/src/streamer_base.cpp b/src/cpp/src/streamer_base.cpp new file mode 100644 index 0000000000..37bb20cfc2 --- /dev/null +++ b/src/cpp/src/streamer_base.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +// class LambdaStreamer::LambdaStreamerImpl { +// public: +// LambdaStreamerImpl(Tokenizer tokenizer, std::function func): m_tokenizer(tokenizer), m_func(func) {} +// LambdaStreamerImpl(std::function func): m_func(func) {} + +// Tokenizer m_tokenizer; +// std::function m_func; +// bool m_print_eos_token = false; +// std::vector m_tokens_cache; +// size_t print_len = 0; + +// bool put(int64_t token) { +// std::stringstream res; +// // do nothing if token is met and if print_eos_token=false +// if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id()) +// return m_func(res.str()); + +// m_tokens_cache.push_back(token); +// std::string text = m_tokenizer.decode(m_tokens_cache); +// if (!text.empty() && '\n' == text.back()) { +// // Flush the cache after the new line symbol +// res << std::string_view{text.data() + print_len, text.size() - print_len}; +// m_tokens_cache.clear(); +// print_len = 0; +// return m_func(res.str()); +// } +// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { +// // Don't print incomplete text +// return m_func(res.str()); +// } +// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// print_len = text.size(); +// return m_func(res.str()); +// } + +// bool end() { +// std::stringstream res; +// std::string text = m_tokenizer.decode(m_tokens_cache); +// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// m_tokens_cache.clear(); +// print_len = 0; +// return m_func(res.str()); +// } + +// }; + +// LambdaStreamer::LambdaStreamer(Tokenizer tokenizer, std::function func) {} + +// 
LambdaStreamer::LambdaStreamer(std::function func) { +// m_pimpl = std::make_shared(func); +// } + +// void LambdaStreamer::put(int64_t token) { m_pimpl -> put(token);} + +// void LambdaStreamer::end() { m_pimpl->end();} + +} // namespace genai +} // namespace ov + + + +// class LambdaStreamer: public StreamerBase { +// public: +// // LambdaStreamer(Tokenizer tokenizer, std::function func); +// LambdaStreamer(std::function func); + +// void put(int64_t token) override; +// void end() override; + +// bool operator==(const LambdaStreamer& other) const { +// // For simplicity, we assume lambdas are not comparable. +// // If you need to compare actual logic, you may need to use type erasure or another method. +// return false; // This can be changed based on your specific needs. +// } +// private: + +// class LambdaStreamerImpl; +// std::shared_ptr m_pimpl; +// }; \ No newline at end of file diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 05e0c0d5db..4b6cd4e8ac 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -4,12 +4,11 @@ #include #include "openvino/genai/tokenizer.hpp" #include "utils.hpp" -#include namespace { // todo: remove when openvino-tokenizers will support left padding -ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { +ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token_id) { const size_t batch_size = input_ids.get_shape()[0]; const size_t sequence_length = input_ids.get_shape()[1]; int64_t* inputs_data = input_ids.data(); @@ -19,14 +18,14 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attenti const size_t batch_offset = batch * sequence_length; // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != pad_token) + if (inputs_data[batch_offset + sequence_length - 1] != pad_token_id) continue; size_t pad_tokens_number = 0; for (int i = sequence_length - 1; i >= 0; i--) { const size_t token_offset = batch_offset + i; - if (inputs_data[token_offset] == pad_token) + if (inputs_data[token_offset] == pad_token_id) continue; if (pad_tokens_number == 0) @@ -40,19 +39,67 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, ov::Tensor&& attenti return {input_ids, attention_mask}; } -std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { #ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; -#elif __linux__ - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; -#elif __APPLE__ - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; +# include +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include +# include +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) + +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} 
#endif - return path.parent_path() / tokenizers; + +std::string get_ov_genai_library_path() { + #ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(get_ov_genai_library_path), + &hm)) { + std::stringstream ss; + ss << "GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); + #elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast(get_ov_genai_library_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); + #else + # error "Unsupported OS" + #endif // _WIN32 } +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { + #ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; + #elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; + #elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; + #endif + return path.parent_path() / tokenizers; } +} // namespace + namespace ov { namespace genai { @@ -73,7 +120,7 @@ class Tokenizer::TokenizerImpl { const char* ov_tokenizers_path = getenv(ov::genai::utils::get_tokenizers_env_name()); if (ov_tokenizers_path) { - core.add_extension(with_openvino_tokenizers(ov_tokenizers_path)); + core.add_extension(ov_tokenizers_path); } else { OPENVINO_THROW("openvino_tokenizers path is not set"); } @@ -158,6 +205,7 @@ class Tokenizer::TokenizerImpl { }; Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { + ov::genai::utils::GenAIEnvManager env_manager(with_openvino_tokenizers(get_ov_genai_library_path())); m_pimpl = std::make_shared(tokenizers_path, device); } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dc123d04c4..7dac6571dc 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -62,7 +62,6 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti for (size_t i = 0; i < seq_length; i++) { const size_t element_offset = batch * seq_length + i; position_ids_data[element_offset] = sum; - // sum += 1; if (attention_mask_data[element_offset] == 1) { sum += 1; } diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 7802561837..e53ba6ca02 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -15,9 +15,7 @@ if(NOT pybind11_POPULATED) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) endif() -# to be able to use utils.hpp in pybind -include_directories(${CMAKE_SOURCE_DIR}/src/cpp/src/) -pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp ${CMAKE_SOURCE_DIR}/src/cpp/src/utils.cpp) +pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE openvino::genai nlohmann_json::nlohmann_json) set_target_properties(py_generate_pipeline PROPERTIES LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index f23e447d5f..1e3f0b393c 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -9,6 +9,14 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) -from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, 
DecodedResults, EncodedResults, StreamerBase +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults, StreamerBase, StopCriteria -__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults', 'StreamerBase'] +__all__ = [ + 'LLMPipeline', + 'Tokenizer', + 'GenerationConfig', + 'DecodedResults', + 'EncodedResults', + 'StreamerBase', + 'StopCriteria' +] diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 7099b11548..3b9b80897f 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -6,7 +6,6 @@ #include #include #include "openvino/genai/llm_pipeline.hpp" -#include "utils.hpp" #ifdef _WIN32 # include @@ -35,6 +34,47 @@ std::string get_absolute_file_path(const std::string& path) { } #endif +namespace { + +// dublicates GenAIEnvManager from ov::genai::utils, since +// it was problematic getting access to that on Win + +const char* get_tokenizers_env_name() { return "OPENVINO_TOKENIZERS_PATH_GENAI"; } + +class GenAIEnvManager { +public: + GenAIEnvManager(const std::string& path) { + #ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ::get_tokenizers_env_name()); + if (value == nullptr) + _putenv_s(::get_tokenizers_env_name(), path.c_str()); + #else + if (!getenv(::get_tokenizers_env_name())) + setenv(::get_tokenizers_env_name(), path.c_str(), 1); + #endif + else + was_already_set = true; + } + + ~GenAIEnvManager() { + if (!was_already_set){ + #ifdef _WIN32 + _putenv_s(::get_tokenizers_env_name(), ""); + #else + unsetenv(::get_tokenizers_env_name()); + #endif + } + } + +private: + bool was_already_set; +}; + +} + + namespace py = pybind11; using ov::genai::LLMPipeline; using ov::genai::Tokenizer; @@ -44,24 +84,9 @@ using ov::genai::DecodedResults; using ov::genai::StopCriteria; using ov::genai::StreamerBase; using ov::genai::StreamerVariant; +using ov::genai::OptionalGenerationConfig; namespace { -void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ - if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; - else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; - else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; - else OPENVINO_THROW(stop_criteria_str + " is incorrect value of stop_criteria. " - "Allowed values are: \"early\", \"never\", \"heuristic\". 
"); -} - -std::string stop_criteria_to_str(const GenerationConfig& config) { - switch (config.stop_criteria) { - case StopCriteria::early: return "early"; - case StopCriteria::heuristic: return "heuristic"; - case StopCriteria::never: return "never"; - default: throw std::runtime_error("Incorrect stop_criteria"); - } -} void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); @@ -73,7 +98,7 @@ void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwarg if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); if (kwargs.contains("num_return_sequences")) config.num_return_sequences = kwargs["num_return_sequences"].cast(); if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); - if (kwargs.contains("stop_criteria")) str_to_stop_criteria(config, kwargs["stop_criteria"].cast()); + if (kwargs.contains("stop_criteria")) config.stop_criteria = kwargs["stop_criteria"].cast(); if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); @@ -152,6 +177,7 @@ std::string ov_tokenizers_module_path() { } return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } + class EmptyStreamer: public StreamerBase { // It's impossible to create an instance of pure virtual class. Define EmptyStreamer instead. void put(int64_t token) override { @@ -166,35 +192,101 @@ class EmptyStreamer: public StreamerBase { PYBIND11_OVERRIDE_PURE(void, StreamerBase, end); } }; + +ov::InferRequest& get_request_from_pyobj(py::object obj) { + py::str obj_type = py::str(obj.get_type()); + // todo: InferRequest is not accessible from the outside. + // obj_type is openvino._pyopenvino.InferRequest, + // which is a pybind binding to InferRequestWrapper (InferRequest is in a m_request field of the latest) + // and the definition of InferRequestWrapper is not accessible from the outside. + + if (py::isinstance(obj)) { + // Directly return the casted object without copying + return obj.cast(); + } else { + throw std::invalid_argument("Provided object is not castable to ov::InferRequest"); + } } +} // namespace + + PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline") - .def(py::init(), - py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", - py::arg("plugin_config") = ov::AnyMap{}) .def(py::init([](const std::string& model_path, - const std::string& device, - const ov::AnyMap& plugin_config) { - ov::genai::utils::GenAIEnvManager env_manager(ov_tokenizers_module_path()); - return std::make_unique(model_path, device, plugin_config);}), + const std::string& device) { + ::GenAIEnvManager env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, device);}), py::arg("model_path"), "path to the model path", py::arg("device") = "CPU", "device on which inference will be done", - py::arg("plugin_config") = ov::AnyMap(), - "LLMPipeline class constructor.\n" - " model_path (str): Path to the model file.\n" - " device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.\n" - " plugin_config (ov::AnyMap): Plugin configuration settings. 
Default is an empty.")
-
-    .def("__call__", py::overload_cast(&call_with_kwargs))
-    .def("__call__", py::overload_cast(&call_with_config))
-
+        R"(
+            LLMPipeline class constructor.
+            model_path (str): Path to the model file.
+            device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
+        )")
+
+        .def(py::init(),
+        py::arg("model_path"),
+        py::arg("tokenizer"),
+        py::arg("device") = "CPU",
+        R"(
+            LLMPipeline class constructor for a manually created openvino_genai.Tokenizer.
+            model_path (str): Path to the model file.
+            tokenizer (openvino_genai.Tokenizer): tokenizer object.
+            device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
+        )")
+
+        .def(py::init([](py::object infer_request,
+                         const Tokenizer& tokenizer,
+                         OptionalGenerationConfig config) {
+            ::GenAIEnvManager env_manager(ov_tokenizers_module_path());
+            return std::make_unique(get_request_from_pyobj(infer_request), tokenizer, config);
+        }),
+        py::arg("infer_request"), "infer_request",
+        py::arg("tokenizer"), "openvino_genai.Tokenizer object",
+        py::arg("config"), "openvino_genai.GenerationConfig object")
+        .def("generate", py::overload_cast(&call_with_kwargs),
+        R"(
+            max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
+                `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
+            max_new_tokens: the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            ignore_eos: if set to true, then generation will not stop even if <eos> token is met.
+            pad_token_id: token_id of <pad> (padding)
+            bos_token_id: token_id of <bos> (beginning of sentence)
+            eos_token_id: token_id of <eos> (end of sentence)
+            bos_token: <bos> token string representation
+            eos_token: <eos> token string representation
+
+            Beam search specific parameters:
+            num_beams: number of beams for beam search. 1 disables beam search.
+            num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time.
+            length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+                the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+                likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+                `length_penalty` < 0.0 encourages shorter sequences.
+            num_return_sequences: the number of sequences to return for grouped beam search decoding.
+            no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
+            stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
+                "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+                heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+                "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+
+            Random sampling parameters:
+            temperature: the value used to modulate token probabilities for random sampling.
+            top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+            top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+            do_sample: whether or not to use multinomial random sampling.
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+        )")
         .def("generate", py::overload_cast&, const py::kwargs&>(&call_with_kwargs))
         .def("generate", py::overload_cast&, const GenerationConfig&, const StreamerVariant&>(&call_with_config))
-        .def("generate", py::overload_cast(&call_with_kwargs))
         .def("generate", py::overload_cast(&call_with_config))
+
+        .def("__call__", py::overload_cast(&call_with_kwargs))
+        .def("__call__", py::overload_cast(&call_with_config))
 
         // todo: if input_ids is a ov::Tensor/numpy tensor
 
@@ -206,19 +298,24 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         .def("apply_chat_template", &LLMPipeline::apply_chat_template);
 
     // Binding for Tokenizer
-    py::class_(m, "Tokenizer")
+    py::class_(m, "Tokenizer",
+        R"(openvino_genai.Tokenizer object is used to initialize the tokenizer if it's located in a different path
+        than the main model.)")
         .def(py::init<>())
         .def(py::init(),
-             py::arg("tokenizers_path"),
-             py::arg("device") = "CPU")
+            py::arg("tokenizers_path"),
+            py::arg("device") = "CPU");
 
-        // todo: implement encode/decode when for numpy inputs and outputs
-        .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt")
-        // TODO: common.h(1106...) template argument deduction/substitution failed:
-        // .def("encode", py::overload_cast&>(&Tokenizer::encode), "Encode multiple prompts")
-        .def("decode", py::overload_cast>(&Tokenizer::decode), "Decode a list of tokens")
-        .def("decode", py::overload_cast(&Tokenizer::decode), "Decode a tensor of tokens")
-        .def("decode", py::overload_cast>>(&Tokenizer::decode), "Decode multiple lines of tokens");
+    // Binding for StopCriteria
+    py::enum_(m, "StopCriteria",
+        R"(StopCriteria controls the stopping condition for grouped beam search. The following values are possible:
+           "EARLY" stops as soon as there are `num_beams` complete candidates.
+           "HEURISTIC" stops when it is unlikely to find better candidates.
+ "NEVER" stops when there cannot be better candidates.)") + .value("EARLY", StopCriteria::EARLY) + .value("HEURISTIC", StopCriteria::HEURISTIC) + .value("NEVER", StopCriteria::NEVER) + .export_values(); // Binding for GenerationConfig py::class_(m, "GenerationConfig") @@ -233,7 +330,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("length_penalty", &GenerationConfig::length_penalty) .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) - .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) + .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) .def_readwrite("temperature", &GenerationConfig::temperature) .def_readwrite("top_p", &GenerationConfig::top_p) .def_readwrite("top_k", &GenerationConfig::top_k) diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py new file mode 100644 index 0000000000..b990d96428 --- /dev/null +++ b/tests/python_tests/conftest.py @@ -0,0 +1,8 @@ +def pytest_make_parametrize_id(config, val, argname): + if argname in ['prompt', 'promtps']: + return f'{val}' + if argname in 'stop_criteria': + return str(val) + if isinstance(val, (int, float, str)): + return f'{argname}={val}' + return None diff --git a/tests/python_tests/generate_api_check.py b/tests/python_tests/generate_api_check.py deleted file mode 100644 index ad0851fea2..0000000000 --- a/tests/python_tests/generate_api_check.py +++ /dev/null @@ -1,25 +0,0 @@ -import openvino_genai as ov_genai -model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' -path = '/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0' -device = 'CPU' -pipe = ov_genai.LLMPipeline(path, device) - -from transformers import AutoTokenizer, AutoModelForCausalLM -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_pretrained(model_id) - -prompt = 'table is made of' -generation_config = {'max_new_tokens': 10} - -encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) -hf_encoded_output = model.generate(encoded_prompt, **generation_config) -hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) - - - -import os -build_dir = os.getenv('GENAI_BUILD_DIR', 'build') -ov_tokenizers_path = f'{build_dir}/openvino_tokenizers/src/' -# pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path) - -ov_output = pipe.generate(prompt, **generation_config) diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py index a67f47c6df..a9454fc211 100644 --- a/tests/python_tests/list_test_models.py +++ b/tests/python_tests/list_test_models.py @@ -1,15 +1,15 @@ def models_list(): model_ids = [ ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), - # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # no free disk space lefton CI machine - # ("microsoft/phi-1_5", "phi-1_5/"), + ("microsoft/phi-1_5", "phi-1_5/"), + # ("google/gemma-2b-it", "gemma-2b-it"), # ("google/gemma-7b-it", "gemma-7b-it"), # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"), # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"), # ("openlm-research/open_llama_3b", "open_llama_3b"), # ("openlm-research/open_llama_7b", "open_llama_7b"), - # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # Please check that openvino_tokenizer.xml and openvino_detokenizer.xml exist + # ("databricks/dolly-v2-3b", "dolly-v2-3b"), # ("databricks/dolly-v2-12b", "dolly-v2-12b"), 
] import os diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 5e2029ea84..d000891c3e 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -2,11 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai +from openvino_genai import StopCriteria import pytest from list_test_models import models_list from typing import Union, List, Dict -@pytest.fixture(scope="module", params=models_list()) + +@pytest.fixture(scope="module", params=models_list(), + ids=lambda param: param[0].split('/', 1)[1] if '/' in param[0] else param[0]) def model_fixture(request): model_id, path = request.param from transformers import AutoTokenizer, AutoModelForCausalLM @@ -19,6 +22,7 @@ def model_fixture(request): del model gc.collect() + def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, prompts: Union[str, List[str]]): model_id, path, tokenizer, model = model_fixture device = 'CPU' @@ -63,11 +67,11 @@ def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, p ov_outputs.sort() for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): if hf_output != ov_output: - print(f'Prompt {i}:') print(f'hf_output: {hf_output}') print(f'ov_output: {ov_output}') assert hf_output == ov_output + def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): device = 'CPU' model_id, path, tokenizer, model = model_fixture @@ -81,9 +85,6 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): config['do_sample'] = False generation_config_hf = config.copy() - # in OpenVINO GenAI this parameter is called stop_criteria, - # while in HF it's called early_stopping. - # HF values True, False and "never" correspond to OV GenAI values "early", "heuristic" and "never" if generation_config_hf.get('stop_criteria'): generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] @@ -99,7 +100,6 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): ov_output = ov_output[0] if hf_output != ov_output: - print(f'Prompt {i}:') print(f'hf_output: {hf_output}') print(f'ov_output: {ov_output}') @@ -107,10 +107,18 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): def stop_criteria_map(): - return {"never": "never", "early": True, "heuristic": False} + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + return { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + test_cases = [ - (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt + (dict(max_new_tokens=20), 'table is made of'), # generation_config, prompt (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), @@ -121,23 +129,25 @@ def stop_criteria_map(): def test_decoding(model_fixture, generation_config, prompt): run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + test_configs = [ - dict(max_new_tokens=20, do_sample=False), - dict(num_beam_groups=3, num_beams=15, max_new_tokens=20, diversity_penalty=1.0) + dict(max_new_tokens=20), + dict( max_new_tokens=20, num_beam_groups=3, num_beams=15,diversity_penalty=1.0) ] -batched_prompts = [['table is made of', 'They sky is blue because', 'Difference between Jupiter and Marks is that'] - ,['hello', 'Here is the longest nowel ever: ']] +batched_prompts = [['table is made of', 'They sky is blue because', 'Difference between Jupiter and Marks is that'], + ['hello', 'Here is the longest nowel ever: ']] @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) def test_multibatch(model_fixture, generation_config, prompts): + generation_config['pad_token_id'] = 2 run_hf_ov_genai_comparison_batched(model_fixture, generation_config, prompts) -prompts = ['The Sun is yellow because', 'Alan Turing was a', 'table is made of'] +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Marks is that', 'table is made of'] @pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) @pytest.mark.parametrize("group_size", [5, 3, 10]) @pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5]) +@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) @pytest.mark.parametrize("prompt", prompts) def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): @@ -151,12 +161,12 @@ def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) -@pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"]) +@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) @pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("max_new_tokens", [20, 40, 300]) +@pytest.mark.parametrize("max_new_tokens", [10, 80]) def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): - # todo: for long sentences early stop_criteria fails - if (stop_criteria == 'early' and max_new_tokens >= 300): + # todo: for long sentences EARLY stop_criteria fails + if (stop_criteria == StopCriteria.EARLY and max_new_tokens >= 300): pytest.skip() generation_config = dict( num_beam_groups=2, @@ -222,7 +232,8 @@ def __init__(self, tokenizer): super().__init__() self.tokenizer = tokenizer def put(self, token_id): - print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy 
to implement
+        # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement
+        print(token_id) # print only the token id because self.tokenizer.decode([token_id]) is not implemented yet
     def end(self):
         print('end')
 
@@ -262,7 +273,7 @@ def test_operator_wit_callback_one_string(model_fixture, callback):
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 def test_operator_wit_callback_batch_fail(model_fixture, callback):
     pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
-    with pytest.raises(RuntimeError):
+    with pytest.raises(Exception):
        pipe(['1', '2'], openvino_genai.GenerationConfig(), callback)
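
For context, a minimal sketch of how the StopCriteria enum bound above is meant to be used from Python. The model directory below is a placeholder for any exported OpenVINO model with its tokenizer, and the keyword arguments mirror the GenerationConfig fields described in the generate() docstring; this is an illustrative sketch, not code from the patch.

import openvino_genai as ov_genai
from openvino_genai import StopCriteria

pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')  # placeholder model path

# stop_criteria now takes the enum (EARLY / HEURISTIC / NEVER) instead of the
# former "early" / "heuristic" / "never" strings.
outputs = pipe.generate(
    'Alan Turing was a',
    num_beam_groups=2,
    num_beams=8,
    num_return_sequences=8,
    diversity_penalty=1.0,
    max_new_tokens=20,
    stop_criteria=StopCriteria.EARLY,
)
print(outputs)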
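
Likewise, a hedged sketch of the batched call that the updated test_multibatch exercises. Pinning pad_token_id in the generation config mirrors the test; the value 2 is an assumption matching the TinyLlama tokenizer used there, and batched prompts are left-padded with that id before generation.

import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')  # placeholder model path

prompts = ['table is made of', 'The Sun is yellow because']
# The pad token id is taken from the generation config, so it is set explicitly,
# exactly as test_multibatch does.
outputs = pipe.generate(prompts, max_new_tokens=20, pad_token_id=2)
for text in sorted(outputs):
    print(text)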