From d2ccfe071a654c64bba31de7496c15830801eb15 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Wed, 10 Jul 2024 18:55:30 +0400
Subject: [PATCH 01/28] Add ContinuousBatchingPipeline constructor similar to LLMPipeline

That allows LLMPipeline to create ContinuousBatchingPipeline as a backend. There's also a constructor accepting an ireq, which can be used if the model was already transformed appropriately for ContinuousBatchingPipeline. But that feels misleading, and it's simpler to just throw if such a constructor is called with the ContinuousBatchingPipeline backend.
---
 .github/workflows/causal_lm_cpp.yml | 2 +-
 .github/workflows/genai_python_lib.yml | 4 +--
 .../continuous_batching_accuracy.cpp | 4 ++-
 .../genai/continuous_batching_pipeline.hpp | 19 ++++++++++++-
 .../include/openvino/genai/llm_pipeline.hpp | 4 +--
 src/cpp/include/openvino/genai/tokenizer.hpp | 2 +-
 src/cpp/src/continuous_batching_pipeline.cpp | 27 +++++++++++++------
 7 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 80089a4e8..18cc89a8f 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -648,7 +648,7 @@ jobs:
 python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- cmake -DCMAKE_BUILD_TYPE=Releas -S ./ -B ./build/
+ cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
 cmake --build ./build/ --config Release -j - name: Run gtests run: |
diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml
index 640a293fa..e53c0d881 100644
--- a/.github/workflows/genai_python_lib.yml
+++ b/.github/workflows/genai_python_lib.yml
@@ -10,7 +10,7 @@ env:
 w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip jobs: ubuntu_genai_python_lib:
- # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env.
+ # A tokenizers' dependency fails to compile on ubuntu-20 in CenOS7 env.
 runs-on: ubuntu-22.04 env: # A tokenizers' dependency fails to compile with Ninja in CenOS7 env.
@@ -86,7 +86,7 @@ jobs:
 - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit continuous_batching_python_lib_ubuntu:
- # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env.
+ # A tokenizers' dependency fails to compile on ubuntu-20 in CenOS7 env.
 runs-on: ubuntu-22.04 env: # A tokenizers' dependency fails to compile with Ninja in CenOS7 env.
diff --git a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
index 6e0cb5034..77485e36d 100644
--- a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
+++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
@@ -78,7 +78,9 @@ int main(int argc, char* argv[]) try {
 // vLLM specific params scheduler_config.max_num_seqs = 2;
- ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config);
+ // It's possible to construct a Tokenizer from a different path.
+ // If the Tokenizer isn't specified, it's loaded from the same folder. + ov::genai::ContinuousBatchingPipeline pipe(models_path, ov::genai::Tokenizer{models_path}, scheduler_config); std::vector generation_results = pipe.generate(prompts, sampling_params); for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) { diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index e30892f9c..be9a5fd8c 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -32,7 +32,24 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { const std::string& device = "CPU", const ov::AnyMap& plugin_config = {}); - std::shared_ptr get_tokenizer(); + /** + * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param scheduler_config + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + ContinuousBatchingPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + ov::genai::Tokenizer get_tokenizer(); ov::genai::GenerationConfig get_config() const; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index b6c8f70a2..88982a54c 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -116,10 +116,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { ); /** - * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. + * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. * * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json - * @param tokenizer manually initialized ov::Tokenizer + * @param tokenizer manually initialized ov::genai::Tokenizer * @param device optional device * @param plugin_config optional plugin_config */ diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index a9f3e112b..f12d900a8 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -26,7 +26,7 @@ struct TokenizedInputs { class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** - * @brief ov::Tokenizer constructor. + * @brief ov::genai::Tokenizer constructor. 
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path */ Tokenizer(const std::string& tokenizer_path); diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index dbacf3c24..27c183ddd 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -19,7 +19,7 @@ using namespace ov::genai; void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { - std::shared_ptr m_tokenizer; + ov::genai::Tokenizer m_tokenizer; std::shared_ptr m_scheduler; std::shared_ptr m_cache_manager; std::shared_ptr m_model_runner; @@ -69,9 +69,9 @@ class ContinuousBatchingPipeline::Impl { } public: - Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string device, const ov::AnyMap& plugin_config) { + Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) : + m_tokenizer{tokenizer} { ov::Core core; - m_tokenizer = std::make_shared(models_path); // The model can be compiled for GPU as well std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); @@ -104,6 +104,9 @@ class ContinuousBatchingPipeline::Impl { // read default generation config } + Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) + : Impl{models_path, Tokenizer(models_path), scheduler_config, device, plugin_config} {} + ov::genai::GenerationConfig get_config() const { return m_generation_config; } @@ -112,19 +115,19 @@ class ContinuousBatchingPipeline::Impl { return m_pipeline_metrics; } - std::shared_ptr get_tokenizer() { + ov::genai::Tokenizer get_tokenizer() { return m_tokenizer; } GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { - sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id()); + sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); ov::Tensor input_ids; { static ManualTimer timer("tokenize"); timer.start(); - input_ids = m_tokenizer->encode(prompt).input_ids; + input_ids = m_tokenizer.encode(prompt).input_ids; timer.end(); } @@ -262,7 +265,7 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer->decode(generation_output.generated_token_ids); + std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); result.m_generation_ids.push_back(output_text); result.m_scores.push_back(generation_output.score); } @@ -282,7 +285,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& model m_impl = std::make_shared(models_path, scheduler_config, device, plugin_config); } -std::shared_ptr ContinuousBatchingPipeline::get_tokenizer() { +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config +) : 
m_impl{std::make_shared(model_path, tokenizer, scheduler_config, device, plugin_config)} {} + +ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() { return m_impl->get_tokenizer(); } From ab0f43c70cd280783c38ad04f95dbe68d166896f Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 17:05:02 +0400 Subject: [PATCH 02/28] Use CB as backend --- .../genai/continuous_batching_pipeline.hpp | 6 +- .../openvino/genai/generation_handle.hpp | 14 ++ src/cpp/src/continuous_batching_pipeline.cpp | 82 ++++++--- src/cpp/src/llm_pipeline.cpp | 156 +++++++++++++++++- src/python/py_generate_pipeline.cpp | 6 +- 5 files changed, 229 insertions(+), 35 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index be9a5fd8c..f13cc55c4 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -55,13 +55,15 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { PipelineMetrics get_metrics() const; - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params); + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params); void step(); bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& prompts, std::vector sampling_params); + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params); + std::vector generate(const std::vector& prompts, const std::vector& sampling_params); }; } diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index d0ddbc3a3..556f4b812 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -18,6 +18,20 @@ enum class GenerationStatus { DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped }; +struct EncodedGenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. beam search or parallel sampling) + std::vector> m_generation_ids; + // scores + std::vector m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + struct GenerationResult { // request ID - obsolete when handle API is approved as handle will connect results with prompts. 
uint64_t m_request_id; diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 27c183ddd..4b8d1a319 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -6,6 +6,7 @@ #include #include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" #include "openvino/genai/tokenizer.hpp" #include "cache_manager.hpp" #include "sampler.hpp" @@ -119,18 +120,10 @@ class ContinuousBatchingPipeline::Impl { return m_tokenizer; } - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); - ov::Tensor input_ids; - { - static ManualTimer timer("tokenize"); - timer.start(); - input_ids = m_tokenizer.encode(prompt).input_ids; - timer.end(); - } - SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -140,6 +133,14 @@ class ContinuousBatchingPipeline::Impl { return std::make_unique(sequence_group->get_generation_stream(), sampling_params); } + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) { + static ManualTimer timer("tokenize"); + timer.start(); + ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids; + timer.end(); + return add_request(request_id, input_ids, sampling_params); + } + void step() { static ManualTimer step_timer("step()"); step_timer.start(); @@ -237,16 +238,20 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector prompts, std::vector sampling_params) { + std::vector generate( + const std::vector& input_ids, + const std::vector& sampling_params + ) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); - OPENVINO_ASSERT(prompts.size() == sampling_params.size()); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); std::vector generations; - for (size_t request_id = 0; request_id < prompts.size(); ++request_id) { - generations.push_back(add_request(request_id, prompts[request_id], sampling_params[request_id])); + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id])); } - std::vector results; + std::vector results; results.reserve(m_awaiting_requests.size()); while (has_non_finished_requests()) { @@ -255,7 +260,7 @@ class ContinuousBatchingPipeline::Impl { for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; - GenerationResult result; + EncodedGenerationResult result; result.m_request_id = 1; std::vector generation_outputs = generation->read_all(); std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { @@ -265,17 +270,42 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); - result.m_generation_ids.push_back(output_text); + result.m_generation_ids.push_back(generation_output.generated_token_ids); result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); - results.push_back(result); + results.push_back(std::move(result)); } - - OPENVINO_ASSERT(results.size() == prompts.size()); return results; } + + std::vector generate(const std::vector& prompts, std::vector sampling_params) { + std::vector input_ids; + input_ids.reserve(prompts.size()); + for (const std::string& prompt : prompts) { + static ManualTimer timer("tokenize"); + timer.start(); + input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } + std::vector encoded = generate(input_ids, sampling_params); + std::vector decoded; + decoded.reserve(encoded.size()); + for (EncodedGenerationResult& res : encoded) { + std::vector generated; + generated.reserve(res.m_generation_ids.size()); + for (const std::vector& tokens : res.m_generation_ids) { + generated.push_back(m_tokenizer.decode(tokens)); + } + decoded.push_back(GenerationResult{ + res.m_request_id, + std::move(generated), + std::move(res.m_scores), + res.m_status + }); + } + return decoded; + } }; ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, @@ -305,10 +335,14 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ return m_impl->get_metrics(); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params) { return m_impl->add_request(request_id, prompt, sampling_params); } +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t 
request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, input_ids, sampling_params); +} + void ContinuousBatchingPipeline::step() { m_impl->step(); } @@ -317,6 +351,10 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params) { + return m_impl->generate(input_ids, sampling_params); +} + +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params) { return m_impl->generate(prompts, sampling_params); } \ No newline at end of file diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 200ce5a63..2fa3cb963 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "llm_pipeline_base.hpp" @@ -286,14 +287,147 @@ std::pair generation_config(const GenerationConfig& config) { } // namespace genai } // namespace ov -using namespace std; +namespace { +using namespace ov::genai; + +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) -> overloaded; + +Tokenizer dont_construct() { + OPENVINO_THROW("Continuous Batching backend can't be constructed" + "from ireq because the model must be transformed"); +} + +class ContinuousBatchingAdapter final : public LLMPipelineImplBase { +public: + ov::genai::ContinuousBatchingPipeline m_impl; + + ContinuousBatchingAdapter( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config + ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_path.string(), + tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ + model_path.string(), + m_tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + EncodedInputs input_ids_att = std::visit(overloaded{ + [this](const std::string& prompt) { + return m_tokenizer.encode(prompt); + }, + [this](std::vector& prompts) { + return m_tokenizer.encode(prompts); + } + }, inputs); + EncodedResults encoded = generate(input_ids_att, generation_config, streamer); + return {m_tokenizer.decode(encoded.tokens), encoded.scores}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + if (std::holds_alternative(streamer)) { + OPENVINO_THROW("streamer isn't supported for Continuous Batching"); + } + std::vector input_ids = std::visit(overloaded{ + [](const ov::Tensor& inp) { + size_t batch_size = inp.get_shape().at(0); + if (1 == 
batch_size) { + return std::vector{inp}; + } + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.get_shape().at(1); + const int64_t* const source = inp.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + std::copy_n(source + batch_id * max_len, max_len, destination); + } + return input_ids; + }, + [](const TokenizedInputs& inp) { + size_t batch_size = inp.input_ids.get_shape().at(0); + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.input_ids.get_shape().at(1); + const int64_t* const source = inp.input_ids.data(); + const int64_t* const attention_mask = inp.attention_mask.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + size_t copy_count = 0; + for (size_t idx = 0; idx < max_len; ++idx) { + if (1 == attention_mask[batch_id * max_len + idx]) { + destination[copy_count++] = source[batch_id * max_len + idx]; + } + } + input_ids.back().set_shape({1, copy_count}); + } + return input_ids; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}); + std::vector> tokens; + std::vector scores; + for (EncodedGenerationResult& res : generated) { + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(scores)); + } + return {std::move(tokens), std::move(scores)}; + } + + void start_chat() override { + OPENVINO_THROW("start_chat() isn't implemented."); + } + + void finish_chat() override { + OPENVINO_THROW("finish_chat() isn't implemented."); + } +}; +} ov::genai::LLMPipeline::LLMPipeline( const ov::InferRequest& request, const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { - m_pimpl = std::make_unique(request, tokenizer, generation_config); + if (std::getenv("USE_CONTINUOUS_BATCHING")) { + m_pimpl = std::make_unique(request, tokenizer, generation_config); + } else { + m_pimpl = std::make_unique(request, tokenizer, generation_config); + } } ov::genai::LLMPipeline::LLMPipeline( @@ -302,10 +436,12 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); + if (std::getenv("USE_CONTINUOUS_BATCHING")) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } else { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } } @@ -314,10 +450,12 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(path), device, config); + if (std::getenv("USE_CONTINUOUS_BATCHING")) { + m_pimpl = std::make_unique(path, device, config); + } else if ("NPU" == device) { + m_pimpl = 
std::make_unique(path, device, config); } else { - m_pimpl = make_unique(std::filesystem::path(path), device, config); + m_pimpl = std::make_unique(path, device, config); } } @@ -338,7 +476,7 @@ void ov::genai::LLMPipeline::finish_chat() { } void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { - int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;; + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id; m_pimpl->m_generation_config = config; // if eos_token_id was not provided in config forward from default config if (config.eos_token_id == -1) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 784fcd8e3..e3783e7f4 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -599,11 +599,13 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init([](const std::string& model_path, const SchedulerConfig& config) { ScopedVar env_manager(ov_tokenizers_module_path()); return std::make_unique(model_path, config); - })) + }), py::arg("device") = "CPU") // TODO: other ctors .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) - .def("add_request", &ContinuousBatchingPipeline::add_request) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) .def("generate", &ContinuousBatchingPipeline::generate); + .def("generate", py::overload_cast(&ContinuousBatchingPipeline::generate)) } From 05cf5d33b908143f129dca24741dcdeafd3a3d52 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 18:32:17 +0400 Subject: [PATCH 03/28] Update bindings --- src/python/py_generate_pipeline.cpp | 10 +++++++--- tests/python_tests/common.py | 2 +- tests/python_tests/test_sampling.py | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 784fcd8e3..7cd6e0430 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -596,10 +596,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); py::class_(m, "ContinuousBatchingPipeline") - .def(py::init([](const std::string& model_path, const SchedulerConfig& config) { + .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); - return std::make_unique(model_path, config); - })) + return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { + ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = 
ov::AnyMap({})) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("add_request", &ContinuousBatchingPipeline::add_request) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 9b53a6b78..2ec96f671 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -273,7 +273,7 @@ def run_continuous_batching( prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(model_path) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index f4f35deac..ae6fbe4b4 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -7,7 +7,7 @@ import sys from dataclasses import dataclass from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, GenerationConfig +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ @@ -205,7 +205,7 @@ def test_post_oom_health(tmp_path): model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()) scheduler_config) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(output)) From 96fcf7764dea00eba4317bf53392a44c83eeb98c Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 18:43:08 +0400 Subject: [PATCH 04/28] comma --- tests/python_tests/test_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index ae6fbe4b4..c02804527 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -205,7 +205,7 @@ def test_post_oom_health(tmp_path): model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()) scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(output)) From e278a14131bebb9a7e1f3a0661208b79c70a1dad Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 20:37:27 +0400 Subject: [PATCH 05/28] pass --- .../openvino/genai/scheduler_config.hpp | 2 +- src/cpp/src/llm_pipeline.cpp | 19 +- src/python/py_generate_pipeline.cpp | 4 +- tests/python_tests/test_generate_api.py | 793 ++++-------------- 4 files changed, 161 insertions(+), 657 deletions(-) diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index 787060d07..9d808fd42 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ 
b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -16,7 +16,7 @@ struct SchedulerConfig { std::size_t num_kv_blocks = 0; // total size of KV cache in GB - std::size_t cache_size = 0; + std::size_t cache_size = 1; // block size for KV cache std::size_t block_size = 32; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 1c96d7552..41acc3d07 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -403,7 +403,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { - if (std::holds_alternative(streamer)) { + if (!std::holds_alternative(streamer)) { OPENVINO_THROW("streamer isn't supported for Continuous Batching"); } std::vector input_ids = std::visit(overloaded{ @@ -446,6 +446,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { }, inputs); const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::cout << "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAa\n"; std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}); std::vector> tokens; std::vector scores; @@ -456,7 +457,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { return {std::move(tokens), std::move(scores)}; } - void start_chat() override { + void start_chat(const std::string& system_message) override { OPENVINO_THROW("start_chat() isn't implemented."); } @@ -471,11 +472,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { - if (std::getenv("USE_CONTINUOUS_BATCHING")) { - m_pimpl = std::make_unique(request, tokenizer, generation_config); - } else { - m_pimpl = std::make_unique(request, tokenizer, generation_config); - } + m_pimpl = std::make_unique(request, tokenizer, generation_config); } ov::genai::LLMPipeline::LLMPipeline( @@ -484,8 +481,8 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ) { - if (std::getenv("USE_CONTINUOUS_BATCHING")) { - m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + if ("CB" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); } else if ("NPU" == device) { m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } else { @@ -498,8 +495,8 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ) { - if (std::getenv("USE_CONTINUOUS_BATCHING")) { - m_pimpl = std::make_unique(path, device, config); + if ("CB" == device) { + m_pimpl = std::make_unique(path, "CPU", config); } else if ("NPU" == device) { m_pimpl = std::make_unique(path, device, config); } else { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index ced1e7c8b..942c7a284 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -610,6 +610,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) - .def("generate", &ContinuousBatchingPipeline::generate); - .def("generate", py::overload_cast(&ContinuousBatchingPipeline::generate)) + .def("generate", py::overload_cast&, const 
std::vector&>(&ContinuousBatchingPipeline::generate)) + .def("generate", py::overload_cast&, const std::vector&>(&ContinuousBatchingPipeline::generate)); } diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 40eba9227..84488fe01 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -1,675 +1,182 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import functools import openvino_genai as ov_genai -from openvino_genai import StopCriteria +import pathlib import pytest -import transformers -from typing import Union, List, Dict, Optional -import numpy as np -import openvino as ov -import sys -from pathlib import Path -import torch +from typing import Dict, Tuple from ov_genai_test_utils import ( - get_models_list, - read_model, - load_pipe, - load_tok, - model_tmp_path, - STOP_CRITERIA_MAP, + get_models_list, + get_chat_models_list, + read_model, + load_tok, + model_tmp_path, + get_chat_templates ) -def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - device = 'CPU' - model_id, path, tokenizer, model, pipe = model_descr - config = generation_config.copy() # to avoid side effects - num_beams = config['num_beams'] if 'num_beams' in config else 1 - config['num_return_sequences'] = num_beams - - if not isinstance(prompts, list): - prompts = [prompts] - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = None - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - # Encode the batch of prompts - tokenizer.padding_side = "left" - encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] - - hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - - hf_outputs = [] - for idx, hf_encoded_out in enumerate(hf_encoded_outputs): - prompt_count = idx // num_beams - hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) - - ov_outputs = pipe.generate(prompts, **config).texts - - hf_outputs.sort() - ov_outputs.sort() - for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') - assert hf_output == ov_output - -def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - device = 'CPU' - model_id, path, tokenizer, model, pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. 
- # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = None - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) +configs = [ + dict(max_new_tokens=20), + dict(num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) +] - encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) - hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) - hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:], skip_special_tokens=True) - ov_output = pipe.generate(prompt, **config) - if config.get('num_return_sequences', 1) > 1: - assert hf_output in ov_output.texts - else: - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') +questions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] - assert hf_output == ov_output -def hf_ov_genai_tensors_comparison( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, - attention_mask: Optional[np.array] = None - ): +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_compare_with_HF(model_descr, generation_config: Dict): device = 'CPU' - model_id, path, tokenizer, model, pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = None + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. 
+ model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + + pipe.start_chat() + for prompt in questions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + answer_ov = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + pipe.finish_chat() - if attention_mask is not None: - inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) - inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) - else: - inputs_hf = dict(inputs=torch.tensor(input_ids)) - inputs_ov = ov.Tensor(input_ids) - - hf_output = model.generate(**inputs_hf, **generation_config_hf) - - pipe = ov_genai.LLMPipeline(str(path), device) - ov_output = pipe.generate(inputs_ov, **config) - - hf_res = hf_output[0, input_ids.shape[1]:].numpy() - ov_res = np.array(ov_output.tokens, dtype=np.int64) - assert np.all(ov_res == hf_res) + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf -test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), - (dict(max_new_tokens=20), '你好! 你好嗎?'), - (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), -] -@pytest.mark.parametrize("generation_config,prompt", test_cases) -@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit -def test_decoding(model_descr, generation_config, prompt): - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) - -input_tensors_list = [ - # input_ids, attention_mask - (np.array([[1, 4, 42]], dtype=np.int64), None), - (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), -] -@pytest.mark.parametrize("inputs", input_tensors_list) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.xfail( - raises=TypeError, - reason="pybind was unable to find overloads with tensor inputs on Linux", - strict=False, - condition=sys.platform == "linux" -) -@pytest.mark.precommit -def test_ov_tensors(model_descr, inputs): - hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) - - -prompts = [ - 'table is made of', - '你好! 
你好嗎?', - 'Alan Turing was a', - 'The Sun is yellow because', - ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] -] -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.precommit -@pytest.mark.xfail( - raises=TypeError, - reason="pybind was unable to find ov::Tensor from openvino yet", - strict=False, - condition=sys.platform in ["linux", "win32"] -) -def test_genai_tokenizer_encode(model_descr, prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - tok = pipe.get_tokenizer() +def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): + # compares with HF when history in ov_genai is save as a text + device = 'CPU' + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' - encoded_ov = tok.encode(prompt).input_ids.data - if isinstance(prompt, list): - encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] - for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): - assert np.all(tokens_ov == tokens_hf) - else: - encoded_hf = tokenizer.encode(prompt) - assert np.all(encoded_hf == encoded_ov[0]) - -encoded_prompts = [ - [1, 1591, 338, 1754, 310], - [1, 17102, 323, 3864, 471, 263], + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. + model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - # chineze characters - [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], - - # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token - [3113, 264, 364, 267], - - # batched tokens - [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] -] -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.parametrize("encoded_prompt", encoded_prompts) -@pytest.mark.precommit -@pytest.mark.xfail( - raises=TypeError, - reason="pybind was unable to find ov::Tensor from openvino yet", - strict=False, - condition=sys.platform in ["linux", "win32"] -) -def test_genai_tokenizer_decode(model_descr, encoded_prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - tok = pipe.get_tokenizer() - decoded_ov = tok.decode(encoded_prompt) + for prompt in questions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer_ov = pipe.generate(chat_prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", 
get_chat_models_list()) +@pytest.mark.precommit +def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): + # Check that when history is stored in KV cache results are the same as when history stored in a text. + device ='CPU' - if isinstance(encoded_prompt[0], list): - decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) - for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): - assert np.all(tokens_ov == tokens_hf) - else: - decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True) - assert decoded_hf == decoded_ov - - -test_configs = [ - dict(max_new_tokens=20), - dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) -] -batched_prompts = [ - ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest nowel ever: '], - ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], - ['table is made', 'table is made [force left pad tokens]'] -] -@pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompts", batched_prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -def test_multibatch(model_descr, generation_config, prompts): - run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) - - -prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] -@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) -@pytest.mark.parametrize("group_size", [5, 3, 10]) -@pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, - max_new_tokens, diversity_penalty, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) - - -@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("max_new_tokens", [10, 80]) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): - # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence - # while genai ends sentence with - if (stop_criteria == StopCriteria.EARLY): - pytest.skip() - generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, - stop_criteria=stop_criteria, - ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) - - -# test long sequences -@pytest.mark.parametrize("num_beam_groups", [2]) -@pytest.mark.parametrize("group_size", [5]) -@pytest.mark.parametrize("max_new_tokens", [800, 2000]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.skip(reason="Will be enabled in nightly since the test are computationally expensive") -@pytest.mark.nightly -def 
test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, - max_new_tokens, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) - - -def user_defined_callback(subword): - print(subword) - - -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -@pytest.mark.precommit -def test_callback_one_string(callback): - pipe = read_model(get_models_list()[0])[4] - generation_config = pipe.get_generation_config() - generation_config.max_new_tokens = 10 - pipe.generate('table is made of', generation_config, callback) - - -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -@pytest.mark.precommit -def test_callback_batch_fail(callback): - pipe = read_model(get_models_list()[0])[4] - with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback) - - -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -@pytest.mark.precommit -def test_callback_kwargs_one_string(callback): - pipe = read_model(get_models_list()[0])[4] - pipe.generate('table is made of', max_new_tokens=10, streamer=callback) - -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -@pytest.mark.precommit -@pytest.mark.parametrize("model_descr", get_models_list()) -def test_callback_decoding_metallama(model_descr, callback): - # On metallam this prompt generates output which can shorten after adding new tokens. - # Test that streamer correctly handles such cases. - prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' - if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct': - pytest.skip() - pipe = read_model(model_descr)[4] - pipe.generate(prompt, max_new_tokens=300, streamer=callback) - - -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -@pytest.mark.precommit -def test_callback_kwargs_batch_fail(callback): - pipe = read_model(get_models_list()[0])[4] - with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) - - -class Printer(ov_genai.StreamerBase): - def __init__(self, tokenizer): - # super() may work, but once you begin mixing Python and C++ - # multiple inheritance, things will fall apart due to - # differences between Python’s MRO and C++’s mechanisms. 
- ov_genai.StreamerBase.__init__(self) - self.tokenizer = tokenizer - def put(self, token_id): - # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement - print(token_id) # print only token because self.tokenizer.decode([token_id]) are not implemented yet - def end(self): - print('end') - - -@pytest.mark.precommit -def test_streamer_one_string(): - pipe = read_model(get_models_list()[0])[4] - generation_config = pipe.get_generation_config() - generation_config.max_new_tokens = 10 - printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', generation_config, printer) - - -@pytest.mark.precommit -def test_streamer_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) - - -@pytest.mark.precommit -def test_streamer_kwargs_one_string(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) - - -@pytest.mark.precommit -def test_streamer_kwargs_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - with pytest.raises(RuntimeError): - pipe.generate('', num_beams=2, streamer=printer) - - -@pytest.mark.precommit -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -def test_operator_with_callback_one_string(callback): - pipe = read_model(get_models_list()[0])[4] - ten_tokens = pipe.get_generation_config() - ten_tokens.max_new_tokens = 10 - pipe('talbe is made of', ten_tokens, callback) - - -@pytest.mark.precommit -@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -def test_operator_with_callback_batch_fail(callback): - pipe = read_model(get_models_list()[0])[4] - with pytest.raises(RuntimeError): - pipe(['1', '2'], ov_genai.GenerationConfig(), callback) - - -@pytest.mark.precommit -def test_operator_with_streamer_kwargs_one_string(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) - - -@pytest.mark.precommit -def test_operator_with_streamer_kwargs_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - with pytest.raises(RuntimeError): - pipe('', num_beams=2, streamer=printer) - - -@pytest.mark.precommit -def test_load_special_tokens_ids_1(model_tmp_path): - # test when there is an available config.json - config_json = { - "pad_token_id": 422, - "bos_token_id": 42, - "eos_token_id": 37, - } - tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] - - -@pytest.mark.precommit -def test_load_special_tokens_str_2(model_tmp_path): - # test with special_tokens_map - special_tokens_map_json = { - "pad_token": {"content": ""}, - "bos_token": {"content": ""}, - "eos_token": {"content": ""}, - } - tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) - assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] - assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] - assert tok.get_eos_token() == 
special_tokens_map_json['eos_token']["content"] - - -@pytest.mark.precommit -def test_load_special_tokens_3_(model_tmp_path): - # special_tokens_map is not available - # but tokenize_config.json exists - # will load both string and integer representations - tok_config_json = { - "added_tokens_decoder": { - "422": {"content": ""}, - "37": {"content": ""}, - "42": {"content": ""}, - }, - "pad_token": "", - "bos_token": "", - "eos_token": "", - } - - tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) - assert tok.get_pad_token() == tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] - - assert tok.get_pad_token_id() == 422 - assert tok.get_bos_token_id() == 37 - assert tok.get_eos_token_id() == 42 - - -@pytest.mark.precommit -def test_load_special_tokens_3(model_tmp_path): - # both config.json is availabel and tokenizer_config.json available - # check that it does not read int values from tokenizer_config.json if they are in config.json - tok_config_json = { - "added_tokens_decoder": { - # integers differ from config.json to check they don't override config.json - "777": {"content": ""}, - "888": {"content": ""}, - "656": {"content": ""}, - }, - "pad_token": "", - "bos_token": "", - "eos_token": "", - } - config_json = { - "pad_token_id": 422, - "bos_token_id": 42, - "eos_token_id": 37, - } - configs = [ - (tok_config_json, "tokenizer_config.json"), - (config_json, "config.json") - ] - tok = load_tok(configs, model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] - - assert tok.get_pad_token() == tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] - - -@pytest.mark.precommit -@pytest.mark.xfail( - raises=AssertionError, - reason="CVS-143410 ov tokenizer should be aligned with hf", - strict=False, -) -def test_load_special_tokens_4(model_tmp_path): - # only string representation is provided, find token integers by inference - model_id, temp_path = model_tmp_path - tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + chat_history_with_kv_cache = [] + chat_history_ov = [] - special_tokens_map_json = {} - token_str_int_map = {} - special_token_names = ['pad_token', 'bos_token', 'eos_token'] - for token_str in special_token_names: - if hasattr(tokenizer, token_str): - token_val = getattr(tokenizer, token_str) - special_tokens_map_json.update({token_str: {"content": token_val}}) - token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] - token_str_int_map.update({token_str: token_id}) - - # since only string representations are present in the json will try to get by inference - tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) - - # check ids inferred correctly for special tokens existing if HF tokenizer - if 'pad_token' in token_str_int_map: - assert tok.get_pad_token_id() == token_str_int_map['pad_token'] - if 'bos_token' in token_str_int_map: - assert tok.get_bos_token_id() == token_str_int_map['bos_token'] - if 'eos_token' in token_str_int_map: - assert tok.get_eos_token_id() == token_str_int_map['eos_token'] - - -invalid_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - dict(do_sample=True), # no eos_token_id no 
max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. + model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) + + pipe_with_kv_cache.start_chat() + for question in questions: + chat_history_with_kv_cache.append({'role': 'user', 'content': question}) + answer = pipe_with_kv_cache.generate(question, **generation_config) + chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) + + chat_history_ov.append({'role': 'user', 'content': question}) + prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer}) + pipe_with_kv_cache.finish_chat() + + if chat_history_ov != chat_history_with_kv_cache: + print(f'kvcache_hist: {chat_history_with_kv_cache}') + print(f'text_history: {chat_history_ov}') + assert chat_history_ov == chat_history_with_kv_cache + + +conversation = [ + {'role': 'user', 'content': '1+1='}, + {'role': 'assistant', 'content': '1 + 1 = 2'}, + {'role': 'user', 'content': 'What is the previous answer?'}, + {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. 
\n Please ask me your next question.'}, + {'role': 'user', 'content': 'Why is the sun yellow?'}, + {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, + {'role': 'user', 'content': 'What was my first question?'}, ] -@pytest.mark.parametrize("generation_config", invalid_configs) -@pytest.mark.precommit -def test_invalid_configs(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - config_json = {} - pipe = load_pipe([(config_json, "config.json")], temp_path) - with pytest.raises(RuntimeError): - pipe.generate('blah blah', **generation_config) - - @pytest.mark.precommit -def test_valid_configs(model_tmp_path): - model_id, temp_path = model_tmp_path - pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) - - config = ov_genai.GenerationConfig() - config.do_sample = True # no eos_token_id but it's loaded from config.json - pipe.set_generation_config(config) +@pytest.mark.parametrize('chat_config', get_chat_templates()) +def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): + tokenizer_config = chat_config[1] -invalid_py_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k -] -@pytest.mark.precommit -@pytest.mark.parametrize("generation_config", invalid_py_configs) -def test_python_generation_config_validation(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + # Will load openvino_model for tiny-random-phi as a placeholder + # but indeed only Tokenizer and apply_chat_template will be tested. + model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) - # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned - # instead of RuntimeError, which is returned when GenerationConfig values are validated - return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError - with pytest.raises(return_exception_type): - pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) - - -@pytest.mark.precommit -def test_unicode_pybind_decoding_1(): - # On this model this prompt generates unfinished utf string. - # Test that pybind will not fail. - model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] - res_str = pipe.generate(',', max_new_tokens=4) - assert '�' == res_str[-1] - - - -@pytest.mark.precommit -def test_unicode_pybind_decoding_2(): - # On this model this prompt generates unfinished utf string. - # Test that pybind will not fail. 
- model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] - res_str = pipe.generate([","], max_new_tokens=4) - assert '�' == res_str.texts[0][-1] + full_history_str_hf = tokenizer.apply_chat_template(conversation, + add_generation_prompt=False, + tokenize=False, + **tokenizer_config) + + tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) + if full_history_str != full_history_str_hf: + print(f'hf reference: {full_history_str_hf}') + print(f'ov_genai out: {full_history_str}') + assert full_history_str == full_history_str_hf -@pytest.mark.precommit -def test_unicode_pybind_decoding_3(): - # On this model this prompt generates unfinished utf-8 string - # and streams it. Test that pybind will not fail while we pass string to python. - model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] - res_str = [] - pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) - assert '�' == res_str[-1] +@functools.lru_cache(1) +def get_continuous_batching(path): + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") +@pytest.mark.parametrize("prompt", questions) @pytest.mark.precommit -@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") -def test_left_pad(): - # test left pad tokenizer post processing implementation - prompts = [ - "The Sun is yellow because", - "The Sun is yellow because [force left pad tokens]" - ] - models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) - - config = { - "max_new_tokens": 20, - "num_beam_groups": 2, - "num_beams": 2, - "num_return_sequences": 2, - "do_sample": False, - "diversity_penalty": 1.0, - # phi 1_5 has no eos_token_id in model configuration - # ov genai will detect eos_token_id from tokenizer config - # hf implementation doesn't fetch it from tokenizer config and defaults to None - # align ov genai and hf by setting eos_token_id explicitly - "eos_token_id": 50256, - } - - models[2].pad_token = models[2].eos_token - run_hf_ov_genai_comparison_batched(models, config, prompts) +def test_continuous_batching_vs_stateful(question): + model_id, path, tokenizer, model, pipe = read_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0", pathlib.Path("TinyLlama-1.1B-Chat-v1.0")) + cb = get_continuous_batching(path) + config = ov_genai.GenerationConfig() + config.max_new_tokens = 100 + gen = cb.generate(question, config) + ref = pipe.generate(question, config) + assert gen == ref From 47fa22c2b112f470f0d7bcb028f75494d220f435 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 20:39:43 +0400 Subject: [PATCH 06/28] conflict --- tests/python_tests/test_generate_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 84488fe01..32b297da7 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -170,7 +170,7 @@ def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -@pytest.mark.parametrize("prompt", questions) +@pytest.mark.parametrize("question", questions) @pytest.mark.precommit def test_continuous_batching_vs_stateful(question): model_id, path, tokenizer, model, pipe = 
read_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0", pathlib.Path("TinyLlama-1.1B-Chat-v1.0")) From 2094ba67e1eda3d60944fefeb4c1077053a9fa1a Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 20:44:01 +0400 Subject: [PATCH 07/28] conflict --- .github/workflows/genai_python_lib.yml | 91 +------------------------ tests/python_tests/test_generate_api.py | 5 +- 2 files changed, 6 insertions(+), 90 deletions(-) diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 294ff8863..34d5fbf92 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -11,7 +11,7 @@ env: jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. - runs-on: ubuntu-22.04 + runs-on: ubuntu-22.04-16-cores env: # A tokenizers' dependency fails to compile with Ninja in CenOS7 env. CMAKE_GENERATOR: Unix Makefiles @@ -83,91 +83,4 @@ jobs: # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/ - run: call ./ov/setupvars.bat && python -m pip install . --verbose - - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit - - continuous_batching_python_lib_ubuntu: - # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. - runs-on: ubuntu-22.04 - env: - # A tokenizers' dependency fails to compile with Ninja in CenOS7 env. - CMAKE_GENERATOR: Unix Makefiles - CMAKE_BUILD_PARALLEL_LEVEL: null - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI. - - name: Install OpenVINO - run: | - mkdir ./ov/ - curl ${{ env.l_ov_centos_link }} | tar --directory ./ov/ --strip-components 1 -xz - sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Install dependencies and build - run: | - source ./ov/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit - - run: source ./ov/setupvars.sh && python -m pip install . 
- - run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit - - continuous_batching_python_lib_windows: - runs-on: windows-latest - defaults: - run: - shell: cmd - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Install OpenVINO - run: | - curl --output ov.zip ${{ env.w_ov_link }} - unzip -d ov ov.zip - dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" - shell: bash - - name: Install dependencies and build - run: | - call .\ov\setupvars.bat - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_sampling.py -m precommit - - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_preemption.py -m precommit - - run: call ./ov/setupvars.bat && python -m pip install . --verbose - - run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit - - - continuous_batching_python_lib_macos: - runs-on: macos-12 - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: Install OpenVINO - run: | - mkdir ./ov/ - curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - brew install coreutils scons - - name: Download, convert and build - run: | - source ./ov/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit - - run: source ./ov/setupvars.sh && python -m pip install . 
- - run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit + - run: python -m pytest ./tests/python_tests/ diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 32b297da7..3849dae1d 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -173,7 +173,10 @@ def get_continuous_batching(path): @pytest.mark.parametrize("question", questions) @pytest.mark.precommit def test_continuous_batching_vs_stateful(question): - model_id, path, tokenizer, model, pipe = read_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0", pathlib.Path("TinyLlama-1.1B-Chat-v1.0")) + model_id, path, tokenizer, model, pipe = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + pathlib.Path("TinyLlama-1.1B-Chat-v1.0") + )) cb = get_continuous_batching(path) config = ov_genai.GenerationConfig() config.max_new_tokens = 100 From 78361a989efbd7485ff7990a1793d7ce65b9c7fb Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 20:57:25 +0400 Subject: [PATCH 08/28] clean --- src/cpp/src/continuous_batching_pipeline.cpp | 8 +++--- src/cpp/src/llm_pipeline.cpp | 27 +++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 4b8d1a319..7e822379c 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -123,7 +123,6 @@ class ContinuousBatchingPipeline::Impl { GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); - SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -238,10 +237,7 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate( - const std::vector& input_ids, - const std::vector& sampling_params - ) { + std::vector generate( const std::vector& input_ids, const std::vector& sampling_params) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); @@ -276,6 +272,8 @@ class ContinuousBatchingPipeline::Impl { result.m_status = generation->get_status(); results.push_back(std::move(result)); } + + OPENVINO_ASSERT(results.size() == input_ids.size()); return results; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 41acc3d07..d2168c67f 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -446,7 +446,6 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { }, inputs); const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. 
- std::cout << "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAa\n"; std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}); std::vector> tokens; std::vector scores; @@ -480,29 +479,27 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -) { +): m_pimpl{[&]() -> std::unique_ptr { if ("CB" == device) { - m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); - } else if ("NPU" == device) { - m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); - } else { - m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + return std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } if ("NPU" == device) { + return std::make_unique(model_path, tokenizer, device, plugin_config); } -} + return std::make_unique(model_path, tokenizer, device, plugin_config); +}()} {} ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -) { +): m_pimpl{[&]() -> std::unique_ptr { if ("CB" == device) { - m_pimpl = std::make_unique(path, "CPU", config); - } else if ("NPU" == device) { - m_pimpl = std::make_unique(path, device, config); - } else { - m_pimpl = std::make_unique(path, device, config); + return std::make_unique(path, "CPU", config); + } if ("NPU" == device) { + return std::make_unique(path, device, config); } -} + return std::make_unique(path, device, config); +}()} {} ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; From 691fefc0026d2913fe47a1e3452d230054a93873 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 21:01:14 +0400 Subject: [PATCH 09/28] verify status --- src/cpp/src/llm_pipeline.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d2168c67f..cfeedbfc2 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -450,6 +450,9 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { std::vector> tokens; std::vector scores; for (EncodedGenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(tokens)); std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(scores)); } From 18253285427b7b1a3a1ab2f689a45488c088cd3a Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 21:16:36 +0400 Subject: [PATCH 10/28] conflict --- src/cpp/src/continuous_batching_pipeline.cpp | 4 +- src/cpp/src/llm_pipeline.cpp | 16 +- tests/python_tests/test_generate_api.py | 800 +++++++++++++++---- 3 files changed, 665 insertions(+), 155 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 7e822379c..ad190bd00 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -237,7 +237,7 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate( const std::vector& input_ids, const std::vector& sampling_params) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); @@ -266,7 +266,7 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - result.m_generation_ids.push_back(generation_output.generated_token_ids); + result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index cfeedbfc2..2fcd77318 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -348,11 +348,11 @@ Tokenizer dont_construct() { class ContinuousBatchingAdapter final : public LLMPipelineImplBase { public: - ov::genai::ContinuousBatchingPipeline m_impl; + ContinuousBatchingPipeline m_impl; ContinuousBatchingAdapter( const ov::InferRequest& request, - const ov::genai::Tokenizer& tokenizer, + const Tokenizer& tokenizer, OptionalGenerationConfig generation_config ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} @@ -447,16 +447,16 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}); - std::vector> tokens; - std::vector scores; + std::vector> plain_tokens; + std::vector plain_scores; for (EncodedGenerationResult& res : generated) { if (GenerationStatus::FINISHED != res.m_status) { OPENVINO_THROW("Got unfinished GenerationStatus"); } - std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(tokens)); - std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(scores)); + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); } - return {std::move(tokens), std::move(scores)}; + return {std::move(plain_tokens), std::move(plain_scores)}; } void start_chat(const std::string& system_message) override { @@ -474,7 +474,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { - m_pimpl = std::make_unique(request, tokenizer, generation_config); + m_pimpl = std::make_unique(request, tokenizer, generation_config); } ov::genai::LLMPipeline::LLMPipeline( diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 3849dae1d..d994d52fe 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -1,168 +1,679 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import functools import openvino_genai as ov_genai -import pathlib +from openvino_genai import StopCriteria import pytest -from typing import Dict, Tuple +import transformers +from typing import Union, List, Dict, Optional +import numpy as np +import openvino as ov +import sys +from pathlib import Path +import torch +import functools from ov_genai_test_utils import ( - get_models_list, - 
get_chat_models_list, - read_model, - load_tok, - model_tmp_path, - get_chat_templates + get_models_list, + read_model, + load_pipe, + load_tok, + model_tmp_path, + STOP_CRITERIA_MAP, ) -configs = [ - dict(max_new_tokens=20), - dict(num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) -] +def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): + device = 'CPU' + model_id, path, tokenizer, model, pipe = model_descr + config = generation_config.copy() # to avoid side effects + num_beams = config['num_beams'] if 'num_beams' in config else 1 + config['num_return_sequences'] = num_beams + + if not isinstance(prompts, list): + prompts = [prompts] + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. + config['do_sample'] = False + config['repetition_penalty'] = None + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) -questions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?' -] + # Encode the batch of prompts + tokenizer.padding_side = "left" + encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) + prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] + + hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) + hf_outputs = [] + for idx, hf_encoded_out in enumerate(hf_encoded_outputs): + prompt_count = idx // num_beams + hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) -@pytest.mark.parametrize("generation_config", configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -def test_chat_compare_with_HF(model_descr, generation_config: Dict): + ov_outputs = pipe.generate(prompts, **config).texts + + hf_outputs.sort() + ov_outputs.sort() + for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + assert hf_output == ov_output + +def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): device = 'CPU' - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. 
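    # Hypothetical illustration (values not taken from this suite): a beam-search config such as
    #   dict(num_beam_groups=3, num_beams=15, diversity_penalty=1.0, max_new_tokens=10)
    # combined with an HF checkpoint whose generation_config defaults to do_sample=True
    # conflicts on the HF side (diverse beam search cannot run in sampling mode),
    # so do_sample is forced to False here to keep both runs deterministic and comparable.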
+ config['do_sample'] = False + config['repetition_penalty'] = None + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:], skip_special_tokens=True) + + ov_output = pipe.generate(prompt, **config) + if config.get('num_return_sequences', 1) > 1: + assert hf_output in ov_output.texts + else: + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + +def hf_ov_genai_tensors_comparison( + model_descr, + generation_config: Dict, + input_ids: np.ndarray, + attention_mask: Optional[np.array] = None + ): + device = 'CPU' + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. + config['do_sample'] = False + config['repetition_penalty'] = None - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - - pipe.start_chat() - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - answer_ov = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - pipe.finish_chat() + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - assert chat_history_ov == chat_history_hf + if attention_mask is not None: + inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) + inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + else: + inputs_hf = dict(inputs=torch.tensor(input_ids)) + inputs_ov = ov.Tensor(input_ids) + + hf_output = model.generate(**inputs_hf, **generation_config_hf) + + pipe = ov_genai.LLMPipeline(str(path), device) + ov_output = pipe.generate(inputs_ov, 
**config) + hf_res = hf_output[0, input_ids.shape[1]:].numpy() + ov_res = np.array(ov_output.tokens, dtype=np.int64) + assert np.all(ov_res == hf_res) -@pytest.mark.parametrize("generation_config", configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) + +test_cases = [ + (dict(max_new_tokens=20), 'table is made of'), + (dict(max_new_tokens=20), '你好! 你好嗎?'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit -def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): - # compares with HF when history in ov_genai is save as a text - device = 'CPU' - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) +def test_decoding(model_descr, generation_config, prompt): + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + +input_tensors_list = [ + # input_ids, attention_mask + (np.array([[1, 4, 42]], dtype=np.int64), None), + (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), +] +@pytest.mark.parametrize("inputs", input_tensors_list) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find overloads with tensor inputs on Linux", + strict=False, + condition=sys.platform == "linux" +) +@pytest.mark.precommit +def test_ov_tensors(model_descr, inputs): + hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) + + +prompts = [ + 'table is made of', + '你好! 
你好嗎?', + 'Alan Turing was a', + 'The Sun is yellow because', + ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find ov::Tensor from openvino yet", + strict=False, + condition=sys.platform in ["linux", "win32"] +) +def test_genai_tokenizer_encode(model_descr, prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer_ov = pipe.generate(chat_prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - assert chat_history_ov == chat_history_hf - - -@pytest.mark.parametrize("generation_config", configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): - # Check that when history is stored in KV cache results are the same as when history stored in a text. - device ='CPU' + encoded_ov = tok.encode(prompt).input_ids.data + if isinstance(prompt, list): + encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] + for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + encoded_hf = tokenizer.encode(prompt) + assert np.all(encoded_hf == encoded_ov[0]) + +encoded_prompts = [ + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], - chat_history_with_kv_cache = [] - chat_history_ov = [] + # chineze characters + [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], + + # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token + [3113, 264, 364, 267], + + # batched tokens + [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("encoded_prompt", encoded_prompts) +@pytest.mark.precommit +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find ov::Tensor from openvino yet", + strict=False, + condition=sys.platform in ["linux", "win32"] +) +def test_genai_tokenizer_decode(model_descr, encoded_prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() + decoded_ov = tok.decode(encoded_prompt) - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. 
- model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) - - pipe_with_kv_cache.start_chat() - for question in questions: - chat_history_with_kv_cache.append({'role': 'user', 'content': question}) - answer = pipe_with_kv_cache.generate(question, **generation_config) - chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) - - chat_history_ov.append({'role': 'user', 'content': question}) - prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer}) - pipe_with_kv_cache.finish_chat() - - if chat_history_ov != chat_history_with_kv_cache: - print(f'kvcache_hist: {chat_history_with_kv_cache}') - print(f'text_history: {chat_history_ov}') - assert chat_history_ov == chat_history_with_kv_cache - - -conversation = [ - {'role': 'user', 'content': '1+1='}, - {'role': 'assistant', 'content': '1 + 1 = 2'}, - {'role': 'user', 'content': 'What is the previous answer?'}, - {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. \n Please ask me your next question.'}, - {'role': 'user', 'content': 'Why is the sun yellow?'}, - {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, - {'role': 'user', 'content': 'What was my first question?'}, + if isinstance(encoded_prompt[0], list): + decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) + for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True) + assert decoded_hf == decoded_ov + + +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) ] +batched_prompts = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompts", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +def test_multibatch(model_descr, generation_config, prompts): + run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) + + +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] +@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) +@pytest.mark.parametrize("group_size", [5, 3, 10]) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) +@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("max_new_tokens", [10, 80]) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): + # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence + # while genai ends sentence with + if (stop_criteria == StopCriteria.EARLY): + pytest.skip() + generation_config = dict( + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, + stop_criteria=stop_criteria, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +# test long sequences +@pytest.mark.parametrize("num_beam_groups", [2]) +@pytest.mark.parametrize("group_size", [5]) +@pytest.mark.parametrize("max_new_tokens", [800, 2000]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.skip(reason="Will be enabled in nightly since the test are computationally expensive") +@pytest.mark.nightly +def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, + max_new_tokens, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +def user_defined_callback(subword): + print(subword) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_one_string(callback): + pipe = read_model(get_models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + pipe.generate('table is made of', generation_config, callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def 
test_callback_batch_fail(callback): + pipe = read_model(get_models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit -@pytest.mark.parametrize('chat_config', get_chat_templates()) -def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): - tokenizer_config = chat_config[1] +def test_callback_kwargs_one_string(callback): + pipe = read_model(get_models_list()[0])[4] + pipe.generate('table is made of', max_new_tokens=10, streamer=callback) + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +@pytest.mark.parametrize("model_descr", get_models_list()) +def test_callback_decoding_metallama(model_descr, callback): + # On metallam this prompt generates output which can shorten after adding new tokens. + # Test that streamer correctly handles such cases. + prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' + if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct': + pytest.skip() + pipe = read_model(model_descr)[4] + pipe.generate(prompt, max_new_tokens=300, streamer=callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_kwargs_batch_fail(callback): + pipe = read_model(get_models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) + + +class Printer(ov_genai.StreamerBase): + def __init__(self, tokenizer): + # super() may work, but once you begin mixing Python and C++ + # multiple inheritance, things will fall apart due to + # differences between Python’s MRO and C++’s mechanisms. + ov_genai.StreamerBase.__init__(self) + self.tokenizer = tokenizer + def put(self, token_id): + # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement + print(token_id) # print only token because self.tokenizer.decode([token_id]) are not implemented yet + def end(self): + print('end') - # Will load openvino_model for tiny-random-phi as a placeholder - # but indeed only Tokenizer and apply_chat_template will be tested. 
- model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) + +@pytest.mark.precommit +def test_streamer_one_string(): + pipe = read_model(get_models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + printer = Printer(pipe.get_tokenizer()) + pipe.generate('table is made of', generation_config, printer) + + +@pytest.mark.precommit +def test_streamer_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) + + +@pytest.mark.precommit +def test_streamer_kwargs_one_string(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) + + +@pytest.mark.precommit +def test_streamer_kwargs_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate('', num_beams=2, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_one_string(callback): + pipe = read_model(get_models_list()[0])[4] + ten_tokens = pipe.get_generation_config() + ten_tokens.max_new_tokens = 10 + pipe('talbe is made of', ten_tokens, callback) + + +@pytest.mark.precommit +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_batch_fail(callback): + pipe = read_model(get_models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe(['1', '2'], ov_genai.GenerationConfig(), callback) + + +@pytest.mark.precommit +def test_operator_with_streamer_kwargs_one_string(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) + + +@pytest.mark.precommit +def test_operator_with_streamer_kwargs_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe('', num_beams=2, streamer=printer) + + +@pytest.mark.precommit +def test_load_special_tokens_ids_1(model_tmp_path): + # test when there is an available config.json + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + +@pytest.mark.precommit +def test_load_special_tokens_str_2(model_tmp_path): + # test with special_tokens_map + special_tokens_map_json = { + "pad_token": {"content": ""}, + "bos_token": {"content": ""}, + "eos_token": {"content": ""}, + } + tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) + assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] + assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] + assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] + + +@pytest.mark.precommit +def test_load_special_tokens_3_(model_tmp_path): + # special_tokens_map is not available + # but tokenize_config.json exists + # will load both string and integer representations + 
tok_config_json = { + "added_tokens_decoder": { + "422": {"content": ""}, + "37": {"content": ""}, + "42": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + + tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + assert tok.get_pad_token_id() == 422 + assert tok.get_bos_token_id() == 37 + assert tok.get_eos_token_id() == 42 + + +@pytest.mark.precommit +def test_load_special_tokens_3(model_tmp_path): + # both config.json is availabel and tokenizer_config.json available + # check that it does not read int values from tokenizer_config.json if they are in config.json + tok_config_json = { + "added_tokens_decoder": { + # integers differ from config.json to check they don't override config.json + "777": {"content": ""}, + "888": {"content": ""}, + "656": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + configs = [ + (tok_config_json, "tokenizer_config.json"), + (config_json, "config.json") + ] + tok = load_tok(configs, model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + +@pytest.mark.precommit +@pytest.mark.xfail( + raises=AssertionError, + reason="CVS-143410 ov tokenizer should be aligned with hf", + strict=False, +) +def test_load_special_tokens_4(model_tmp_path): + # only string representation is provided, find token integers by inference + model_id, temp_path = model_tmp_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - full_history_str_hf = tokenizer.apply_chat_template(conversation, - add_generation_prompt=False, - tokenize=False, - **tokenizer_config) + special_tokens_map_json = {} + token_str_int_map = {} + special_token_names = ['pad_token', 'bos_token', 'eos_token'] + for token_str in special_token_names: + if hasattr(tokenizer, token_str): + token_val = getattr(tokenizer, token_str) + special_tokens_map_json.update({token_str: {"content": token_val}}) + token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] + token_str_int_map.update({token_str: token_id}) + + # since only string representations are present in the json will try to get by inference + tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) + + # check ids inferred correctly for special tokens existing if HF tokenizer + if 'pad_token' in token_str_int_map: + assert tok.get_pad_token_id() == token_str_int_map['pad_token'] + if 'bos_token' in token_str_int_map: + assert tok.get_bos_token_id() == token_str_int_map['bos_token'] + if 'eos_token' in token_str_int_map: + assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + + +invalid_configs = [ + dict(num_beam_groups=3, num_beams=15, do_sample=True), + dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos + dict(repetition_penalty=-1.0, eos_token_id=42, 
max_new_tokens=20), # invalid penalty + dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp + dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k +] +@pytest.mark.parametrize("generation_config", invalid_configs) +@pytest.mark.precommit +def test_invalid_configs(model_tmp_path, generation_config): + model_id, temp_path = model_tmp_path + config_json = {} + pipe = load_pipe([(config_json, "config.json")], temp_path) + with pytest.raises(RuntimeError): + pipe.generate('blah blah', **generation_config) + + +@pytest.mark.precommit +def test_valid_configs(model_tmp_path): + model_id, temp_path = model_tmp_path + pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + + config = ov_genai.GenerationConfig() + config.do_sample = True # no eos_token_id but it's loaded from config.json + pipe.set_generation_config(config) + +invalid_py_configs = [ + dict(num_beam_groups=3, num_beams=15, do_sample=True), + dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len + dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos + dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty + dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp + dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k +] +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", invalid_py_configs) +def test_python_generation_config_validation(model_tmp_path, generation_config): + model_id, temp_path = model_tmp_path + pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) - tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) - full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) - if full_history_str != full_history_str_hf: - print(f'hf reference: {full_history_str_hf}') - print(f'ov_genai out: {full_history_str}') - assert full_history_str == full_history_str_hf + # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned + # instead of RuntimeError, which is returned when GenerationConfig values are validated + return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError + with pytest.raises(return_exception_type): + pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) + + +@pytest.mark.precommit +def test_unicode_pybind_decoding_1(): + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = pipe.generate(',', max_new_tokens=4) + assert '�' == res_str[-1] + + + +@pytest.mark.precommit +def test_unicode_pybind_decoding_2(): + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = pipe.generate([","], max_new_tokens=4) + assert '�' == res_str.texts[0][-1] + + +@pytest.mark.precommit +def test_unicode_pybind_decoding_3(): + # On this model this prompt generates unfinished utf-8 string + # and streams it. 
Test that pybind will not fail while we pass string to python. + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = [] + pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + assert '�' == res_str[-1] + + +@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") +@pytest.mark.precommit +@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") +def test_left_pad(): + # test left pad tokenizer post processing implementation + prompts = [ + "The Sun is yellow because", + "The Sun is yellow because [force left pad tokens]" + ] + models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) + + config = { + "max_new_tokens": 20, + "num_beam_groups": 2, + "num_beams": 2, + "num_return_sequences": 2, + "do_sample": False, + "diversity_penalty": 1.0, + # phi 1_5 has no eos_token_id in model configuration + # ov genai will detect eos_token_id from tokenizer config + # hf implementation doesn't fetch it from tokenizer config and defaults to None + # align ov genai and hf by setting eos_token_id explicitly + "eos_token_id": 50256, + } + + models[2].pad_token = models[2].eos_token + run_hf_ov_genai_comparison_batched(models, config, prompts) @functools.lru_cache(1) @@ -170,16 +681,15 @@ def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -@pytest.mark.parametrize("question", questions) +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.precommit -def test_continuous_batching_vs_stateful(question): +def test_continuous_batching_vs_stateful(batched_prompts): model_id, path, tokenizer, model, pipe = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - pathlib.Path("TinyLlama-1.1B-Chat-v1.0") + Path("TinyLlama-1.1B-Chat-v1.0") )) cb = get_continuous_batching(path) - config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - gen = cb.generate(question, config) - ref = pipe.generate(question, config) + gen = cb.generate(batched_prompts, **test_configs) + ref = pipe.generate(batched_prompts, **test_configs) assert gen == ref From 6a3275ef85cdbdbcb517eecd94698f15103302d7 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 21:18:04 +0400 Subject: [PATCH 11/28] args --- tests/python_tests/test_generate_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index d994d52fe..3ef5b6aa9 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -681,10 +681,10 @@ def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -@pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompts", batched_prompts) +@pytest.mark.parametrize("test_configs", test_configs) +@pytest.mark.parametrize("batched_prompts", batched_prompts) @pytest.mark.precommit -def test_continuous_batching_vs_stateful(batched_prompts): +def test_continuous_batching_vs_stateful(batched_prompts, test_configs): model_id, path, tokenizer, model, pipe = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", Path("TinyLlama-1.1B-Chat-v1.0") From a5f2cd6ad9adb7294ad9c60a04c025c3246cac62 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 21:41:40 +0400 Subject: [PATCH 12/28] test --- 
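Note on the parametrization being reworked here: pytest requires every name handed to @pytest.mark.parametrize to also appear as an argument of the test function, otherwise collection fails with "function uses no argument". The snippet below is a minimal, standalone illustration of that rule only; the test name and prompt list are invented for the example and are not part of this patch.

    import pytest

    prompts = ['table is made of', 'Alan Turing was a']

    # "prompt" must match the argument name of the test function below.
    @pytest.mark.parametrize("prompt", prompts)
    def test_prompt_is_string(prompt):
        assert isinstance(prompt, str)
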
.../include/openvino/genai/llm_pipeline.hpp | 2 +- tests/python_tests/test_generate_api.py | 27 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 84dc02bd5..abd4ee5a4 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -14,7 +14,7 @@ namespace ov { namespace genai { -// Return flag corresponds whether generation should be stopped: false means continue generation, true means stop. +// Return flag correspods whether generation should be stopped: false means continue generation, true means stop. using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; using OptionalGenerationConfig = std::optional; using EncodedInputs = std::variant; diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 3ef5b6aa9..df019b0a2 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -23,7 +23,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 @@ -68,7 +67,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro assert hf_output == ov_output def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -76,7 +74,7 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str if 'do_sample' not in config: # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = None @@ -245,7 +243,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): ] batched_prompts = [ ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest nowel ever: '], + ['hello', 'Here is the longest novel ever: '], ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], ['table is made', 'table is made [force left pad tokens]'] ] @@ -681,15 +679,22 @@ def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -@pytest.mark.parametrize("test_configs", test_configs) -@pytest.mark.parametrize("batched_prompts", batched_prompts) +@pytest.mark.parametrize("prompts", [ + 'table is made of', + '你好! 
你好嗎?', + 'Alan Turing was a', + 'The Sun is yellow because', + ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] +]) @pytest.mark.precommit -def test_continuous_batching_vs_stateful(batched_prompts, test_configs): +def test_continuous_batching_vs_stateful(prompts): model_id, path, tokenizer, model, pipe = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", Path("TinyLlama-1.1B-Chat-v1.0") )) - cb = get_continuous_batching(path) - gen = cb.generate(batched_prompts, **test_configs) - ref = pipe.generate(batched_prompts, **test_configs) - assert gen == ref + config = ov_genai.GenerationConfig() + config.max_new_tokens = 100 + assert ( + get_continuous_batching(path).generate(prompts, config) + == pipe.generate(batched_prompts, config) + ) From 771fc29b31cd64592ab12a6e36897648f49652e3 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 22:03:58 +0400 Subject: [PATCH 13/28] tests --- src/cpp/src/llm_pipeline.cpp | 4 +-- tests/python_tests/test_generate_api.py | 33 ++++++++++++++----------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2fcd77318..2edeb264e 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -435,9 +435,9 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { int64_t* destination = input_ids.back().data(); size_t copy_count = 0; for (size_t idx = 0; idx < max_len; ++idx) { - if (1 == attention_mask[batch_id * max_len + idx]) { + // if (1 == attention_mask[batch_id * max_len + idx]) { destination[copy_count++] = source[batch_id * max_len + idx]; - } + // } } input_ids.back().set_shape({1, copy_count}); } diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index df019b0a2..687e1a6f4 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -678,23 +678,28 @@ def test_left_pad(): def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') - -@pytest.mark.parametrize("prompts", [ - 'table is made of', - '你好! 你好嗎?', - 'Alan Turing was a', - 'The Sun is yellow because', - ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] -]) +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) +] +batched_prompts = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest novel ever: '], + ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts) @pytest.mark.precommit -def test_continuous_batching_vs_stateful(prompts): - model_id, path, tokenizer, model, pipe = read_model(( +def test_continuous_batching_vs_stateful(prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", Path("TinyLlama-1.1B-Chat-v1.0") )) config = ov_genai.GenerationConfig() config.max_new_tokens = 100 - assert ( - get_continuous_batching(path).generate(prompts, config) - == pipe.generate(batched_prompts, config) - ) + cb = get_continuous_batching(path) + vanilla = cb.generate(prompt, **generation_config) + ref = stateful.generate(prompt, **generation_config) + assert vanilla == ref From 5c615bf29b3bb990e7411b87a5cdbb2f9e1308a0 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 11 Jul 2024 22:03:58 +0400 Subject: [PATCH 14/28] tests --- src/cpp/src/llm_pipeline.cpp | 4 ++-- tests/python_tests/test_generate_api.py | 18 ++++-------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2edeb264e..2fcd77318 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -435,9 +435,9 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { int64_t* destination = input_ids.back().data(); size_t copy_count = 0; for (size_t idx = 0; idx < max_len; ++idx) { - // if (1 == attention_mask[batch_id * max_len + idx]) { + if (1 == attention_mask[batch_id * max_len + idx]) { destination[copy_count++] = source[batch_id * max_len + idx]; - // } + } } input_ids.back().set_shape({1, copy_count}); } diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 687e1a6f4..a6ea9bcc1 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -678,19 +678,9 @@ def test_left_pad(): def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') -test_configs = [ - dict(max_new_tokens=20), - dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) -] -batched_prompts = [ - ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest novel ever: '], - ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], - ['table is made', 'table is made [force left pad tokens]'] -] + @pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", batched_prompts) +@pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit def test_continuous_batching_vs_stateful(prompt, generation_config): model_id, path, tokenizer, model, stateful = read_model(( @@ -700,6 +690,6 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): config = ov_genai.GenerationConfig() config.max_new_tokens = 100 cb = get_continuous_batching(path) - vanilla = cb.generate(prompt, **generation_config) + generated = cb.generate(prompt, **generation_config) ref = stateful.generate(prompt, **generation_config) - assert vanilla == ref + assert generated == ref From 3afc16d751d19f757ee44a2a9e9d7046ba9cdc86 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 12 Jul 2024 13:39:50 +0400 Subject: [PATCH 15/28] test --- tests/python_tests/test_generate_api.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index a6ea9bcc1..4d53f2576 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -12,6 +12,7 @@ from pathlib import Path import torch import functools +import math from ov_genai_test_utils import ( get_models_list, read_model, @@ -680,7 +681,7 @@ def get_continuous_batching(path): @pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("prompt", batched_prompts) @pytest.mark.precommit def test_continuous_batching_vs_stateful(prompt, generation_config): model_id, path, tokenizer, model, stateful = read_model(( @@ -691,5 +692,9 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): config.max_new_tokens = 100 cb = get_continuous_batching(path) generated = cb.generate(prompt, **generation_config) - ref = stateful.generate(prompt, **generation_config) - assert generated == ref + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_beams", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. + for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0001) From 67f47172e91931e208fc749c18a7151481f69331 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 12 Jul 2024 13:44:39 +0400 Subject: [PATCH 16/28] remove caching --- tests/python_tests/test_generate_api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 4d53f2576..232edc1b8 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,7 +11,6 @@ import sys from pathlib import Path import torch -import functools import math from ov_genai_test_utils import ( get_models_list, @@ -675,7 +674,6 @@ def test_left_pad(): run_hf_ov_genai_comparison_batched(models, config, prompts) -@functools.lru_cache(1) def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') From d26723f747cafa8d042f97033821773176b32c06 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 12 Jul 2024 15:38:28 +0200 Subject: [PATCH 17/28] Clear beam search info. 
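
The sampler accumulates beam-search bookkeeping in m_beam_search_info across step() calls. If that state is not dropped once a generate() call has collected all its results, a later call can pick up stale beam state. The following is only a conceptual Python sketch of the pattern (not the C++ implementation); the per-request keying shown here is an assumption made for the sketch.

    class SamplerSketch:
        def __init__(self):
            self._beam_search_info = {}  # request_id -> accumulated beam state

        def sample(self, request_id, logits):
            # Reuse existing beam state for this request or start a fresh one.
            state = self._beam_search_info.setdefault(request_id, [])
            # ... next-token selection for every beam group would go here ...
            return state

        def clear_beam_search_info(self):
            # Called once a generate() call has produced all of its results.
            self._beam_search_info.clear()
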
--- src/cpp/src/continuous_batching_pipeline.cpp | 1 + src/cpp/src/sampler.hpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index dbacf3c24..beeabf4d0 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -269,6 +269,7 @@ class ContinuousBatchingPipeline::Impl { result.m_status = generation->get_status(); results.push_back(result); } + m_sampler->clear_beam_search_info(); OPENVINO_ASSERT(results.size() == prompts.size()); return results; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 5dc44b491..cbc48a995 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -247,6 +247,8 @@ class Sampler { SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits); void set_seed(size_t seed) { rng_engine.seed(seed); } + + void clear_beam_search_info(); }; SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits) { @@ -578,4 +580,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp } } } + +void Sampler::clear_beam_search_info() { + m_beam_search_info.clear(); +} } From d223d68f81742bd035958ff168ba36641dc0a071 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 09:04:11 +0400 Subject: [PATCH 18/28] -am cache --- tests/python_tests/test_generate_api.py | 2 ++ thirdparty/openvino_tokenizers | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 232edc1b8..4d53f2576 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,6 +11,7 @@ import sys from pathlib import Path import torch +import functools import math from ov_genai_test_utils import ( get_models_list, @@ -674,6 +675,7 @@ def test_left_pad(): run_hf_ov_genai_comparison_batched(models, config, prompts) +@functools.lru_cache(1) def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 880d569cd..c615ec5ae 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb +Subproject commit c615ec5ae550da770606ce9f82775cf50e71082d From 238ea8bff582c15e9b444ee76efee831417d22e8 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 10:16:32 +0400 Subject: [PATCH 19/28] updte --- tests/python_tests/test_generate_api.py | 2 -- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 4d53f2576..232edc1b8 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,7 +11,6 @@ import sys from pathlib import Path import torch -import functools import math from ov_genai_test_utils import ( get_models_list, @@ -675,7 +674,6 @@ def test_left_pad(): run_hf_ov_genai_comparison_batched(models, config, prompts) -@functools.lru_cache(1) def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index c615ec5ae..2fb700042 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject 
commit c615ec5ae550da770606ce9f82775cf50e71082d +Subproject commit 2fb700042acfc3e734941fbbc332af2de17024a4 From 5a4c878e14c014d632a9247ecd4a6f449f45be85 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 14:13:25 +0400 Subject: [PATCH 20/28] Revert "Merge remote-tracking branch 'popovaan/clear_beam_info' into use-CB-as-backend" This reverts commit c28a023396e85a3d6db87eedd548ff5b60967368, reversing changes made to 67f47172e91931e208fc749c18a7151481f69331. --- src/cpp/src/continuous_batching_pipeline.cpp | 1 - src/cpp/src/sampler.hpp | 6 ------ 2 files changed, 7 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 1b682153b..ad190bd00 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -272,7 +272,6 @@ class ContinuousBatchingPipeline::Impl { result.m_status = generation->get_status(); results.push_back(std::move(result)); } - m_sampler->clear_beam_search_info(); OPENVINO_ASSERT(results.size() == input_ids.size()); return results; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 1bcee8a1b..095c795a4 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -247,8 +247,6 @@ class Sampler { SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits); void set_seed(size_t seed) { rng_engine.seed(seed); } - - void clear_beam_search_info(); }; SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits) { @@ -580,8 +578,4 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp } } } - -void Sampler::clear_beam_search_info() { - m_beam_search_info.clear(); -} } From cf35f190b5f7e3d2d2f08303b0096bdc9ba0ccc6 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 14:13:57 +0400 Subject: [PATCH 21/28] revert spelling --- tests/python_tests/test_generate_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 232edc1b8..4089b3f93 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -243,7 +243,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): ] batched_prompts = [ ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest novel ever: '], + ['hello', 'Here is the longest nowel ever: '], ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], ['table is made', 'table is made [force left pad tokens]'] ] From 12061afdd1f1f2248ac816343b685f59e5e0a129 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 14:46:31 +0400 Subject: [PATCH 22/28] relax abs_tol --- tests/python_tests/test_generate_api.py | 2 +- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 4089b3f93..3c84e55f7 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -695,4 +695,4 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): if 1 != generation_config.get("num_beams", 1): # Stateful puts zeroes to generated.scores. Don't compare them. 
for gen, ref in zip(generated.scores, reference.scores): - assert math.isclose(gen, ref, abs_tol=0.0001) + assert math.isclose(gen, ref, abs_tol=0.0003) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 2fb700042..880d569cd 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 2fb700042acfc3e734941fbbc332af2de17024a4 +Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb From 6d7a468f256884f13e9e6505e62adc496b4a1d8b Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 15 Jul 2024 22:55:13 +0400 Subject: [PATCH 23/28] lru_cache --- tests/python_tests/test_generate_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 3c84e55f7..4662a6e4a 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,6 +11,7 @@ import sys from pathlib import Path import torch +import functools import math from ov_genai_test_utils import ( get_models_list, @@ -674,6 +675,7 @@ def test_left_pad(): run_hf_ov_genai_comparison_batched(models, config, prompts) +@functools.lru_cache(1) def get_continuous_batching(path): return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') From c6d345a7bce94338dbffc6c18f2adef26d492a0c Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 16 Jul 2024 23:35:43 +0400 Subject: [PATCH 24/28] Add CB streaming --- .../genai/continuous_batching_pipeline.hpp | 5 ++-- .../openvino/genai/generation_handle.hpp | 1 + src/cpp/src/continuous_batching_pipeline.cpp | 28 +++++++++++++------ src/cpp/src/generation_handle.cpp | 4 +++ src/cpp/src/generation_stream.hpp | 3 ++ src/cpp/src/llm_pipeline.cpp | 16 ++++++++--- src/cpp/src/synchronized_queue.hpp | 6 ++++ src/python/py_generate_pipeline.cpp | 16 +++++++++-- tests/python_tests/test_generate_api.py | 13 +++++++++ thirdparty/openvino_tokenizers | 2 +- 10 files changed, 76 insertions(+), 18 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index f13cc55c4..e9a1add9f 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -10,6 +10,7 @@ #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/streamer_base.hpp" #include "openvino/genai/visibility.hpp" namespace ov::genai { @@ -63,7 +64,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& input_ids, const std::vector& sampling_params); - std::vector generate(const std::vector& prompts, const std::vector& sampling_params); + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer=nullptr); + std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const std::shared_ptr& streamer=nullptr); }; } diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 556f4b812..8d00ae0e9 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -74,6 +74,7 @@ class 
OPENVINO_GENAI_EXPORTS GenerationHandleImpl { bool can_read(); + GenerationOutputs back(); // Reads result of a generation for single iteration GenerationOutputs read(); // Reads all generated tokens for all sequences diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index cfdbae3a0..970b47c5b 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -238,7 +238,7 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector& input_ids, const std::vector& sampling_params) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); @@ -251,8 +251,18 @@ class ContinuousBatchingPipeline::Impl { std::vector results; results.reserve(m_awaiting_requests.size()); - while (has_non_finished_requests()) { + bool continue_generation = true; + while (has_non_finished_requests() && continue_generation) { step(); + if (streamer) { + std::unordered_map token = generations.at(0).get()->back(); + OPENVINO_ASSERT(1 == token.size()); + OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size()); + continue_generation = !streamer->put(token.begin()->second.generated_token_ids.at(0)); + } + } + if (streamer) { + streamer->end(); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { @@ -278,7 +288,7 @@ class ContinuousBatchingPipeline::Impl { return results; } - std::vector generate(const std::vector& prompts, std::vector sampling_params) { + std::vector generate(const std::vector& prompts, std::vector sampling_params, const std::shared_ptr& streamer) { std::vector input_ids; input_ids.reserve(prompts.size()); for (const std::string& prompt : prompts) { @@ -287,7 +297,7 @@ class ContinuousBatchingPipeline::Impl { input_ids.push_back(m_tokenizer.encode(prompt).input_ids); timer.end(); } - std::vector encoded = generate(input_ids, sampling_params); + std::vector encoded = generate(input_ids, sampling_params, streamer); std::vector decoded; decoded.reserve(encoded.size()); for (EncodedGenerationResult& res : encoded) { @@ -350,10 +360,10 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params) { - return m_impl->generate(input_ids, sampling_params); +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer) { + return m_impl->generate(input_ids, sampling_params, streamer); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params) { - return m_impl->generate(prompts, sampling_params); -} \ No newline at end of file +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const std::shared_ptr& streamer) { + return m_impl->generate(prompts, sampling_params, streamer); +} diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index a0187025e..26cc12604 100644 --- 
a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -20,6 +20,10 @@ bool GenerationHandleImpl::can_read() { return m_generation_stream->can_read(); } +std::unordered_map GenerationHandleImpl::back() { + return m_generation_stream->back(); +} + std::unordered_map GenerationHandleImpl::read() { return m_generation_stream->read(); } diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp index 0d51897e8..1ac2eefef 100644 --- a/src/cpp/src/generation_stream.hpp +++ b/src/cpp/src/generation_stream.hpp @@ -31,6 +31,9 @@ class GenerationStream { } // Retriving vector of pairs as we can generate multiple outputs for a single prompt + GenerationOutputs back() { + return m_output_queue.back(); + } GenerationOutputs read() { return m_output_queue.pull(); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2fcd77318..1e654b4dd 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -403,9 +403,6 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { - if (!std::holds_alternative(streamer)) { - OPENVINO_THROW("streamer isn't supported for Continuous Batching"); - } std::vector input_ids = std::visit(overloaded{ [](const ov::Tensor& inp) { size_t batch_size = inp.get_shape().at(0); @@ -446,7 +443,18 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { }, inputs); const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. - std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}); + std::shared_ptr streamer_ptr = std::visit(overloaded{ + [this](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [this](const std::shared_ptr& streamer) { + return streamer; + }, + [this](std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); + std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer_ptr); std::vector> plain_tokens; std::vector plain_scores; for (EncodedGenerationResult& res : generated) { diff --git a/src/cpp/src/synchronized_queue.hpp b/src/cpp/src/synchronized_queue.hpp index 0c2cd3180..bd025f1b7 100644 --- a/src/cpp/src/synchronized_queue.hpp +++ b/src/cpp/src/synchronized_queue.hpp @@ -17,6 +17,12 @@ class SynchronizedQueue SynchronizedQueue(const SynchronizedQueue&&) = delete; SynchronizedQueue& operator=(const SynchronizedQueue&) = delete; + T back() { + std::unique_lock lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + return m_queue.back(); + } + T pull() { std::unique_lock lock(m_mutex); m_cv.wait(lock, [this]{return !m_queue.empty();}); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 942c7a284..e8ce0b0a2 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -610,6 +610,18 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) - .def("generate", py::overload_cast&, const std::vector&>(&ContinuousBatchingPipeline::generate)) - .def("generate", py::overload_cast&, const 
std::vector&>(&ContinuousBatchingPipeline::generate)); + .def( + "generate", + py::overload_cast&, const std::vector&, const std::shared_ptr&>(&ContinuousBatchingPipeline::generate), + py::arg("input_ids"), + py::arg("sampling_params"), + py::arg("streamer") = nullptr + ) + .def( + "generate", + py::overload_cast&, const std::vector&, const std::shared_ptr&>(&ContinuousBatchingPipeline::generate), + py::arg("propts"), + py::arg("sampling_params"), + py::arg("streamer") = nullptr + ); } diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 4662a6e4a..a796aa07e 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -698,3 +698,16 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): # Stateful puts zeroes to generated.scores. Don't compare them. for gen, ref in zip(generated.scores, reference.scores): assert math.isclose(gen, ref, abs_tol=0.0003) + +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +def test_cb_streamer_vs_return_vs_stateful(prompt): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + cb = get_continuous_batching(path) + streamed = [] + generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = stateful.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) == reference diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 04795c1b7..880d569cd 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 04795c1b78c61e3294d1744c78a8ebb5e129256c +Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb From bc56ca64eb30a9ebf4fea20323ff2e09079a7809 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Wed, 17 Jul 2024 10:01:37 +0400 Subject: [PATCH 25/28] use StreamerVariant --- .../genai/continuous_batching_pipeline.hpp | 5 +-- src/cpp/src/continuous_batching_pipeline.cpp | 31 ++++++++++++++----- src/cpp/src/llm_pipeline.cpp | 13 +------- src/python/py_generate_pipeline.cpp | 10 +++--- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index e9a1add9f..43c3f4f80 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -10,6 +10,7 @@ #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/llm_pipeline.hpp" #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/visibility.hpp" @@ -64,7 +65,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer=nullptr); - std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const std::shared_ptr& streamer=nullptr); + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + std::vector generate(const std::vector& prompts, const std::vector& 
sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); }; } diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 970b47c5b..08a66ef92 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -12,11 +12,15 @@ #include "sampler.hpp" #include "model_runner.hpp" #include "scheduler.hpp" +#include "text_callback_streamer.hpp" #include "timer.hpp" #include "debug_utils.hpp" using namespace ov::genai; +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) -> overloaded; + void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { @@ -238,9 +242,20 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [](const std::shared_ptr& streamer) { + return streamer; + }, + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); std::vector generations; for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { @@ -254,15 +269,15 @@ class ContinuousBatchingPipeline::Impl { bool continue_generation = true; while (has_non_finished_requests() && continue_generation) { step(); - if (streamer) { + if (streamer_ptr) { std::unordered_map token = generations.at(0).get()->back(); OPENVINO_ASSERT(1 == token.size()); OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size()); - continue_generation = !streamer->put(token.begin()->second.generated_token_ids.at(0)); + continue_generation = !streamer_ptr->put(token.begin()->second.generated_token_ids.at(0)); } } - if (streamer) { - streamer->end(); + if (streamer_ptr) { + streamer_ptr->end(); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { @@ -288,7 +303,7 @@ class ContinuousBatchingPipeline::Impl { return results; } - std::vector generate(const std::vector& prompts, std::vector sampling_params, const std::shared_ptr& streamer) { + std::vector generate(const std::vector& prompts, std::vector sampling_params, const StreamerVariant& streamer) { std::vector input_ids; input_ids.reserve(prompts.size()); for (const std::string& prompt : prompts) { @@ -360,10 +375,10 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const std::shared_ptr& streamer) { +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { return m_impl->generate(input_ids, sampling_params, streamer); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const 
std::vector& sampling_params, const std::shared_ptr& streamer) { +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const StreamerVariant& streamer) { return m_impl->generate(prompts, sampling_params, streamer); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 1e654b4dd..acf7059e7 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -443,18 +443,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { }, inputs); const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. - std::shared_ptr streamer_ptr = std::visit(overloaded{ - [this](std::monostate) -> std::shared_ptr { - return nullptr; - }, - [this](const std::shared_ptr& streamer) { - return streamer; - }, - [this](std::function& streamer) -> std::shared_ptr { - return std::make_unique(m_tokenizer, streamer); - } - }, streamer); - std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer_ptr); + std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer); std::vector> plain_tokens; std::vector plain_scores; for (EncodedGenerationResult& res : generated) { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index e8ce0b0a2..6df19bf13 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -612,16 +612,16 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) .def( "generate", - py::overload_cast&, const std::vector&, const std::shared_ptr&>(&ContinuousBatchingPipeline::generate), + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), py::arg("input_ids"), py::arg("sampling_params"), - py::arg("streamer") = nullptr + py::arg("streamer") = std::monostate{} ) .def( "generate", - py::overload_cast&, const std::vector&, const std::shared_ptr&>(&ContinuousBatchingPipeline::generate), - py::arg("propts"), + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("prompts"), py::arg("sampling_params"), - py::arg("streamer") = nullptr + py::arg("streamer") = std::monostate{} ); } From c70b909b8c64d39b1cc865bb7a958a542514a74e Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 18 Jul 2024 18:29:51 +0400 Subject: [PATCH 26/28] Add CB naive chat Merge after https://github.com/openvinotoolkit/openvino.genai/pull/641 --- .../genai/continuous_batching_pipeline.hpp | 12 +++++ src/cpp/src/continuous_batching_pipeline.cpp | 47 ++++++++++++++++--- src/cpp/src/llm_pipeline.cpp | 38 ++++++++++----- tests/python_tests/ov_genai_test_utils.py | 5 ++ tests/python_tests/test_chat_generate_api.py | 20 +++++++- tests/python_tests/test_generate_api.py | 28 ++++------- 6 files changed, 114 insertions(+), 36 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 43c3f4f80..3ec23b439 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -67,5 +67,17 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { // more high level interface, which can 
process multiple prompts in continuous batching manner std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + + /** + * @brief start chat with keeping history in kv cache. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + */ + void finish_chat(); }; } diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 08a66ef92..e3a86e2d8 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -56,6 +56,8 @@ class ContinuousBatchingPipeline::Impl { std::vector m_awaiting_requests; // Mutex protecting access to m_awaiting_requests, so add_request and step methods can be called from different threads std::mutex m_awaiting_requests_mutex; + bool m_is_chat_conversation = false; + ChatHistory m_history; void _free_non_running_requests() { @@ -305,12 +307,22 @@ class ContinuousBatchingPipeline::Impl { std::vector generate(const std::vector& prompts, std::vector sampling_params, const StreamerVariant& streamer) { std::vector input_ids; - input_ids.reserve(prompts.size()); - for (const std::string& prompt : prompts) { - static ManualTimer timer("tokenize"); + static ManualTimer timer("tokenize"); + if (m_is_chat_conversation) { + OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); + m_history.push_back({{"role", "user"}, {"content", prompts.at(0)}}); + constexpr bool add_generation_prompt = true; + std::string history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); timer.start(); - input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + input_ids.push_back(m_tokenizer.encode(history).input_ids); timer.end(); + } else { + input_ids.reserve(prompts.size()); + for (const std::string& prompt : prompts) { + timer.start(); + input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } } std::vector encoded = generate(input_ids, sampling_params, streamer); std::vector decoded; @@ -318,8 +330,11 @@ class ContinuousBatchingPipeline::Impl { for (EncodedGenerationResult& res : encoded) { std::vector generated; generated.reserve(res.m_generation_ids.size()); - for (const std::vector& tokens : res.m_generation_ids) { - generated.push_back(m_tokenizer.decode(tokens)); + for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) { + generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); + if (m_is_chat_conversation && 0 == idx) { + m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); + } } decoded.push_back(GenerationResult{ res.m_request_id, @@ -330,6 +345,18 @@ class ContinuousBatchingPipeline::Impl { } return decoded; } + + void start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; + }; + + void finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); + }; }; ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, @@ -382,3 +409,11 @@ std::vector ContinuousBatchingPipeline::generate(const std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, 
const std::vector& sampling_params, const StreamerVariant& streamer) { return m_impl->generate(prompts, sampling_params, streamer); } + +void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->start_chat(system_message); +}; + +void ContinuousBatchingPipeline::finish_chat() { + m_impl->finish_chat(); +}; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index acf7059e7..1d68d4c74 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -115,6 +115,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { EncodedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); encoded_input = m_tokenizer.encode(*input_vector); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -386,16 +387,31 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { - EncodedInputs input_ids_att = std::visit(overloaded{ - [this](const std::string& prompt) { - return m_tokenizer.encode(prompt); + std::vector prompts = std::visit(overloaded{ + [](const std::string& prompt) { + return std::vector{prompt}; }, - [this](std::vector& prompts) { - return m_tokenizer.encode(prompts); + [](std::vector& prompts) { + return prompts; } }, inputs); - EncodedResults encoded = generate(input_ids_att, generation_config, streamer); - return {m_tokenizer.decode(encoded.tokens), encoded.scores}; + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector generated = m_impl.generate( + prompts, + std::vector{prompts.size(), config}, + streamer + ); + std::vector plain_replies; + std::vector plain_scores; + for (GenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_replies), std::move(plain_scores)}; } EncodedResults generate( @@ -457,12 +473,12 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { } void start_chat(const std::string& system_message) override { - OPENVINO_THROW("start_chat() isn't implemented."); - } + m_impl.start_chat(); + }; void finish_chat() override { - OPENVINO_THROW("finish_chat() isn't implemented."); - } + m_impl.finish_chat(); + }; }; } diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 4ba71a1d4..c513353e4 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -215,3 +215,8 @@ def load_pipe(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) return ov_genai.LLMPipeline(str(temp_path)) + + +@functools.lru_cache(1) +def get_continuous_batching(path): + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 94de8f6cc..814bde076 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -1,6 
+1,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import math import openvino import openvino_tokenizers import openvino_genai as ov_genai @@ -12,7 +13,8 @@ read_model, load_tok, model_tmp_path, - get_chat_templates + get_chat_templates, + get_continuous_batching, ) @@ -163,3 +165,19 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): print(f'hf reference: {full_history_str_hf}') print(f'ov_genai out: {full_history_str}') assert full_history_str == full_history_str_hf + + +@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): + model_id, path, tokenizer, model, stateful = read_model(model_descr) + cb = get_continuous_batching(path) + stateful.start_chat() + cb.start_chat() + for question in quenstions: + generated = cb.generate(question, **generation_config) + reference = stateful.generate(question, **generation_config) + assert generated == reference + # Test that finish_chat() doesn't fail just in case. + cb.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index a796aa07e..6b859b05d 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,7 +11,6 @@ import sys from pathlib import Path import torch -import functools import math from ov_genai_test_utils import ( get_models_list, @@ -20,6 +19,7 @@ load_tok, model_tmp_path, STOP_CRITERIA_MAP, + get_continuous_batching, ) @@ -675,39 +675,31 @@ def test_left_pad(): run_hf_ov_genai_comparison_batched(models, config, prompts) -@functools.lru_cache(1) -def get_continuous_batching(path): - return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') - - @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompt", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit -def test_continuous_batching_vs_stateful(prompt, generation_config): - model_id, path, tokenizer, model, stateful = read_model(( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - Path("TinyLlama-1.1B-Chat-v1.0") - )) +def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(model_descr) config = ov_genai.GenerationConfig() config.max_new_tokens = 100 cb = get_continuous_batching(path) generated = cb.generate(prompt, **generation_config) reference = stateful.generate(prompt, **generation_config) assert generated.texts == reference.texts - if 1 != generation_config.get("num_beams", 1): + if 1 != generation_config.get("num_return_sequences", 1): # Stateful puts zeroes to generated.scores. Don't compare them. 
for gen, ref in zip(generated.scores, reference.scores): assert math.isclose(gen, ref, abs_tol=0.0003) @pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit -def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id, path, tokenizer, model, stateful = read_model(( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - Path("TinyLlama-1.1B-Chat-v1.0") - )) +def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): + model_id, path, tokenizer, model, stateful = read_model(model_descr) cb = get_continuous_batching(path) streamed = [] generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) reference = stateful.generate(prompt, max_new_tokens=20) - assert generated == "".join(streamed) == reference + assert generated == "".join(streamed) + assert "".join(streamed) == reference From 54684396733bb460ec96866d725f74338c6f2e24 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 19 Jul 2024 06:27:26 +0400 Subject: [PATCH 27/28] correct tests --- tests/python_tests/test_chat_generate_api.py | 2 +- tests/python_tests/test_generate_api.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 814bde076..3cf715e4e 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -171,7 +171,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): - model_id, path, tokenizer, model, stateful = read_model(model_descr) + model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) cb = get_continuous_batching(path) stateful.start_chat() cb.start_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 6b859b05d..aca6314dc 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -696,7 +696,10 @@ def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): - model_id, path, tokenizer, model, stateful = read_model(model_descr) + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) cb = get_continuous_batching(path) streamed = [] generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) From 4f77aa903c8096c16da8f51d6a147912265decf2 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 19 Jul 2024 07:09:58 +0400 Subject: [PATCH 28/28] correct test_continuous_batching_vs_stateful --- tests/python_tests/test_generate_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index aca6314dc..abef1e528 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -680,7 +680,10 @@ def test_left_pad(): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): - model_id, path, 
tokenizer, model, stateful = read_model(model_descr) + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) config = ov_genai.GenerationConfig() config.max_new_tokens = 100 cb = get_continuous_batching(path)
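
Taken together, the last few commits settle on one comparison pattern for the continuous-batching backend: run the same prompts through the CB-backed pipeline and the stateful pipeline, compare the decoded texts exactly, and compare scores only when several sequences are returned, since the stateful pipeline reports zero scores in the single-sequence case. Below is a sketch of that pattern following the test code above; the helper name is invented for illustration and the tolerance mirrors the value used in the tests.

    import math

    def compare_cb_vs_stateful(cb_pipe, stateful_pipe, prompts, **generation_config):
        # Both pipelines expose the same generate() interface in these tests.
        generated = cb_pipe.generate(prompts, **generation_config)
        reference = stateful_pipe.generate(prompts, **generation_config)
        assert generated.texts == reference.texts
        if generation_config.get("num_return_sequences", 1) != 1:
            # Scores are only comparable when multiple sequences are returned;
            # otherwise the stateful pipeline fills them with zeroes.
            for gen, ref in zip(generated.scores, reference.scores):
                assert math.isclose(gen, ref, abs_tol=0.0003)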