From 26d178f52653261397d59d2010b0e4a658eb86bc Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 21 May 2024 17:55:05 +0200 Subject: [PATCH 01/32] Removed max_padding --- .../apps/accuracy_sample.cpp | 1 - .../apps/throughput_benchmark.cpp | 2 - .../library/include/scheduler_config.hpp | 4 -- .../library/src/scheduler.hpp | 6 -- .../library/src/tests/scheduler.cpp | 8 --- .../cpp/continuous_batching/python/python.cpp | 3 +- .../python/tests/models/real_models | 55 +++++++++---------- 7 files changed, 27 insertions(+), 52 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index 2a2a841d7..e7cdaa7f3 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -71,7 +71,6 @@ int main(int argc, char* argv[]) try { .dynamic_split_fuse = dynamic_split_fuse, // vLLM specific params .max_num_seqs = 2, - .max_paddings = 8, }; ContinuousBatchingPipeline pipe(models_path, scheduler_config); diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp index 595d1aba7..d29933806 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp @@ -181,7 +181,6 @@ int main(int argc, char* argv[]) try { .block_size = 32, .dynamic_split_fuse = dynamic_split_fuse, .max_num_seqs = 256, // not used if dynamic_split_fuse=True - .max_paddings = 256, // not used if dynamic_split_fuse=True }; std::cout << "Benchmarking parameters: " << std::endl; @@ -189,7 +188,6 @@ int main(int argc, char* argv[]) try { std::cout << "\tScheduling type: " << (scheduler_config.dynamic_split_fuse ? "dynamic split-fuse" : "vLLM") << std::endl; if (!scheduler_config.dynamic_split_fuse) { std::cout << "\tMax number of batched sequences: " << scheduler_config.max_num_seqs << std::endl; - std::cout << "\tMax number of padding tokens within prompt batch: " << scheduler_config.max_paddings << std::endl; } std::cout << "Dataset parameters: " << std::endl; std::cout << "\tNum prompts: " << num_prompts << std::endl; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp index eebcdc2fb..5bdf163e7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp @@ -27,8 +27,4 @@ struct SchedulerConfig { // max number of scheduled sequences (you can think of it as "max batch size") std::size_t max_num_seqs = 256; - // max number of padding tokens applied when we schedule a prompt phase - // e.g. 
if total number of padded tokens within a batch a greater than this value, then - // new sequnce is not added to batch - std::size_t max_paddings = 256; }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index ff1f9b84a..5890cc78b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -309,7 +309,6 @@ class Scheduler { // Current scheduling method schedules prompts only in a manner similar to vLLM: // - Limits max batch size by: // - max_num_seqs (256 in vLLM's defaults) - // - max_paddings (256 in vLLM's defaults) // - max_num_batched_tokens (max_model_length (and at least 2048) in vLLM's defaults) OPENVINO_ASSERT(!m_config.dynamic_split_fuse, "Internal error: we are in vLLM scheduling"); @@ -345,11 +344,6 @@ class Scheduler { if (num_available_tokens_in_megabatch < max_sequence_len) break; - // apply max padding tokens limitations - size_t total_num_paddings = max_sequence_len * (scheduler_output.m_scheduled_sequence_groups_ids.size() + 1) - (num_scheduled_tokens + sequence_len); - if (total_num_paddings > m_config.max_paddings) - break; - // apply KV cache limitations const size_t num_required_blocks = (sequence_len + m_config.block_size - 1) / m_config.block_size; if (!m_block_manager.can_allocate_blocks(num_required_blocks)) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp index 3b0ae698c..f2aa62586 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp @@ -25,7 +25,6 @@ TEST(TestScheduler, general_test) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -33,7 +32,6 @@ TEST(TestScheduler, general_test) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -123,7 +121,6 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -131,7 +128,6 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -195,7 +191,6 @@ TEST(TestScheduler, test_partial_preemption) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -203,7 +198,6 @@ TEST(TestScheduler, test_partial_preemption) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -294,7 +288,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -302,7 +295,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { diff --git 
a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp b/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp index ca3d6f5d3..583efb971 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp @@ -90,8 +90,7 @@ PYBIND11_MODULE(py_continuous_batching, m) { .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) .def_readwrite("block_size", &SchedulerConfig::block_size) .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) - .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) - .def_readwrite("max_paddings", &SchedulerConfig::max_paddings); + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); py::class_(m, "ContinuousBatchingPipeline") .def(py::init()) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index fefe45983..44d2897b1 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -1,8 +1,8 @@ -# Set of models with accuracy issues, because of PA: +# Set of models with accuracy issues, because of PA EleutherAI/pythia-160m bigscience/bloomz-1b7 bigscience/bloomz-560m -databricks/dolly-v2-3b +# XEON: databricks/dolly-v2-3b tiiuae/falcon-rw-7b bigcode/starcoder2-3b openbmb/MiniCPM-2B-sft-bf16 @@ -16,30 +16,28 @@ google/pegasus-big_patent google/pegasus-large # # Set of models, which require support in optimum-intel: -# optimum-intel: Trying to export a RefinedWebModel model, that is a custom or unsupported architecture: nomic-ai/gpt4all-falcon -# optimum-intel: Trying to export a internlm model, that is a custom or unsupported architecture: internlm/internlm-chat-7b -# optimum-intel: Trying to export a mosaic-gpt model, that is a custom or unsupported architecture: mosaicml/mpt-1b-redpajama-200b -# optimum-intel: AttributeError: Could not find the attribute named "num_key_value_heads" in the normalized config: BAAI/Aquila-7B -# optimum-intel: PermissionError: [Errno 13] Permission denied: internlm/internlm2-7b -# optimum-intel: AttributeError: 'NoneType' object has no attribute 'device': Salesforce/codegen2-1b -# optimum-intel: TypeError: Object of type method is not JSON serializable: Salesforce/xgen-7b-8k-base -# optimum-intel: IndexError: tuple index out of range: facebook/blenderbot-3B +internlm/internlm-chat-7b +BAAI/Aquila-7B +internlm/internlm2-7b +Salesforce/codegen2-1b +Salesforce/xgen-7b-8k-base +facebook/blenderbot-3B # # Set of models, failed because of CPU limitation # CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b # # Set of failed models, because of PA: -# Exception from src/core/src/shape_util.cpp:65: BAAI/AquilaChat2-7B -# Exception from src/core/src/shape_util.cpp:65: BAAI/AquilaChat-7B -# Exception from src/core/src/shape_util.cpp:65: baichuan-inc/Baichuan-7B -# Exception from src/core/src/shape_util.cpp:65: tiiuae/falcon-7b -# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-128k-instruct -# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-4k-instruct -# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/starcoderbase-3b -# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/gpt_bigcode-santacoder -# 
RuntimeError: Check 'unregistered_parameters.str().empty()': nomic-ai/gpt4all-mpt -# RuntimeError: Check 'unregistered_parameters.str().empty()': mosaicml/mpt-7b -# RuntimeError: Check 'unregistered_parameters.str().empty()': facebook/opt-350m +BAAI/AquilaChat2-7B +BAAI/AquilaChat-7B +baichuan-inc/Baichuan-7B +tiiuae/falcon-7b +microsoft/Phi-3-mini-128k-instruct +microsoft/Phi-3-mini-4k-instruct +bigcode/starcoderbase-3b +bigcode/gpt_bigcode-santacoder +nomic-ai/gpt4all-mpt +mosaicml/mpt-7b +facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B @@ -59,8 +57,8 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat # big model, not tried: xverse/XVERSE-MoE-A4.2B # # Set of passed models: -microsoft/phi-2 -microsoft/phi-1_5 +# XEON: microsoft/phi-2 +# XEON: microsoft/phi-1_5 EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B @@ -75,8 +73,8 @@ openai-community/gpt2-xl gpt2 gpt2-xl nomic-ai/gpt4all-j -stabilityai/stablelm-3b-4e1t -stabilityai/stablelm-2-zephyr-1_6b +# Xeon: stabilityai/stablelm-3b-4e1t +# Xeon: stabilityai/stablelm-2-zephyr-1_6b meta-llama/Llama-2-7b-hf meta-llama/Meta-Llama-3-8B-Instruct meta-llama/CodeLlama-7b-hf @@ -95,7 +93,6 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# HF: DeciCoderAttention.forward() got an unexpected keyword argument 'cache_position': Deci/DeciCoder-1b -# HF: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5 -# HF: The generation config instance is invalid -- `.validate(): lmsys/vicuna-7b-v1.5 -# HF: The generation config instance is invalid -- `.validate(): lmsys/longchat-7b-v1.5-32k \ No newline at end of file +# Xeon: openchat/openchat_3.5 +lmsys/vicuna-7b-v1.5 +lmsys/longchat-7b-v1.5-32k \ No newline at end of file From 1aa23fe86d200b4f92a4be22842cfdb104844958 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 22 May 2024 15:12:55 +0200 Subject: [PATCH 02/32] Updated tokenizers --- .../python/tests/models/real_models | 16 ++++++++-------- .../python/tests/requirements.txt | 7 ++++++- thirdparty/openvino_tokenizers | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 44d2897b1..e9ae2f441 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -2,7 +2,7 @@ EleutherAI/pythia-160m bigscience/bloomz-1b7 bigscience/bloomz-560m -# XEON: databricks/dolly-v2-3b +databricks/dolly-v2-3b tiiuae/falcon-rw-7b bigcode/starcoder2-3b openbmb/MiniCPM-2B-sft-bf16 @@ -25,6 +25,8 @@ facebook/blenderbot-3B # # Set of models, failed because of CPU limitation # CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b +# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b +# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 # # Set of failed models, because of PA: BAAI/AquilaChat2-7B @@ -57,8 +59,8 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat # big model, not tried: xverse/XVERSE-MoE-A4.2B # # Set of passed models: -# XEON: 
microsoft/phi-2 -# XEON: microsoft/phi-1_5 +microsoft/phi-2 +microsoft/phi-1_5 EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B @@ -73,14 +75,12 @@ openai-community/gpt2-xl gpt2 gpt2-xl nomic-ai/gpt4all-j -# Xeon: stabilityai/stablelm-3b-4e1t -# Xeon: stabilityai/stablelm-2-zephyr-1_6b +stabilityai/stablelm-3b-4e1t +stabilityai/stablelm-2-zephyr-1_6b meta-llama/Llama-2-7b-hf meta-llama/Meta-Llama-3-8B-Instruct meta-llama/CodeLlama-7b-hf lmsys/vicuna-7b-v1.3 -openlm-research/open_llama_3b -openlm-research/open_llama_3b_v2 mistralai/Mistral-7B-v0.1 mistralai/Mistral-7B-Instruct-v0.1 allenai/OLMo-1B-hf @@ -93,6 +93,6 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# Xeon: openchat/openchat_3.5 +openchat/openchat_3.5 lmsys/vicuna-7b-v1.5 lmsys/longchat-7b-v1.5-32k \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index d49f1b043..4a94dad33 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -20,6 +20,7 @@ bitsandbytes # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat # - mosaicml/mpt-7b +# - internlm/internlm2-7b einops # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat @@ -27,4 +28,8 @@ transformers_stream_generator # - openbmb/MiniCPM-V-2 torchvision # - openbmb/MiniCPM-V-2 -timm \ No newline at end of file +timm +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - Salesforce/xgen-7b-8k-base +tiktoken \ No newline at end of file diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index c75450346..200cffc10 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1 +Subproject commit 200cffc10e3479b00006b613dc3c9fa48301177d From a44e0aa16a62179adab5cd6c2be561bb291a762d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 May 2024 16:53:42 +0200 Subject: [PATCH 03/32] Endless loop fix. 
--- .../src/continuous_batching_pipeline.cpp | 24 +++++++- .../library/src/scheduler.hpp | 18 ++++-- .../library/src/sequence_group.hpp | 54 +++++++++++++++++- .../python/tests/common.py | 6 +- .../python/tests/test_preemption.py | 55 ++++++++++++++++--- 5 files changed, 141 insertions(+), 16 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 3bfd2dabf..40b1d0223 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -147,6 +147,14 @@ class ContinuousBatchingPipeline::Impl { timer.end(); } + // if no tokens were scheduled, we are out of memory + if (scheduler_output.m_total_num_scheduled_tokens == 0) { + for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { + m_requests[sequence_group_id]->set_out_of_memory(); + } + return {}; + } + ov::Tensor logits; { static ManualTimer timer("forward"); @@ -194,7 +202,6 @@ class ContinuousBatchingPipeline::Impl { } // perform post-processing of current step - std::vector currently_finished_requests; { static ManualTimer timer("create finished results"); @@ -221,6 +228,14 @@ class ContinuousBatchingPipeline::Impl { return !m_requests.empty(); } + bool out_of_memory() const { + for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { + if (m_requests[sequence_group_id]->out_of_memory()) + return true; + } + return false; + } + std::vector generate(const std::vector prompts, std::vector sampling_params) { OPENVINO_ASSERT(!has_running_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(prompts.size() == sampling_params.size()); @@ -232,11 +247,14 @@ class ContinuousBatchingPipeline::Impl { std::vector results; results.reserve(m_requests.size()); - while (has_running_requests()) { + while (has_running_requests() && !out_of_memory()) { std::vector partial_results = step(); - results.insert(results.end(), partial_results.begin(), partial_results.end()); + if (partial_results.size() > 0) + results.insert(results.end(), partial_results.begin(), partial_results.end()); } + OPENVINO_ASSERT(!out_of_memory(), "Not enough memory for processing the requests."); + // sort results according to request_id to return results in order of initial prompts std::sort(results.begin(), results.end(), [] (const GenerationResult& r1, const GenerationResult& r2) -> bool { return r1.m_request_id < r2.m_request_id; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index ff1f9b84a..1cc357f7a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -55,6 +55,8 @@ class Scheduler { } } + _clear_waiting_sequences(sequence_groups); + return scheduler_output; } @@ -104,9 +106,10 @@ class Scheduler { m_block_manager.free_sequence(seq_id); } sequence_group->reset(); + sequence_group->set_waiting(); return m_block_manager.num_free_blocks() > prev_blocks_count; } - + // currently partial preemtion is enabled only for single running sequence case // TODO: implement partial preemption for case with muliple sequences in group for (size_t s = 0; s < num_running_sequences; ++s) { @@ -150,6 +153,7 @@ class Scheduler { m_block_manager.free_sequence(seq_id); } sequence_group->preempt_tokens(preempted_tokens); + sequence_group->set_waiting(); return total_num_released_blocks > 0; } @@ -197,7 +201,7 @@ class Scheduler { for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; - if (!sequence_group->can_generate_tokens()) { + if (!sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { size_t num_running_seqs = sequence_group->num_running_seqs(); // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); @@ -249,7 +253,7 @@ class Scheduler { // Question: do we need to schedule preeempted first as it's done in vLLM? // Answer: preempted sequences have low priority, so they should be after "running" ones. 
So, here we // keep latencies for sequence groups of high priority - if (sequence_group->can_generate_tokens()) { + if (sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { OPENVINO_ASSERT(!sequence_group->has_finished()); size_t num_running_seqs = sequence_group->num_running_seqs(); size_t num_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; @@ -322,7 +326,7 @@ class Scheduler { for (size_t sequence_group_id = 0, num_scheduled_tokens = 0, max_sequence_len = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; - if (!sequence_group->can_generate_tokens()) { + if (!sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { size_t num_running_seqs = sequence_group->num_running_seqs(); // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); @@ -381,4 +385,10 @@ class Scheduler { } } } + + void _clear_waiting_sequences(const std::vector& sequence_groups) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + sequence_groups[sequence_group_id]->clear_waiting_sequences(); + } + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 29b0af513..c49a88a5a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -10,7 +10,9 @@ enum class SequenceStatus { RUNNING = 0, - FINISHED = 1 + FINISHED = 1, + OUT_OF_MEMORY = 2, + WAITING = 3 }; using TokenIds = std::vector; @@ -65,6 +67,14 @@ class Sequence { return m_status == SequenceStatus::RUNNING; } + bool out_of_memory() const { + return m_status == SequenceStatus::OUT_OF_MEMORY; + } + + bool is_waiting() const { + return m_status == SequenceStatus::WAITING; + } + void set_status(SequenceStatus status) { m_status = status; } @@ -279,6 +289,14 @@ class SequenceGroup { clear_scheduled_tokens(); } + void clear_waiting_sequences() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + m_sequences[seq_id]->set_status(SequenceStatus::RUNNING); + } + } + } + const TokenIds& get_prompt_ids() const { return m_prompt_ids; } @@ -321,4 +339,38 @@ class SequenceGroup { return false; return true; } + + void set_out_of_memory() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::OUT_OF_MEMORY); + } + } + } + + void set_waiting() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::WAITING); + } + } + } + + bool out_of_memory() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->out_of_memory()) { + return true; + } + } + return false; + } + + bool is_waiting() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + return true; + } + } + return false; + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 724c7cf71..e9b483b31 100644 --- 
a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -237,10 +237,14 @@ def generate_and_compare_with_reference_text(model_path: Path, prompts: List[str for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): assert ref_text == ov_text -def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None): +def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) + if generation_config is not None: + generation_config.rng_seed = 0 + generation_configs = [generation_config] * len(prompts) + _generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index 781d016d3..e0ba5b6ad 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -1,13 +1,54 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + import pytest +from dataclasses import dataclass +from py_continuous_batching import GenerationConfig, GenerationResult +from typing import List -from common import run_test_pipeline, get_models_list +from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ + DEFAULT_SCHEDULER_CONFIG, get_scheduler_config, run_test_pipeline, get_models_list, get_beam_search, get_greedy, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from test_sampling import RandomSamplingTestStruct -scheduler_params_list = [{"num_kv_blocks": 300, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, - {"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, # test preemption for dynamic_split_fuse - {"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}] # test preemption for vllm -@pytest.mark.parametrize("scheduler_params", scheduler_params_list) +scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), # output text does not match due to <\s> symbols problem + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] # output text does not match due to <\s> symbols problem +@pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit -def test_preemption(tmp_path, scheduler_params): - run_test_pipeline(tmp_path, "facebook/opt-125m", scheduler_params) \ No newline at end of file +def test_preemption(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], 
params[1]) + + +@pytest.mark.precommit +def test_out_of_memory(tmp_path): + with pytest.raises(RuntimeError) as excinfo: + run_test_pipeline(tmp_path, "facebook/opt-125m", {"num_kv_blocks": 1}) + assert "Not enough memory for processing the requests." in str(excinfo.value) + +multinomial_params = RandomSamplingTestStruct(generation_config=[get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k()], + prompts=["What is OpenVINO?", + "How are you?", + "Tell me something about Canada?", + ], + ref_texts=[ ["\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is"], + [" You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far"], + ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) + +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params.generation_config + for config in generation_configs: + config.rng_seed = 0 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) From e0815cf5401f173f670731d9512f7963985a6769 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 May 2024 17:07:26 +0200 Subject: [PATCH 04/32] Minor correction. 
--- .../library/src/continuous_batching_pipeline.cpp | 1 + .../causal_lm/cpp/continuous_batching/library/src/scheduler.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 40b1d0223..e2919c5d2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -202,6 +202,7 @@ class ContinuousBatchingPipeline::Impl { } // perform post-processing of current step + std::vector currently_finished_requests; { static ManualTimer timer("create finished results"); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index 1cc357f7a..e3bdb0c2a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -109,7 +109,7 @@ class Scheduler { sequence_group->set_waiting(); return m_block_manager.num_free_blocks() > prev_blocks_count; } - + // currently partial preemtion is enabled only for single running sequence case // TODO: implement partial preemption for case with muliple sequences in group for (size_t s = 0; s < num_running_sequences; ++s) { From 02045f51f7cc4ea2012b5fd4e1c4404648951bcf Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 22 May 2024 20:15:49 +0200 Subject: [PATCH 05/32] Updated list of models --- .../python/tests/common.py | 2 +- .../python/tests/models/real_models | 61 ++++++++++--------- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 724c7cf71..3f2187486 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -191,7 +191,7 @@ def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, add_special_tokens=False) serialize(tokenizer, model_path / "openvino_tokenizer.xml") serialize(detokenizer, model_path / "openvino_detokenizer.xml") diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index e9ae2f441..0a11e4f39 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -11,52 +11,60 @@ Qwen/Qwen-7B Qwen/Qwen-7B-Chat Qwen/Qwen1.5-0.5B Qwen/Qwen1.5-7B-Chat -rinna/bilingual-gpt-neox-4b -google/pegasus-big_patent -google/pegasus-large -# -# Set of models, which require support in optimum-intel: internlm/internlm-chat-7b BAAI/Aquila-7B internlm/internlm2-7b -Salesforce/codegen2-1b -Salesforce/xgen-7b-8k-base -facebook/blenderbot-3B -# -# Set of models, failed because of CPU limitation -# CPU: head 
size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b -# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b -# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 -# -# Set of failed models, because of PA: +openchat/openchat_3.5 +lmsys/vicuna-7b-v1.5 +lmsys/longchat-7b-v1.5-32k BAAI/AquilaChat2-7B BAAI/AquilaChat-7B baichuan-inc/Baichuan-7B tiiuae/falcon-7b microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct -bigcode/starcoderbase-3b -bigcode/gpt_bigcode-santacoder +microsoft/Phi-3-mini-4k-instruct# nomic-ai/gpt4all-mpt mosaicml/mpt-7b -facebook/opt-350m +# Set of models, failed because of C++ Cont. Batching +# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B +# +# Set of models, which require support in optimum-intel / transformers / models repositories: +# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base +# Trying to export a jais model, that is a custom or unsupported architecture: core42/jais-13b-chat +# IndexError: tuple index out of range: facebook/blenderbot-3B +# `pip install flash_attn`: OrionStarAI/Orion-14B-Base +# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-1B-hf +# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-7B-hf +# +# Set of models, failed because of CPU limitation +# head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b +# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b +# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 +# +# Set of failed models, because of PA: +# 'start' input is not a scalar: google/pegasus-big_patent +# 'start' input is not a scalar: google/pegasus-large +# 'stop' input is not a scalar: Salesforce/codegen2-1b +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/starcoderbase-3b +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/gpt_bigcode-santacoder +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: -# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B +# https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b # big model, not tried: core42/jais-13b -core42/jais-13b-chat +# see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 # big model, not tried: mistralai/Mixtral-8x7B-Instruct-v0.1 # big model, not tried: mosaicml/mpt-30b -OrionStarAI/Orion-14B-Base +# see optimum: OrionStarAI/Orion-14B-Base # big model, not tried: OrionStarAI/Orion-14B-Chat # big model, not tried: Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat -# big model, not tried: xverse/XVERSE-MoE-A4.2B +xverse/XVERSE-MoE-A4.2B # # Set of passed models: microsoft/phi-2 @@ -83,8 +91,6 @@ meta-llama/CodeLlama-7b-hf lmsys/vicuna-7b-v1.3 mistralai/Mistral-7B-v0.1 mistralai/Mistral-7B-Instruct-v0.1 -allenai/OLMo-1B-hf -allenai/OLMo-7B-hf 01-ai/Yi-6B Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl @@ -92,7 +98,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov # # Set of invalid models, because of HF: -# HF: Exception: data did not match 
any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -openchat/openchat_3.5 -lmsys/vicuna-7b-v1.5 -lmsys/longchat-7b-v1.5-32k \ No newline at end of file +# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat \ No newline at end of file From 6f60f9156e0b7a484962cede253ef1ffd41612aa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 22 May 2024 20:49:41 +0200 Subject: [PATCH 06/32] Proper flag to skip special tokens --- .../causal_lm/cpp/continuous_batching/python/tests/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 3f2187486..df27e6d4a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -191,7 +191,7 @@ def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, add_special_tokens=False) + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) serialize(tokenizer, model_path / "openvino_tokenizer.xml") serialize(detokenizer, model_path / "openvino_detokenizer.xml") From ec3ac962f9ee375f7c2f356165255bcc8b3732b6 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 23 May 2024 11:21:21 +0200 Subject: [PATCH 07/32] Updated models list --- .../python/tests/models/real_models | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 0a11e4f39..cf7785e30 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -22,9 +22,10 @@ BAAI/AquilaChat-7B baichuan-inc/Baichuan-7B tiiuae/falcon-7b microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct# +microsoft/Phi-3-mini-4k-instruct nomic-ai/gpt4all-mpt mosaicml/mpt-7b +mosaicml/mpt-7b-chat # Set of models, failed because of C++ Cont. 
Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # @@ -58,11 +59,13 @@ EleutherAI/gpt-neox-20b # see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 -# big model, not tried: mistralai/Mixtral-8x7B-Instruct-v0.1 -# big model, not tried: mosaicml/mpt-30b +mistralai/Mixtral-8x7B-Instruct-v0.1 +mosaicml/mpt-30b # see optimum: OrionStarAI/Orion-14B-Base # big model, not tried: OrionStarAI/Orion-14B-Chat -# big model, not tried: Qwen/Qwen1.5-MoE-A2.7B +CohereForAI/c4ai-command-r-v01 +openlm-research/open_llama_13b +Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat xverse/XVERSE-MoE-A4.2B # @@ -98,4 +101,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov # # Set of invalid models, because of HF: -# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat \ No newline at end of file +# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat From a553d49f9d56cb0206762f1218563304922a702f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 12:18:36 +0200 Subject: [PATCH 08/32] PyTest config. --- .../cpp/continuous_batching/python/tests/.pytest.ini | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini b/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini new file mode 100644 index 000000000..7bc73fe85 --- /dev/null +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini @@ -0,0 +1,5 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +[pytest] +addopts = -m precommit \ No newline at end of file From 96bf758e5cb57cf004804860bb4958116177879c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 12:24:37 +0200 Subject: [PATCH 09/32] Mark for real_models. --- .../cpp/continuous_batching/python/tests/test_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 50e917253..14a2e8295 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -22,7 +22,7 @@ def test_sampling_precommit(tmp_path, model_id): def test_sampling_nightly(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) - +@pytest.mark.real_models @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) def test_real_models(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) From fba8ac6b3c7aeed0901c38834953babae850d7c2 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 23 May 2024 12:11:09 +0000 Subject: [PATCH 10/32] Fix for models where k and v merged into a single variable per decoder layer. 
--- .../library/src/paged_attention_transformations.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp index ad02f279d..887cdbd38 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp @@ -12,15 +12,14 @@ void apply_paged_attention_transformations(std::shared_ptr model, Dev const ov::op::util::VariableVector& variables = model->get_variables(); OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful"); - // number of variables is 2 (K and V) multiplied by number of decoder layers - size_t num_layers = variables.size() >> 1; - - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(model); + ov::pass::SDPAToPagedAttention().run_on_model(model); const ov::ParameterVector& parameters = model->get_parameters(); + size_t num_layers = std::count_if(parameters.begin(), parameters.end(), [](std::shared_ptr parameter) { + return parameter->get_friendly_name().find("key_cache.") == 0; + }); + // extract num_kv_heads and head_size size_t kv_caches_inputs_offset = 2; ov::PartialShape k_shape = parameters[kv_caches_inputs_offset]->get_partial_shape(); From d79d7fd74bfe9a1ec37ac64325672f0e44f1603f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 14:11:51 +0200 Subject: [PATCH 11/32] Tests correction. --- .../cpp/continuous_batching/python/tests/test_preemption.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index e0ba5b6ad..5300296d2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -13,8 +13,8 @@ scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), # output text does not match due to <\s> symbols problem - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] # output text does not match due to <\s> symbols problem + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): @@ -36,7 +36,7 @@ def test_out_of_memory(tmp_path): ], ref_texts=[ ["\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is"], [" You're getting much better results from doing this, 
than you are by not doing this. I have a BH and I was so far"], - ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) + ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) @pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit From 39916cafbcba0c7d25ab6c5a9a9361d2ef6eb864 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 23 May 2024 22:30:15 +0200 Subject: [PATCH 12/32] Adjust real models list --- .../python/tests/models/real_models | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index cf7785e30..a7c637a3b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -26,16 +26,16 @@ microsoft/Phi-3-mini-4k-instruct nomic-ai/gpt4all-mpt mosaicml/mpt-7b mosaicml/mpt-7b-chat +bigcode/starcoderbase-3b +bigcode/gpt_bigcode-santacoder +allenai/OLMo-1B-hf +allenai/OLMo-7B-hf # Set of models, failed because of C++ Cont. Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # # Set of models, which require support in optimum-intel / transformers / models repositories: -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base -# Trying to export a jais model, that is a custom or unsupported architecture: core42/jais-13b-chat # IndexError: tuple index out of range: facebook/blenderbot-3B # `pip install flash_attn`: OrionStarAI/Orion-14B-Base -# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-1B-hf -# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-7B-hf # # Set of models, failed because of CPU limitation # head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b @@ -46,8 +46,8 @@ mosaicml/mpt-7b-chat # 'start' input is not a scalar: google/pegasus-big_patent # 'start' input is not a scalar: google/pegasus-large # 'stop' input is not a scalar: Salesforce/codegen2-1b -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/starcoderbase-3b -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/gpt_bigcode-santacoder +# 'stop' input is not a scalar: core42/jais-13b +# 'stop' input is not a scalar: core42/jais-13b-chat # Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: @@ -55,8 +55,6 @@ mosaicml/mpt-7b-chat # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b -# big model, not tried: core42/jais-13b -# see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -73,7 +71,6 @@ xverse/XVERSE-MoE-A4.2B microsoft/phi-2 microsoft/phi-1_5 EleutherAI/gpt-neo-125m -EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B EleutherAI/gpt-j-6b baichuan-inc/Baichuan2-7B-Chat @@ -102,3 +99,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat +# 
https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base From 41cc93011c49bac48e15f2aab7b5c6f98dc4e600 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 24 May 2024 12:36:01 +0200 Subject: [PATCH 13/32] Added mre models --- .../python/tests/models/real_models | 28 ++++++++++++++++++- .../python/tests/requirements.txt | 6 +++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index a7c637a3b..40c261fe7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -30,12 +30,32 @@ bigcode/starcoderbase-3b bigcode/gpt_bigcode-santacoder allenai/OLMo-1B-hf allenai/OLMo-7B-hf +PygmalionAI/pygmalion-6b +stabilityai/stable-code-3b +berkeley-nest/Starling-LM-7B-alpha +EleutherAI/gpt-neo-2.7B +databricks/dolly-v1-6b +openai-community/gpt2-large +openai-community/gpt2-medium +bigscience/bloom-7b1 +facebook/opt-1.3b +facebook/opt-2.7b +GAIR/Abel-7B-002 +google/gemma-1.1-7b-it +google/gemma-2b-it +microsoft/DialoGPT-large +microsoft/DialoGPT-medium +Qwen/Qwen1.5-1.8B +microsoft/Orca-2-7b # Set of models, failed because of C++ Cont. Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # # Set of models, which require support in optimum-intel / transformers / models repositories: # IndexError: tuple index out of range: facebook/blenderbot-3B # `pip install flash_attn`: OrionStarAI/Orion-14B-Base +# ValueError: Trying to export a fuyu model, that is a custom or unsupported architecture: adept/fuyu-8b +# ValueError: Trying to export a mamba model, that is a custom or unsupported architecture: state-spaces/mamba-130m-hf +# ValueError: Trying to export a xlnet model, that is a custom or unsupported architecture: xlnet/xlnet-base-cased # # Set of models, failed because of CPU limitation # head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b @@ -52,6 +72,9 @@ allenai/OLMo-7B-hf # # Set of models, failed because of OpenVINO Tokenizers: # https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b +# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-33b-instruct +# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-6.7b-instruct +# Tokenizer type is not supported: : microsoft/biogpt # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b @@ -66,6 +89,9 @@ openlm-research/open_llama_13b Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat xverse/XVERSE-MoE-A4.2B +cerebras/Cerebras-GPT-13B +WizardLMTeam/WizardCoder-15B-V1.0 +TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ # # Set of passed models: microsoft/phi-2 @@ -99,4 +125,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base +# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt 
b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 4a94dad33..568b6886b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -32,4 +32,8 @@ timm # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat # - Salesforce/xgen-7b-8k-base -tiktoken \ No newline at end of file +tiktoken +# - microsoft/biogpt +sacremoses +# - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ +auto-gptq \ No newline at end of file From 0c2e335be171f3a522aeb0befe7f7a4b0360eeeb Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 24 May 2024 13:38:06 +0200 Subject: [PATCH 14/32] Drop koala model --- .../cpp/continuous_batching/python/tests/models/real_models | 1 - 1 file changed, 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 40c261fe7..b179c6058 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -78,7 +78,6 @@ microsoft/Orca-2-7b # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b -# big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 mosaicml/mpt-30b From d717e01eefdb1d8ff47b26199c5d3fbc0ed4b2ea Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 16:49:54 +0200 Subject: [PATCH 15/32] Added generation statuses. --- .../include/continuous_batching_pipeline.hpp | 9 ++++ .../src/continuous_batching_pipeline.cpp | 43 ++++++++++++------- .../library/src/sequence_group.hpp | 6 +-- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp index 5afd0e715..b3701d436 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp @@ -9,6 +9,12 @@ #include "tokenizer.hpp" #include "generation_config.hpp" +enum class GenerationResultStatus { + FINISHED = 0, + IGNORED = 1, + ABORTED = 2 // Currently not used, TODO: implement abort functionality +}; + struct GenerationResult { // request ID uint64_t m_request_id; @@ -18,6 +24,9 @@ struct GenerationResult { std::vector m_generation_ids; // scores std::vector m_scores; + + // Status of generation + GenerationResultStatus m_status; }; class ContinuousBatchingPipeline { diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index e2919c5d2..667eeacc0 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -19,12 +19,11 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque std::vector finished_sequences = sequence_group->get_finished_sequences(); - OPENVINO_ASSERT(finished_sequences.size() == sequence_group->num_total_seqs() && sequence_group->has_finished()); + OPENVINO_ASSERT(finished_sequences.size() == 
sequence_group->num_total_seqs()); for (size_t sequence_id = 0; sequence_id < finished_sequences.size(); ++sequence_id) { Sequence::CPtr sequence = finished_sequences[sequence_id]; result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); - { static ManualTimer timer("detokenize"); timer.start(); @@ -34,6 +33,15 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque } } + if (sequence_group->has_finished()) { + result.m_status = GenerationResultStatus::FINISHED; + } + else if (sequence_group->out_of_memory()) { + result.m_status = GenerationResultStatus::IGNORED; + } + else { + result.m_status = GenerationResultStatus::ABORTED; + } return result; } @@ -74,6 +82,10 @@ class ContinuousBatchingPipeline::Impl { m_requests.erase(new_end, m_requests.end()); } + void _free_all_requests() { + m_requests.erase(m_requests.begin(), m_requests.end()); + } + public: Impl(const std::string& models_path, const SchedulerConfig& scheduler_config) { ov::Core core; @@ -152,7 +164,17 @@ class ContinuousBatchingPipeline::Impl { for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { m_requests[sequence_group_id]->set_out_of_memory(); } - return {}; + + // return partial results + std::vector pertial_results; + + for (size_t i = 0; i < m_requests.size(); ++i) { + SequenceGroup::CPtr sequence_group = m_requests[i]; + pertial_results.push_back(from_sequence_group(m_tokenizer, sequence_group)); + } + + _free_all_requests(); + return pertial_results; } ov::Tensor logits; @@ -229,14 +251,6 @@ class ContinuousBatchingPipeline::Impl { return !m_requests.empty(); } - bool out_of_memory() const { - for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { - if (m_requests[sequence_group_id]->out_of_memory()) - return true; - } - return false; - } - std::vector generate(const std::vector prompts, std::vector sampling_params) { OPENVINO_ASSERT(!has_running_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(prompts.size() == sampling_params.size()); @@ -248,14 +262,11 @@ class ContinuousBatchingPipeline::Impl { std::vector results; results.reserve(m_requests.size()); - while (has_running_requests() && !out_of_memory()) { + while (has_running_requests()) { std::vector partial_results = step(); - if (partial_results.size() > 0) - results.insert(results.end(), partial_results.begin(), partial_results.end()); + results.insert(results.end(), partial_results.begin(), partial_results.end()); } - OPENVINO_ASSERT(!out_of_memory(), "Not enough memory for processing the requests."); - // sort results according to request_id to return results in order of initial prompts std::sort(results.begin(), results.end(), [] (const GenerationResult& r1, const GenerationResult& r2) -> bool { return r1.m_request_id < r2.m_request_id; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index c49a88a5a..3bea9d37f 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -199,7 +199,7 @@ class SequenceGroup { std::vector get_finished_sequences() const { std::vector finished_seqs; for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { - if (m_sequences[seq_id]->has_finished()) { + if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory()) { finished_seqs.push_back(m_sequences[seq_id]); } } @@ -356,7 +356,7 @@ class SequenceGroup { } } - bool out_of_memory() { + bool out_of_memory() const { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->out_of_memory()) { return true; @@ -365,7 +365,7 @@ class SequenceGroup { return false; } - bool is_waiting() { + bool is_waiting() const { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { return true; From 499083c16bed6431ae3440014e69f641f380892b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:38:29 +0200 Subject: [PATCH 16/32] Accuracy sample corrected. 
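
For reference, a compact, self-contained version of the status handling this patch adds to the sample, usable by clients that link the library directly. The helper name and the way partial outputs are reported are illustrative assumptions, not part of this change; only the GenerationResult fields and the GenerationResultStatus values come from the patch above.

    #include <cstddef>
    #include <iostream>
    #include "continuous_batching_pipeline.hpp"

    void report_result(const GenerationResult& result) {
        switch (result.m_status) {
        case GenerationResultStatus::FINISHED:
            // all sequences of the request completed normally
            for (std::size_t i = 0; i < result.m_generation_ids.size(); ++i)
                std::cout << result.m_generation_ids[i] << " (score " << result.m_scores[i] << ")" << std::endl;
            break;
        case GenerationResultStatus::IGNORED:
            // the request was dropped (e.g. the KV cache could not hold it); outputs, if any, are partial
            std::cout << "Request " << result.m_request_id << " ignored, "
                      << result.m_generation_ids.size() << " partial sequence(s) returned" << std::endl;
            break;
        case GenerationResultStatus::ABORTED:
            // reserved for an explicit abort API that this series does not implement yet
            std::cout << "Request " << result.m_request_id << " aborted" << std::endl;
            break;
        }
    }
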
--- .../apps/accuracy_sample.cpp | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index e7cdaa7f3..0a7dfd3af 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -6,6 +6,12 @@ #include "continuous_batching_pipeline.hpp" +void print_sequence(const GenerationResult& generation_result) { + for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { + std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; + } +} + int main(int argc, char* argv[]) try { // Command line options @@ -80,8 +86,27 @@ int main(int argc, char* argv[]) try { const GenerationResult & generation_result = generation_results[request_id]; std::cout << "Question: " << prompts[request_id] << std::endl; - for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { - std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; + switch (generation_result.m_status) + { + case GenerationResultStatus::FINISHED: + print_sequence(generation_result); + break; + case GenerationResultStatus::IGNORED: + std::cout << "Sequence was ignored." < 0) { + std::cout << "Partial result:" << std::endl; + print_sequence(generation_result); + } + break; + case GenerationResultStatus::ABORTED: + std::cout << "Sequence was aborted." < 0) { + std::cout << "Partial result:" << std::endl; + print_sequence(generation_result); + } + break; + default: + break; } std::cout << std::endl; } From cfab5bfa0d42c9e90af261d3132dc79f3d7cf346 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:42:33 +0200 Subject: [PATCH 17/32] Minor correction. --- .../cpp/continuous_batching/apps/accuracy_sample.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index 0a7dfd3af..9de48508e 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -6,7 +6,7 @@ #include "continuous_batching_pipeline.hpp" -void print_sequence(const GenerationResult& generation_result) { +void print_generation_result(const GenerationResult& generation_result) { for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; } @@ -89,20 +89,20 @@ int main(int argc, char* argv[]) try { switch (generation_result.m_status) { case GenerationResultStatus::FINISHED: - print_sequence(generation_result); + print_generation_result(generation_result); break; case GenerationResultStatus::IGNORED: - std::cout << "Sequence was ignored." 
< 0) { std::cout << "Partial result:" << std::endl; - print_sequence(generation_result); + print_generation_result(generation_result); } break; case GenerationResultStatus::ABORTED: - std::cout << "Sequence was aborted." < 0) { std::cout << "Partial result:" << std::endl; - print_sequence(generation_result); + print_generation_result(generation_result); } break; default: From 41fbdcf37b87e0f1eecae12225075a69837a7e22 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:55:44 +0200 Subject: [PATCH 18/32] Minor correction. --- .../continuous_batching/python/tests/test_preemption.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index 5300296d2..078e7137a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -20,13 +20,6 @@ def test_preemption(tmp_path, params): run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) - -@pytest.mark.precommit -def test_out_of_memory(tmp_path): - with pytest.raises(RuntimeError) as excinfo: - run_test_pipeline(tmp_path, "facebook/opt-125m", {"num_kv_blocks": 1}) - assert "Not enough memory for processing the requests." in str(excinfo.value) - multinomial_params = RandomSamplingTestStruct(generation_config=[get_multinomial_temperature(), get_multinomial_temperature_and_top_p(), get_multinomial_temperature_and_top_k()], From 7d18ebec006e29565ee5a08ad71f0324fbd16c39 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:59:08 +0200 Subject: [PATCH 19/32] Minor correction. 
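
The test_out_of_memory case dropped in PATCH 18 relied on generate() raising a RuntimeError when the KV cache was exhausted; with the status-based reporting that behaviour is gone, and an undersized cache now surfaces as results flagged IGNORED. A sketch of the equivalent check on the C++ side; it assumes a converted model at models_path and a default-constructible GenerationConfig (as in the Python tests), and is not part of this patch series:

    #include <string>
    #include <vector>
    #include "continuous_batching_pipeline.hpp"

    bool oom_is_reported_as_ignored(const std::string& models_path) {
        SchedulerConfig scheduler_config {
            .max_num_batched_tokens = 32,
            .num_kv_blocks = 1,        // deliberately too small to hold the whole generation
            .block_size = 32,
            .dynamic_split_fuse = true,
            .max_num_seqs = 2,
        };
        ContinuousBatchingPipeline pipe(models_path, scheduler_config);
        std::vector<GenerationResult> results =
            pipe.generate({"What is OpenVINO?"}, {GenerationConfig{}});
        return !results.empty() &&
               results.front().m_status == GenerationResultStatus::IGNORED;
    }
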
--- .../library/src/continuous_batching_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 667eeacc0..54b3e9f53 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -24,6 +24,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque Sequence::CPtr sequence = finished_sequences[sequence_id]; result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); + { static ManualTimer timer("detokenize"); timer.start(); From d087295f9eb4d205460842a72090063065735616 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 24 May 2024 20:09:46 +0200 Subject: [PATCH 20/32] Fix sorting in the temperature transform --- .../cpp/continuous_batching/library/src/sampler.hpp | 7 +++++-- .../library/src/tests/logit_filtering.cpp | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index 126020e32..3bb9566ff 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -226,7 +226,8 @@ class TopPFilter: public IProbabilityFilter { nucleus_size += 1; if (probability_sum > m_top_p) break; } - return std::vector(tmp.begin(), tmp.begin() + nucleus_size); + tmp.resize(nucleus_size); + return tmp; } private: @@ -241,7 +242,8 @@ class TopKFilter: public IProbabilityFilter { std::vector tmp(input_probs); std::sort(tmp.begin(), tmp.end(), [](const ProbabilityWithIdx& lhs, const ProbabilityWithIdx& rhs) {return lhs.first > rhs.first; }); size_t top_k = input_probs.size() >= m_top_k ? 
m_top_k : input_probs.size(); - return std::vector(tmp.begin(), tmp.begin() + top_k); + tmp.resize(top_k); + return tmp; } private: @@ -256,6 +258,7 @@ class TemperatureLogitTransform { std::vector apply(const std::vector& input_logits) { std::vector output(input_logits.begin(), input_logits.end()); + std::sort(output.begin(), output.end(), [](const ProbabilityWithIdx& lhs, const ProbabilityWithIdx& rhs) {return lhs.first > rhs.first; }); float max_logit = output[0].first; std::for_each(output.begin(), output.end(), [max_logit, this](ProbabilityWithIdx& val) {val.first = expf((val.first - max_logit) / this->m_temperature);}); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp index 6eba8cfe4..7aa982553 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp @@ -29,8 +29,9 @@ TEST_P(TemperatureTransformTest, TransformResultEqualToReference) { const std::vector TEMPERATURE_TRANSFORM_TEST_CASES = { - {1.0f, { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} } }, - {2.0f, { {1.0f, 2}, {2.0f, 1}, {3.0f, 0} }, { {0.186323, 2}, {0.307195, 1}, {0.506480, 0} } } + {1.0f, { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, { {0.665241, 2}, {0.244728, 1}, {0.090031, 0} } }, + {2.0f, { {1.0f, 2}, {2.0f, 1}, {3.0f, 0} }, { {0.506480, 0}, {0.307195, 1}, {0.186323, 2} } }, + {1.0f, { {3.0f, 0}, {1.0f, 1}, {2.0f, 2} }, { {0.665241, 0}, {0.244728, 2}, {0.090031, 1} } }, }; INSTANTIATE_TEST_SUITE_P(VariousInputs, From d22b7557004e575685c2918dcfb24db089f591a5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 24 May 2024 23:23:23 +0200 Subject: [PATCH 21/32] Implement repetition penalty --- .../library/src/sampler.hpp | 62 +++++++++++++++---- .../library/src/sequence_group.hpp | 7 +++ .../library/src/tests/logit_filtering.cpp | 59 ++++++++++++++++++ .../python/tests/common.py | 6 ++ .../python/tests/test_sampling.py | 6 +- 5 files changed, 124 insertions(+), 16 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index 3bb9566ff..f17a803cb 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "openvino/runtime/tensor.hpp" @@ -275,6 +276,37 @@ class TemperatureLogitTransform { double m_temperature; }; +class RepetitionPenaltyTransform { +public: + RepetitionPenaltyTransform(double penalty) : m_penalty(penalty) { + OPENVINO_ASSERT(m_penalty >= 0.0f, "repetition penalty must be a positive value"); + } + + std::vector apply(const std::vector& input_logits, const std::set& unique_input_ids) { + std::vector output(input_logits.begin(), input_logits.end()); + size_t vocab_size = input_logits.size(); + for (auto input_id : unique_input_ids) { + OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); + OPENVINO_ASSERT(input_logits[input_id].second == input_id, "input_logits must have original index order"); + auto logit_value = output[input_id].first; + if (logit_value >= 0) { + output[input_id].first /= m_penalty; + } else { + output[input_id].first *= m_penalty; + }; + } + return 
output; + } + + std::vector apply(const std::vector& input_logits, const TokenIds& input_ids) { + std::set unique_input_ids(input_ids.begin(), input_ids.end()); + return this->apply(input_logits, unique_input_ids); + } +private: + double m_penalty; +}; + + class ProbabilityNormalizeTransform { public: std::vector apply(const std::vector& input_probs) { @@ -288,27 +320,25 @@ class ProbabilityNormalizeTransform { class Sampler { - int64_t _greedy_sample(ov::Tensor logits) const { + std::vector _get_logit_vector(ov::Tensor logits) { ov::Shape logits_shape = logits.get_shape(); size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; OPENVINO_ASSERT(batch_size == 1); - const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - return out_token; - } - int64_t _multinomial_sample(ov::Tensor logits, float temperature, float top_p, size_t top_k) { - ov::Shape logits_shape = logits.get_shape(); - size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; - OPENVINO_ASSERT(batch_size == 1); - - const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; std::vector logit_vector(vocab_size); for (size_t i = 0; i < logit_vector.size(); i++) { logit_vector[i] = LogitWithIdx(logits_data[i], i); } + return logit_vector; + } + + int64_t _greedy_sample(const std::vector& logit_vector) const { + int64_t out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const LogitWithIdx& lhs, const LogitWithIdx& rhs) { return lhs.first < rhs.first; }) - logit_vector.begin(); + return out_token; + } + int64_t _multinomial_sample(const std::vector& logit_vector, float temperature, float top_p, size_t top_k) { auto temperature_transform = TemperatureLogitTransform(temperature); std::vector softmax_vector = temperature_transform.apply(logit_vector); @@ -367,6 +397,12 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); + auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): do we really even need a tensor on the line above? 
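// Note on the transform applied by the added lines just below (numbers taken from the
// new unit tests in this patch rather than from an extra code path): RepetitionPenaltyTransform
// divides positive logits of already-seen token ids by the penalty and multiplies negative
// ones, so any penalty > 1 pushes repeated tokens down:
//   penalty = 1.2, seen ids {0, 2}: logits {1.0, 2.0, 3.0} -> {0.8333, 2.0, 2.5}
//   penalty = 2.0, seen ids {0, 1}: logits {-1.0, 2.0, 3.0} -> {-2.0, 1.0, 3.0}
// while a penalty < 1 boosts them instead (0.5 turns a logit of 2.0 into 4.0). Each unique
// id is penalised exactly once, no matter how often it occurs in the prompt.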
+ + if (sampling_params.repetition_penalty != 1.0f) { + auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); + logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_prompt_ids()); + } if (sequence_group->requires_sampling()) { if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { @@ -375,10 +411,10 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, int64_t sampled_token_id; if (sampling_params.is_greedy_sampling()) { - sampled_token_id = _greedy_sample(sequence_group_logits); + sampled_token_id = _greedy_sample(logit_vector); } else { // .is_multinomial() - sampled_token_id = _multinomial_sample(sequence_group_logits, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); + sampled_token_id = _multinomial_sample(logit_vector, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); } // in case of greedy search we always have a single parent sequence to sample from running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data()[sampled_token_id]); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 29b0af513..6ab10cd90 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include "generation_config.hpp" @@ -104,6 +105,7 @@ class SequenceGroup { GenerationConfig m_sampling_params; std::size_t m_block_size; TokenIds m_prompt_ids; + std::set m_unique_prompt_ids; // amount of processed tokens, e.g. 
prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed @@ -131,6 +133,7 @@ class SequenceGroup { m_prompt_ids.resize(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), m_prompt_ids.begin()); + for (auto id: m_prompt_ids) { m_unique_prompt_ids.insert(id); } } void add_sequence(const Sequence::Ptr & sequence) { @@ -283,6 +286,10 @@ class SequenceGroup { return m_prompt_ids; } + const std::set& get_unique_prompt_ids() const { + return m_unique_prompt_ids; + } + size_t get_num_logical_blocks() const { return (get_context_len() + m_block_size - 1) / m_block_size; } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp index 7aa982553..80df9afc7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp @@ -140,3 +140,62 @@ TEST(TopPFilterInitializationTest, ThrowsForInvalidProbabilities) { EXPECT_THROW(TopPFilter(-0.5), ov::Exception); EXPECT_THROW(TopPFilter(1.1), ov::Exception); } + + +struct RepetitionPenaltyTransformTestStruct { + float penalty; + std::vector input_logits; + TokenIds input_ids; + std::vector expected_output; +}; + +using RepetitionPenaltyTransformTest = testing::TestWithParam; + +TEST_P(RepetitionPenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto transform = RepetitionPenaltyTransform(test_struct.penalty); + auto test_result = transform.apply(test_struct.input_logits, test_struct.input_ids); + ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); + for (size_t i = 0; i < test_result.size(); i++) { + EXPECT_NEAR(test_result[i].first, test_struct.expected_output[i].first, 1e-6); + EXPECT_EQ(test_result[i].second, test_struct.expected_output[i].second); + } +} + + +const std::vector REPETITION_PENALTY_TRANSFORM_TEST_CASES = { + { // basic case, indices are applied, order is left as-is + 1.2f, + { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 2, 0 }, + { {0.8333333f, 0}, {2.0f, 1}, {2.5f, 2} } + }, + { // negative scores case + 2.0f, + { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 0, 1 }, + { {-2.0f, 0}, {1.0f, 1}, {3.0f, 2} } + }, + { // repeated tokens in prompt, check that the penalty is only applied once + 0.5f, + { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 1, 1 }, + { {-1.0f, 0}, {4.0f, 1}, {3.0f, 2} } + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + RepetitionPenaltyTransformTest, + testing::ValuesIn(REPETITION_PENALTY_TRANSFORM_TEST_CASES)); + + +TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidPenalties) { + EXPECT_THROW(RepetitionPenaltyTransform(-0.5), ov::Exception); +} + +TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) { + auto transform = RepetitionPenaltyTransform(1.5); + EXPECT_THROW(transform.apply({ {43.0f, 0} }, std::set{1337} ), ov::Exception); + EXPECT_THROW(transform.apply({ {18.0f, 0} }, std::set{0, -1} ), ov::Exception); +} + diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index df27e6d4a..288506fc2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -18,6 +18,12 
@@ def get_greedy() -> GenerationConfig: generation_config.num_return_sequences = 1 return generation_config +def get_greedy_with_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.repetition_penalty = 2.0 + return generation_config + def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 14a2e8295..82a5e3ec7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -84,8 +84,8 @@ def test_eos_greedy(tmp_path): print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}") compare_results(hf_result, ov_result, generation_config) -@pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search()], - ids=["greedy", "beam"]) +@pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search(), get_greedy_with_repetition_penalty()], + ids=["greedy", "beam", "greedy_with_repetition_penalty"]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): prompts = [ "What is OpenVINO?", From 78f41eadaa6bea8fe730cf940d495ae14cfce9d5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Sat, 25 May 2024 14:29:55 +0200 Subject: [PATCH 22/32] Align with HF for sampling-based algos --- .../continuous_batching/library/src/sampler.hpp | 15 +++++++++------ .../library/src/sequence_group.hpp | 12 ++++++++---- .../continuous_batching/python/tests/common.py | 4 ++-- .../python/tests/test_sampling.py | 5 +++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index f17a803cb..2d4cc4df4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -397,15 +397,15 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); - auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): do we 
really even need a tensor on the line above? - - if (sampling_params.repetition_penalty != 1.0f) { - auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); - logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_prompt_ids()); - } if (sequence_group->requires_sampling()) { if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { + auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): should be also applicable to beam search, but need to remove the batch size == 1 limitation + + if (sampling_params.repetition_penalty != 1.0f) { + auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); + logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_generated_ids()); + } std::vector running_sequences = sequence_group->get_running_sequences(); OPENVINO_ASSERT(running_sequences.size() == 1); @@ -416,6 +416,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, else { // .is_multinomial() sampled_token_id = _multinomial_sample(logit_vector, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); } + + sequence_group->register_generated_token_id(sampled_token_id); + // in case of greedy search we always have a single parent sequence to sample from running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data()[sampled_token_id]); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 6ab10cd90..3aae278ec 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -105,7 +105,7 @@ class SequenceGroup { GenerationConfig m_sampling_params; std::size_t m_block_size; TokenIds m_prompt_ids; - std::set m_unique_prompt_ids; + std::set m_unique_generated_ids; // amount of processed tokens, e.g. 
prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed @@ -133,7 +133,7 @@ class SequenceGroup { m_prompt_ids.resize(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), m_prompt_ids.begin()); - for (auto id: m_prompt_ids) { m_unique_prompt_ids.insert(id); } + for (auto id: m_prompt_ids) { m_unique_generated_ids.insert(id); } } void add_sequence(const Sequence::Ptr & sequence) { @@ -286,8 +286,12 @@ class SequenceGroup { return m_prompt_ids; } - const std::set& get_unique_prompt_ids() const { - return m_unique_prompt_ids; + const std::set& get_unique_generated_ids() const { + return m_unique_generated_ids; + } + + void register_generated_token_id(int64_t token_id) { + m_unique_generated_ids.insert(token_id); } size_t get_num_logical_blocks() const { diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 288506fc2..3c5bdfe81 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -105,13 +105,13 @@ def convert_to_hf( # copy default parameters kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id + kwargs['repetition_penalty'] = generation_config.repetition_penalty if generation_config.num_groups * generation_config.group_size > 1: # beam search case kwargs['num_beam_groups'] = generation_config.num_groups kwargs['num_beams'] = generation_config.num_groups * generation_config.group_size kwargs['diversity_penalty'] = generation_config.diversity_penalty - kwargs['repetition_penalty'] = generation_config.repetition_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences @@ -207,7 +207,7 @@ def get_model_and_tokenizer(model_id: str, use_optimum = True): AutoModelForCausalLM.from_pretrained(model_id) return model, hf_tokenizer -def _generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): +def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): use_optimum = True model_path : Path = tmp_path / model_id model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 82a5e3ec7..3690a27f5 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty +from common import run_test_pipeline, get_models_list, 
get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -84,6 +84,7 @@ def test_eos_greedy(tmp_path): print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}") compare_results(hf_result, ov_result, generation_config) +@pytest.mark.precommit @pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search(), get_greedy_with_repetition_penalty()], ids=["greedy", "beam", "greedy_with_repetition_penalty"]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): @@ -92,7 +93,7 @@ def test_individual_generation_configs_deterministic(tmp_path, generation_config ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - _generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) @dataclass From 5f595059af28e7c7acd789fc4c370d03d974b0b5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Sat, 25 May 2024 14:38:17 +0200 Subject: [PATCH 23/32] Align with HF for non-beam search cases --- .../python/tests/common.py | 7 ++++ .../python/tests/test_sampling.py | 34 +++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 3c5bdfe81..8c4acfa51 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -61,6 +61,13 @@ def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: generation_config.top_k = 2 return generation_config +def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.repetition_penalty = 2.0 + return generation_config + def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: prompts = [ "What is OpenVINO?", diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 3690a27f5..e9a97d1ae 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, 
generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -102,24 +102,28 @@ class RandomSamplingTestStruct: prompts: List[str] ref_texts: List[List[str]] -RANDOM_SAMPLING_TEST_CASES = [RandomSamplingTestStruct(generation_config=get_multinomial_temperature(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_p(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_top_p_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers"] ]), - ] +RANDOM_SAMPLING_TEST_CASES = [ + RandomSamplingTestStruct(generation_config=get_multinomial_temperature(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_p(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_top_p_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_repetition_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpen Vino's are a new and improved way to find cheap, fast-investment frozen vegetables that have no waste or calories. 
They're"] ]), +] @pytest.mark.precommit @pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, - ids=["multinomial_temperature", "multinomial_temperature_and_top_p", "multinomial_temperature_and_top_k", "multinomial_temperature_top_p_and_top_k"]) + ids=["multinomial_temperature", "multinomial_temperature_and_top_p", "multinomial_temperature_and_top_k", "multinomial_temperature_top_p_and_top_k", "multinomial_temperature_and_repetition_penalty"]) def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): generation_config = test_struct.generation_config From cc9e4a069e2059573bb794b08dc9b2a44d466894 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 26 May 2024 01:22:13 +0200 Subject: [PATCH 24/32] Updated models list --- .../python/tests/models/real_models | 209 +++++++++--------- .../python/tests/requirements.txt | 4 +- 2 files changed, 107 insertions(+), 106 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index b179c6058..4fe917605 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -1,127 +1,126 @@ -# Set of models with accuracy issues, because of PA +01-ai/Yi-6B +BAAI/Aquila-7B +BAAI/AquilaChat-7B +BAAI/AquilaChat2-7B +# CohereForAI/c4ai-command-r-v01: restricted and you are not in the authorized list +EleutherAI/gpt-j-6B +EleutherAI/gpt-j-6b +EleutherAI/gpt-neo-1.3B +EleutherAI/gpt-neo-125m +EleutherAI/gpt-neo-2.7B +EleutherAI/gpt-neox-20b EleutherAI/pythia-160m -bigscience/bloomz-1b7 -bigscience/bloomz-560m -databricks/dolly-v2-3b -tiiuae/falcon-rw-7b -bigcode/starcoder2-3b -openbmb/MiniCPM-2B-sft-bf16 -openbmb/MiniCPM-2B-dpo-bf16 +GAIR/Abel-7B-002 +# OrionStarAI/Orion-14B-Base: pip install flash_attn +PygmalionAI/pygmalion-6b Qwen/Qwen-7B Qwen/Qwen-7B-Chat Qwen/Qwen1.5-0.5B +Qwen/Qwen1.5-1.8B +Qwen/Qwen1.5-7B Qwen/Qwen1.5-7B-Chat -internlm/internlm-chat-7b -BAAI/Aquila-7B -internlm/internlm2-7b -openchat/openchat_3.5 -lmsys/vicuna-7b-v1.5 -lmsys/longchat-7b-v1.5-32k -BAAI/AquilaChat2-7B -BAAI/AquilaChat-7B -baichuan-inc/Baichuan-7B -tiiuae/falcon-7b -microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct -nomic-ai/gpt4all-mpt -mosaicml/mpt-7b -mosaicml/mpt-7b-chat -bigcode/starcoderbase-3b -bigcode/gpt_bigcode-santacoder +Qwen/Qwen1.5-MoE-A2.7B +Qwen/Qwen1.5-MoE-A2.7B-Chat +Salesforce/codegen-350M-multi +Salesforce/codegen-350M-nl +# Salesforce/codegen2-1b: PA - 'stop' input is not a scalar +# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable +THUDM/chatglm2-6b +THUDM/chatglm3-6b +TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ +TinyLlama/TinyLlama-1.1B-Chat-v0.6 +TinyLlama/TinyLlama-1.1B-Chat-v1.0 +TitanML/tiny-mixtral +WizardLMTeam/WizardCoder-15B-V1.0 +# adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf +# allenai/OLMo-7B: pip install hf_olmo allenai/OLMo-7B-hf -PygmalionAI/pygmalion-6b -stabilityai/stable-code-3b +baichuan-inc/Baichuan-7B +baichuan-inc/Baichuan2-7B-Base +baichuan-inc/Baichuan2-7B-Chat berkeley-nest/Starling-LM-7B-alpha -EleutherAI/gpt-neo-2.7B -databricks/dolly-v1-6b -openai-community/gpt2-large -openai-community/gpt2-medium +bigcode/gpt_bigcode-santacoder +bigcode/starcoder2-3b +bigcode/starcoder2-7b 
+bigcode/starcoderbase-3b +bigscience/bloom-560m bigscience/bloom-7b1 +bigscience/bloomz-1b7 +bigscience/bloomz-560m +bigscience/bloomz-7b1 +cerebras/Cerebras-GPT-13B +# core42/jais-13b: PA - 'stop' input is not a scalar +# core42/jais-13b-chat: PA - 'stop' input is not a scalar +databricks/dolly-v1-6b +databricks/dolly-v2-3b +# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture +# facebook/blenderbot-3B: optimum - IndexError: tuple index out of range +# facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information facebook/opt-1.3b +facebook/opt-125m facebook/opt-2.7b -GAIR/Abel-7B-002 +# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) +facebook/opt-6.7b google/gemma-1.1-7b-it +google/gemma-2b google/gemma-2b-it +google/gemma-7b +# google/pegasus-big_patent: PA - 'start' input is not a scalar +# google/pegasus-large: PA - 'start' input is not a scalar +gpt2 +gpt2-xl +internlm/internlm-chat-7b +internlm/internlm2-7b +lmsys/longchat-7b-v1.5-32k +lmsys/vicuna-7b-v1.3 +lmsys/vicuna-7b-v1.5 +meta-llama/CodeLlama-7b-hf +meta-llama/Llama-2-7b-chat-hf +meta-llama/Llama-2-7b-hf +meta-llama/Meta-Llama-3-8B-Instruct microsoft/DialoGPT-large microsoft/DialoGPT-medium -Qwen/Qwen1.5-1.8B microsoft/Orca-2-7b -# Set of models, failed because of C++ Cont. Batching -# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B -# -# Set of models, which require support in optimum-intel / transformers / models repositories: -# IndexError: tuple index out of range: facebook/blenderbot-3B -# `pip install flash_attn`: OrionStarAI/Orion-14B-Base -# ValueError: Trying to export a fuyu model, that is a custom or unsupported architecture: adept/fuyu-8b -# ValueError: Trying to export a mamba model, that is a custom or unsupported architecture: state-spaces/mamba-130m-hf -# ValueError: Trying to export a xlnet model, that is a custom or unsupported architecture: xlnet/xlnet-base-cased -# -# Set of models, failed because of CPU limitation -# head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b -# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b -# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 -# -# Set of failed models, because of PA: -# 'start' input is not a scalar: google/pegasus-big_patent -# 'start' input is not a scalar: google/pegasus-large -# 'stop' input is not a scalar: Salesforce/codegen2-1b -# 'stop' input is not a scalar: core42/jais-13b -# 'stop' input is not a scalar: core42/jais-13b-chat -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m -# -# Set of models, failed because of OpenVINO Tokenizers: -# https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b -# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-33b-instruct -# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-6.7b-instruct -# Tokenizer type is not supported: : microsoft/biogpt -# -# Set of 13B, 30B abd 70B models: -EleutherAI/gpt-neox-20b 
-mistralai/Mixtral-8x7B-v0.1 +microsoft/Phi-3-mini-128k-instruct +microsoft/Phi-3-mini-4k-instruct +# microsoft/biogpt: OpenVINO Tokenizers - openvino.runtime.exceptions.OVTypeError: Tokenizer type is not supported: +microsoft/phi-1_5 +microsoft/phi-2 +mistralai/Mistral-7B-Instruct-v0.1 +mistralai/Mistral-7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 +mistralai/Mixtral-8x7B-v0.1 +# mosaicml/mpt-1b-redpajama-200b: optimum - Trying to export a mosaic-gpt model, that is a custom or unsupported architecture mosaicml/mpt-30b -# see optimum: OrionStarAI/Orion-14B-Base -# big model, not tried: OrionStarAI/Orion-14B-Chat -CohereForAI/c4ai-command-r-v01 -openlm-research/open_llama_13b -Qwen/Qwen1.5-MoE-A2.7B -Qwen/Qwen1.5-MoE-A2.7B-Chat -xverse/XVERSE-MoE-A4.2B -cerebras/Cerebras-GPT-13B -WizardLMTeam/WizardCoder-15B-V1.0 -TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -# -# Set of passed models: -microsoft/phi-2 -microsoft/phi-1_5 -EleutherAI/gpt-neo-125m -EleutherAI/gpt-neo-1.3B -EleutherAI/gpt-j-6b -baichuan-inc/Baichuan2-7B-Chat -THUDM/chatglm2-6b -THUDM/chatglm3-6b -google/gemma-2b -google/gemma-7b +mosaicml/mpt-7b +mosaicml/mpt-7b-chat +nomic-ai/gpt4all-j +nomic-ai/gpt4all-mpt openai-community/gpt2 +openai-community/gpt2-large +openai-community/gpt2-medium openai-community/gpt2-xl -gpt2 -gpt2-xl -nomic-ai/gpt4all-j -stabilityai/stablelm-3b-4e1t +openbmb/MiniCPM-2B-dpo-bf16 +openbmb/MiniCPM-2B-sft-bf16 +# openbmb/MiniCPM-V-2: optimum - Trying to export a minicpmv model, that is a custom or unsupported architecture +openchat/openchat_3.5 +openlm-research/open_llama_13b +# openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 +# openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 +# replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' +# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output +rinna/youri-7b-chat +stabilityai/stable-code-3b +stabilityai/stable-zephyr-3b stabilityai/stablelm-2-zephyr-1_6b -meta-llama/Llama-2-7b-hf -meta-llama/Meta-Llama-3-8B-Instruct -meta-llama/CodeLlama-7b-hf -lmsys/vicuna-7b-v1.3 -mistralai/Mistral-7B-v0.1 -mistralai/Mistral-7B-Instruct-v0.1 -01-ai/Yi-6B -Salesforce/codegen-350M-multi -Salesforce/codegen-350M-nl +stabilityai/stablelm-3b-4e1t +# state-spaces/mamba-130m-hf: optimum - Trying to export a mamba model, that is a custom or unsupported architecture +tiiuae/falcon-7b +tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 -# passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov -# -# Set of invalid models, because of HF: -# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base \ No newline at end of file +# xlnet/xlnet-base-cased: optimum - Trying to export a xlnet model, that is a custom or unsupported architecture +# xverse/XVERSE-7B-Chat: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 +# xverse/XVERSE-MoE-A4.2B: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 568b6886b..0c803412e 
100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -36,4 +36,6 @@ tiktoken # - microsoft/biogpt sacremoses # - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -auto-gptq \ No newline at end of file +auto-gptq +# - allenai/OLMo-7B +hf_olmo \ No newline at end of file From 66006de68270a643649258ae8baabf4deca78952 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 26 May 2024 17:28:28 +0200 Subject: [PATCH 25/32] Small updates --- .../python/tests/models/real_models | 10 ++++------ .../continuous_batching/python/tests/requirements.txt | 4 +--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 4fe917605..dbb50dade 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -2,7 +2,7 @@ BAAI/Aquila-7B BAAI/AquilaChat-7B BAAI/AquilaChat2-7B -# CohereForAI/c4ai-command-r-v01: restricted and you are not in the authorized list +CohereForAI/c4ai-command-r-v01 EleutherAI/gpt-j-6B EleutherAI/gpt-j-6b EleutherAI/gpt-neo-1.3B @@ -11,7 +11,7 @@ EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neox-20b EleutherAI/pythia-160m GAIR/Abel-7B-002 -# OrionStarAI/Orion-14B-Base: pip install flash_attn +# OrionStarAI/Orion-14B-Base: pip install flash_attn (https://github.com/huggingface/transformers/pull/30954) PygmalionAI/pygmalion-6b Qwen/Qwen-7B Qwen/Qwen-7B-Chat @@ -24,7 +24,7 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl # Salesforce/codegen2-1b: PA - 'stop' input is not a scalar -# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable +# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32) THUDM/chatglm2-6b THUDM/chatglm3-6b TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ @@ -34,7 +34,6 @@ TitanML/tiny-mixtral WizardLMTeam/WizardCoder-15B-V1.0 # adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf -# allenai/OLMo-7B: pip install hf_olmo allenai/OLMo-7B-hf baichuan-inc/Baichuan-7B baichuan-inc/Baichuan2-7B-Base @@ -105,13 +104,12 @@ openai-community/gpt2-medium openai-community/gpt2-xl openbmb/MiniCPM-2B-dpo-bf16 openbmb/MiniCPM-2B-sft-bf16 -# openbmb/MiniCPM-V-2: optimum - Trying to export a minicpmv model, that is a custom or unsupported architecture openchat/openchat_3.5 openlm-research/open_llama_13b # openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 # openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 # replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' -# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output +# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output (https://jira.devtools.intel.com/browse/CVS-142063) rinna/youri-7b-chat stabilityai/stable-code-3b stabilityai/stable-zephyr-3b diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 
0c803412e..568b6886b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -36,6 +36,4 @@ tiktoken # - microsoft/biogpt sacremoses # - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -auto-gptq -# - allenai/OLMo-7B -hf_olmo \ No newline at end of file +auto-gptq \ No newline at end of file From 8d4d5a65e48bc7c89e01ea1ca6969ace1115cdaa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 27 May 2024 10:55:13 +0200 Subject: [PATCH 26/32] Updated list --- .../python/tests/models/real_models | 20 ++++++++----------- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index dbb50dade..28a1cb6dd 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -23,7 +23,7 @@ Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl -# Salesforce/codegen2-1b: PA - 'stop' input is not a scalar +Salesforce/codegen2-1b # Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32) THUDM/chatglm2-6b THUDM/chatglm3-6b @@ -32,7 +32,6 @@ TinyLlama/TinyLlama-1.1B-Chat-v0.6 TinyLlama/TinyLlama-1.1B-Chat-v1.0 TitanML/tiny-mixtral WizardLMTeam/WizardCoder-15B-V1.0 -# adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf allenai/OLMo-7B-hf baichuan-inc/Baichuan-7B @@ -49,26 +48,26 @@ bigscience/bloomz-1b7 bigscience/bloomz-560m bigscience/bloomz-7b1 cerebras/Cerebras-GPT-13B -# core42/jais-13b: PA - 'stop' input is not a scalar -# core42/jais-13b-chat: PA - 'stop' input is not a scalar +# core42/jais-13b: optimum - no SDPA +# core42/jais-13b-chat: optimum - no SDPA databricks/dolly-v1-6b databricks/dolly-v2-3b -# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file -# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file # deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture # facebook/blenderbot-3B: optimum - IndexError: tuple index out of range # facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information facebook/opt-1.3b facebook/opt-125m facebook/opt-2.7b -# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) +facebook/opt-350m facebook/opt-6.7b google/gemma-1.1-7b-it google/gemma-2b google/gemma-2b-it google/gemma-7b -# google/pegasus-big_patent: PA - 'start' input is not a scalar -# google/pegasus-large: PA - 'start' input is not a scalar +google/pegasus-big_patent +google/pegasus-large gpt2 gpt2-xl internlm/internlm-chat-7b @@ -92,7 +91,6 @@ 
mistralai/Mistral-7B-Instruct-v0.1 mistralai/Mistral-7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 mistralai/Mixtral-8x7B-v0.1 -# mosaicml/mpt-1b-redpajama-200b: optimum - Trying to export a mosaic-gpt model, that is a custom or unsupported architecture mosaicml/mpt-30b mosaicml/mpt-7b mosaicml/mpt-7b-chat @@ -115,10 +113,8 @@ stabilityai/stable-code-3b stabilityai/stable-zephyr-3b stabilityai/stablelm-2-zephyr-1_6b stabilityai/stablelm-3b-4e1t -# state-spaces/mamba-130m-hf: optimum - Trying to export a mamba model, that is a custom or unsupported architecture tiiuae/falcon-7b tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 -# xlnet/xlnet-base-cased: optimum - Trying to export a xlnet model, that is a custom or unsupported architecture # xverse/XVERSE-7B-Chat: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 # xverse/XVERSE-MoE-A4.2B: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 \ No newline at end of file diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 200cffc10..0b406fd60 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 200cffc10e3479b00006b613dc3c9fa48301177d +Subproject commit 0b406fd6080f930a0d4a7c068dae7372046daa9d From bf7b8bcb6744649aa473b713f9454aff9b89a704 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 27 May 2024 10:56:54 +0200 Subject: [PATCH 27/32] No PA for 350m --- .../cpp/continuous_batching/python/tests/models/real_models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 28a1cb6dd..94defc857 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -60,7 +60,7 @@ databricks/dolly-v2-3b facebook/opt-1.3b facebook/opt-125m facebook/opt-2.7b -facebook/opt-350m +# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) facebook/opt-6.7b google/gemma-1.1-7b-it google/gemma-2b From e8f9f973cc8ca2549a680832493dd51ac796dd42 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 27 May 2024 13:40:47 +0200 Subject: [PATCH 28/32] Applied comments. 
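
This patch moves out-of-memory detection into Scheduler::schedule(): when a pass schedules zero tokens, the scheduler flags every running group, the pipeline returns them as IGNORED results carrying whatever text was already produced, and _free_non_running_requests() drops both finished and out-of-memory groups. A sketch of how a caller can split the returned vector under that contract; the helper name is an assumption for illustration:

    #include <utility>
    #include <vector>
    #include "continuous_batching_pipeline.hpp"

    // first: fully finished requests, second: requests dropped because the KV cache could not hold them
    std::pair<std::vector<GenerationResult>, std::vector<GenerationResult>>
    split_by_status(std::vector<GenerationResult> results) {
        std::pair<std::vector<GenerationResult>, std::vector<GenerationResult>> out;
        for (auto& result : results) {
            if (result.m_status == GenerationResultStatus::FINISHED)
                out.first.push_back(std::move(result));
            else
                out.second.push_back(std::move(result));  // IGNORED (ABORTED is still unused)
        }
        return out;
    }
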
--- .../src/continuous_batching_pipeline.cpp | 17 +++++------------ .../library/src/scheduler.hpp | 7 +++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 54b3e9f53..1817edbc6 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -26,7 +26,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); { - static ManualTimer timer("detokenize"); + static ManualTimer timer("detokenize"); timer.start(); std::string output_text = tokenizer->decode(sequence->get_generated_ids()); timer.end(); @@ -76,17 +76,13 @@ class ContinuousBatchingPipeline::Impl { // current requests to process std::vector m_requests; - void _free_finished_requests() { + void _free_non_running_requests() { auto new_end = std::remove_if(m_requests.begin(), m_requests.end(), [] (SequenceGroup::CPtr seq_group) -> bool { - return seq_group->has_finished(); + return seq_group->has_finished() || seq_group->out_of_memory(); }); m_requests.erase(new_end, m_requests.end()); } - void _free_all_requests() { - m_requests.erase(m_requests.begin(), m_requests.end()); - } - public: Impl(const std::string& models_path, const SchedulerConfig& scheduler_config) { ov::Core core; @@ -162,9 +158,6 @@ class ContinuousBatchingPipeline::Impl { // if no tokens were scheduled, we are out of memory if (scheduler_output.m_total_num_scheduled_tokens == 0) { - for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { - m_requests[sequence_group_id]->set_out_of_memory(); - } // return partial results std::vector pertial_results; @@ -174,7 +167,7 @@ class ContinuousBatchingPipeline::Impl { pertial_results.push_back(from_sequence_group(m_tokenizer, sequence_group)); } - _free_all_requests(); + _free_non_running_requests(); return pertial_results; } @@ -239,7 +232,7 @@ class ContinuousBatchingPipeline::Impl { } } - _free_finished_requests(); + _free_non_running_requests(); timer.end(); } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index 8ae3cb721..fd7ff7185 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -57,6 +57,13 @@ class Scheduler { _clear_waiting_sequences(sequence_groups); + + // if no tokens were scheduled, we are out of memory + if (scheduler_output.m_total_num_scheduled_tokens == 0) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + sequence_groups[sequence_group_id]->set_out_of_memory(); + } + } return scheduler_output; } From 0965791dff32cd1081c9d36d390c322fd9dcac7a Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 27 May 2024 13:42:26 +0200 Subject: [PATCH 29/32] Minor correction. 
--- .../library/src/continuous_batching_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 1817edbc6..f6b224197 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -26,7 +26,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); { - static ManualTimer timer("detokenize"); + static ManualTimer timer("detokenize"); timer.start(); std::string output_text = tokenizer->decode(sequence->get_generated_ids()); timer.end(); From 302e638415a4fdf6190211eab2e0907ffaf63137 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 10:52:07 +0200 Subject: [PATCH 30/32] Added cache_size field in SchedulerConfig. --- .../library/CMakeLists.txt | 2 +- .../library/include/scheduler_config.hpp | 6 +++-- .../library/src/cache_manager.hpp | 9 ++++++- .../src/continuous_batching_pipeline.cpp | 10 ++++++-- .../library/src/device_config.hpp | 24 +++++++++++++++++-- 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt b/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt index 7f2f73dcf..129f770cc 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt @@ -67,7 +67,7 @@ FetchContent_MakeAvailable(googletest) set(TEST_TARGET_NAME "tests_continuous_batching") -add_executable(${TEST_TARGET_NAME} "src/tests/scheduler.cpp" "src/tests/block_manager.cpp" "src/tests/logit_filtering.cpp") +add_executable(${TEST_TARGET_NAME} "src/tests/scheduler.cpp" "src/tests/block_manager.cpp" "src/tests/logit_filtering.cpp" "src/tests/cache_manager.cpp") target_link_libraries(${TEST_TARGET_NAME} PUBLIC ${TARGET_NAME} openvino::runtime gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp index 5bdf163e7..ac7739cb4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp @@ -11,9 +11,11 @@ struct SchedulerConfig { // TODO: benchmark this value and understand a required value to ensure inference is not memory bound std::size_t max_num_batched_tokens = 16; - // TODO: specify size in GBs instead of number of KV blocks // total number of KV blocks available to scheduler logic - std::size_t num_kv_blocks = 500; + std::size_t num_kv_blocks = 0; + + // total size of KV cache in GB + std::size_t cache_size = 0; // block size for KV cache std::size_t block_size = 32; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp index 11e4dbb38..aa465421c 100644 --- 
a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp @@ -13,21 +13,24 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; + size_t m_allocated_bytes; public: explicit CacheManager(const DeviceConfig& device_config) : m_device_config(device_config) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); + m_allocated_bytes = 0; // Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); - + // force allocation std::memset(key_cache.data(), 0, key_cache.get_byte_size()); std::memset(value_cache.data(), 0, value_cache.get_byte_size()); + m_allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); @@ -81,4 +84,8 @@ class CacheManager { } } } + + size_t get_total_allocated_bytes() const { + return m_allocated_bytes; + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index f6b224197..68bf676ff 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -104,9 +104,15 @@ class ContinuousBatchingPipeline::Impl { infer_request.set_input_tensor(2 + decoder_layer_id * 2 + 1, m_cache_manager->get_value_cache(decoder_layer_id)); } - m_scheduler = std::make_shared(scheduler_config); + SchedulerConfig updated_config = scheduler_config; + // update KV number in scheduler config + if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) { + updated_config.num_kv_blocks = device_config.get_num_kv_blocks(); + } + + m_scheduler = std::make_shared(updated_config); // and finally create model runner - m_model_runner = std::make_shared(infer_request, scheduler_config); + m_model_runner = std::make_shared(infer_request, updated_config); m_sampler = std::make_shared(); m_sampler->set_seed(m_generation_config.rng_seed); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp index ac92c275f..240be4d9e 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp @@ -13,7 +13,9 @@ class DeviceConfig { ov::element::Type m_kv_cache_type; ov::Shape m_key_cache_shape, m_value_cache_shape; ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; - size_t m_num_kv_blocks, m_block_size; + size_t m_num_kv_blocks = 0; + size_t m_block_size = 0; + size_t m_cache_size = 0; std::string m_device; public: @@ -21,7 +23,6 @@ class DeviceConfig { m_device = device; // keep information about blocsk - m_num_kv_blocks = scheduling_config.num_kv_blocks; m_block_size = scheduling_config.block_size; if (m_device == "CPU") { @@ -32,6 +33,15 @@ class DeviceConfig { } else { 
OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } + + OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be specified."); + if (scheduling_config.num_kv_blocks > 0) { + m_num_kv_blocks = scheduling_config.num_kv_blocks; + } + else { + m_cache_size = scheduling_config.cache_size; + + } } void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) { @@ -39,6 +49,12 @@ class DeviceConfig { m_head_size = head_size; m_num_decoder_layers = num_decoder_layers; + if (m_num_kv_blocks == 0) { + OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be specified."); + size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; + m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); + } + m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks, m_num_kv_heads, m_block_size, @@ -66,4 +82,8 @@ class DeviceConfig { OPENVINO_ASSERT(!m_value_cache_shape.empty()); return m_value_cache_shape; } + + size_t get_num_kv_blocks() const { + return m_num_kv_blocks; + } }; From bb7eea0b4fa079ac8c285a8874dfceee93937d67 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 11:02:42 +0200 Subject: [PATCH 31/32] Removed not needed code. --- .../library/src/cache_manager.hpp | 9 +---- .../library/src/tests/cache_manager.cpp | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp index aa465421c..11e4dbb38 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp @@ -13,24 +13,21 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; - size_t m_allocated_bytes; public: explicit CacheManager(const DeviceConfig& device_config) : m_device_config(device_config) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); - m_allocated_bytes = 0; // Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); - + // force allocation std::memset(key_cache.data(), 0, key_cache.get_byte_size()); std::memset(value_cache.data(), 0, value_cache.get_byte_size()); - m_allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); @@ -84,8 +81,4 @@ class CacheManager { } } } - - size_t get_total_allocated_bytes() const { - return m_allocated_bytes; - } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp new file mode 100644 index 000000000..2fa479093 --- /dev/null +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp @@ -0,0 +1,36 @@ +// Copyright 
(C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include "device_config.hpp" +#include "cache_manager.hpp" + +TEST(TestCacheManager, general_test) { + ov::Core core; + SchedulerConfig scheduler_config = { + .max_num_batched_tokens = 32, + .num_kv_blocks = 0, + .cache_size = 2, + .block_size = 32, + .max_num_seqs = 2, + }; + + const std::string device = "CPU"; + DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + device_config.set_model_params(12, 64, num_decoder_layers); + + auto cache_manager = std::make_shared(device_config); + + size_t allocated_bytes = 0; + for (size_t i = 0; i < num_decoder_layers; i++) { + auto key_cache = cache_manager->get_key_cache(i); + auto value_cache = cache_manager->get_value_cache(i); + allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + } + + ASSERT_EQ(allocated_bytes, 2146959360); +} From 1703dbd6929a45e0a3d399e3c649b0652b5b2a64 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 11:10:35 +0200 Subject: [PATCH 32/32] Minor correction. --- .../cpp/continuous_batching/library/src/device_config.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp index 240be4d9e..010d9b2ba 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp @@ -34,7 +34,7 @@ class DeviceConfig { OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } - OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be specified."); + OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); if (scheduling_config.num_kv_blocks > 0) { m_num_kv_blocks = scheduling_config.num_kv_blocks; } @@ -50,7 +50,7 @@ class DeviceConfig { m_num_decoder_layers = num_decoder_layers; if (m_num_kv_blocks == 0) { - OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be specified."); + OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); }
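
For reference, a quick way to sanity-check the expected value in the new cache_manager test: with the block-count formula PATCH 30 adds to device_config.hpp, cache_size = 2 GB, 12 decoder layers, and the model parameters the test passes to set_model_params (12 KV heads, head size 64, block size 32), plus an assumed 4-byte (f32) KV-cache element type on CPU (the precision itself is not visible in these hunks, but it is the only element size consistent with the asserted value), the total allocation works out to the asserted 2146959360 bytes. A standalone sketch under those assumptions:

#include <cstddef>
#include <cstdio>

int main() {
    // Scheduler / model parameters taken from the cache_manager test above.
    const std::size_t cache_size_gb = 2, num_decoder_layers = 12, num_kv_heads = 12;
    const std::size_t head_size = 64, block_size = 32;
    const std::size_t elem_size = 4;  // assumed f32 KV-cache precision on CPU

    // Bytes consumed by one KV block across all layers (key + value caches).
    const std::size_t bytes_per_block =
        num_decoder_layers * 2 * num_kv_heads * block_size * head_size * elem_size;  // 2359296

    // Same computation as DeviceConfig::set_model_params in PATCH 30.
    const std::size_t size_in_bytes = cache_size_gb * 1024 * 1024 * 1024;            // 2147483648
    const std::size_t num_kv_blocks = size_in_bytes / bytes_per_block;               // 910
    const std::size_t allocated     = num_kv_blocks * bytes_per_block;               // 2146959360

    std::printf("blocks: %zu, allocated bytes: %zu\n", num_kv_blocks, allocated);
}

The integer division rounds the block count down to 910, so the cache manager allocates slightly less than the requested 2 GB, which is exactly what ASSERT_EQ(allocated_bytes, 2146959360) encodes.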