From 26d178f52653261397d59d2010b0e4a658eb86bc Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 21 May 2024 17:55:05 +0200 Subject: [PATCH 01/32] Removed max_padding --- .../apps/accuracy_sample.cpp | 1 - .../apps/throughput_benchmark.cpp | 2 - .../library/include/scheduler_config.hpp | 4 -- .../library/src/scheduler.hpp | 6 -- .../library/src/tests/scheduler.cpp | 8 --- .../cpp/continuous_batching/python/python.cpp | 3 +- .../python/tests/models/real_models | 55 +++++++++---------- 7 files changed, 27 insertions(+), 52 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index 2a2a841d7..e7cdaa7f3 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -71,7 +71,6 @@ int main(int argc, char* argv[]) try { .dynamic_split_fuse = dynamic_split_fuse, // vLLM specific params .max_num_seqs = 2, - .max_paddings = 8, }; ContinuousBatchingPipeline pipe(models_path, scheduler_config); diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp index 595d1aba7..d29933806 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp @@ -181,7 +181,6 @@ int main(int argc, char* argv[]) try { .block_size = 32, .dynamic_split_fuse = dynamic_split_fuse, .max_num_seqs = 256, // not used if dynamic_split_fuse=True - .max_paddings = 256, // not used if dynamic_split_fuse=True }; std::cout << "Benchmarking parameters: " << std::endl; @@ -189,7 +188,6 @@ int main(int argc, char* argv[]) try { std::cout << "\tScheduling type: " << (scheduler_config.dynamic_split_fuse ? "dynamic split-fuse" : "vLLM") << std::endl; if (!scheduler_config.dynamic_split_fuse) { std::cout << "\tMax number of batched sequences: " << scheduler_config.max_num_seqs << std::endl; - std::cout << "\tMax number of padding tokens within prompt batch: " << scheduler_config.max_paddings << std::endl; } std::cout << "Dataset parameters: " << std::endl; std::cout << "\tNum prompts: " << num_prompts << std::endl; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp index eebcdc2fb..5bdf163e7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp @@ -27,8 +27,4 @@ struct SchedulerConfig { // max number of scheduled sequences (you can think of it as "max batch size") std::size_t max_num_seqs = 256; - // max number of padding tokens applied when we schedule a prompt phase - // e.g. 
if total number of padded tokens within a batch a greater than this value, then - // new sequnce is not added to batch - std::size_t max_paddings = 256; }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index ff1f9b84a..5890cc78b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -309,7 +309,6 @@ class Scheduler { // Current scheduling method schedules prompts only in a manner similar to vLLM: // - Limits max batch size by: // - max_num_seqs (256 in vLLM's defaults) - // - max_paddings (256 in vLLM's defaults) // - max_num_batched_tokens (max_model_length (and at least 2048) in vLLM's defaults) OPENVINO_ASSERT(!m_config.dynamic_split_fuse, "Internal error: we are in vLLM scheduling"); @@ -345,11 +344,6 @@ class Scheduler { if (num_available_tokens_in_megabatch < max_sequence_len) break; - // apply max padding tokens limitations - size_t total_num_paddings = max_sequence_len * (scheduler_output.m_scheduled_sequence_groups_ids.size() + 1) - (num_scheduled_tokens + sequence_len); - if (total_num_paddings > m_config.max_paddings) - break; - // apply KV cache limitations const size_t num_required_blocks = (sequence_len + m_config.block_size - 1) / m_config.block_size; if (!m_block_manager.can_allocate_blocks(num_required_blocks)) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp index 3b0ae698c..f2aa62586 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp @@ -25,7 +25,6 @@ TEST(TestScheduler, general_test) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -33,7 +32,6 @@ TEST(TestScheduler, general_test) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -123,7 +121,6 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -131,7 +128,6 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -195,7 +191,6 @@ TEST(TestScheduler, test_partial_preemption) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -203,7 +198,6 @@ TEST(TestScheduler, test_partial_preemption) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { @@ -294,7 +288,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { .block_size = 4, .dynamic_split_fuse = false, .max_num_seqs = 5, - .max_paddings = 8, }, SchedulerConfig { .max_num_batched_tokens = 32, @@ -302,7 +295,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { .block_size = 4, .dynamic_split_fuse = true, .max_num_seqs = 5, - .max_paddings = 8, } }; for (auto scheduler_config: configs) { diff --git 
a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp b/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp index ca3d6f5d3..583efb971 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp @@ -90,8 +90,7 @@ PYBIND11_MODULE(py_continuous_batching, m) { .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) .def_readwrite("block_size", &SchedulerConfig::block_size) .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) - .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) - .def_readwrite("max_paddings", &SchedulerConfig::max_paddings); + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); py::class_(m, "ContinuousBatchingPipeline") .def(py::init()) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index fefe45983..44d2897b1 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -1,8 +1,8 @@ -# Set of models with accuracy issues, because of PA: +# Set of models with accuracy issues, because of PA EleutherAI/pythia-160m bigscience/bloomz-1b7 bigscience/bloomz-560m -databricks/dolly-v2-3b +# XEON: databricks/dolly-v2-3b tiiuae/falcon-rw-7b bigcode/starcoder2-3b openbmb/MiniCPM-2B-sft-bf16 @@ -16,30 +16,28 @@ google/pegasus-big_patent google/pegasus-large # # Set of models, which require support in optimum-intel: -# optimum-intel: Trying to export a RefinedWebModel model, that is a custom or unsupported architecture: nomic-ai/gpt4all-falcon -# optimum-intel: Trying to export a internlm model, that is a custom or unsupported architecture: internlm/internlm-chat-7b -# optimum-intel: Trying to export a mosaic-gpt model, that is a custom or unsupported architecture: mosaicml/mpt-1b-redpajama-200b -# optimum-intel: AttributeError: Could not find the attribute named "num_key_value_heads" in the normalized config: BAAI/Aquila-7B -# optimum-intel: PermissionError: [Errno 13] Permission denied: internlm/internlm2-7b -# optimum-intel: AttributeError: 'NoneType' object has no attribute 'device': Salesforce/codegen2-1b -# optimum-intel: TypeError: Object of type method is not JSON serializable: Salesforce/xgen-7b-8k-base -# optimum-intel: IndexError: tuple index out of range: facebook/blenderbot-3B +internlm/internlm-chat-7b +BAAI/Aquila-7B +internlm/internlm2-7b +Salesforce/codegen2-1b +Salesforce/xgen-7b-8k-base +facebook/blenderbot-3B # # Set of models, failed because of CPU limitation # CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b # # Set of failed models, because of PA: -# Exception from src/core/src/shape_util.cpp:65: BAAI/AquilaChat2-7B -# Exception from src/core/src/shape_util.cpp:65: BAAI/AquilaChat-7B -# Exception from src/core/src/shape_util.cpp:65: baichuan-inc/Baichuan-7B -# Exception from src/core/src/shape_util.cpp:65: tiiuae/falcon-7b -# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-128k-instruct -# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-4k-instruct -# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/starcoderbase-3b -# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/gpt_bigcode-santacoder -# 
RuntimeError: Check 'unregistered_parameters.str().empty()': nomic-ai/gpt4all-mpt -# RuntimeError: Check 'unregistered_parameters.str().empty()': mosaicml/mpt-7b -# RuntimeError: Check 'unregistered_parameters.str().empty()': facebook/opt-350m +BAAI/AquilaChat2-7B +BAAI/AquilaChat-7B +baichuan-inc/Baichuan-7B +tiiuae/falcon-7b +microsoft/Phi-3-mini-128k-instruct +microsoft/Phi-3-mini-4k-instruct +bigcode/starcoderbase-3b +bigcode/gpt_bigcode-santacoder +nomic-ai/gpt4all-mpt +mosaicml/mpt-7b +facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B @@ -59,8 +57,8 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat # big model, not tried: xverse/XVERSE-MoE-A4.2B # # Set of passed models: -microsoft/phi-2 -microsoft/phi-1_5 +# XEON: microsoft/phi-2 +# XEON: microsoft/phi-1_5 EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B @@ -75,8 +73,8 @@ openai-community/gpt2-xl gpt2 gpt2-xl nomic-ai/gpt4all-j -stabilityai/stablelm-3b-4e1t -stabilityai/stablelm-2-zephyr-1_6b +# Xeon: stabilityai/stablelm-3b-4e1t +# Xeon: stabilityai/stablelm-2-zephyr-1_6b meta-llama/Llama-2-7b-hf meta-llama/Meta-Llama-3-8B-Instruct meta-llama/CodeLlama-7b-hf @@ -95,7 +93,6 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# HF: DeciCoderAttention.forward() got an unexpected keyword argument 'cache_position': Deci/DeciCoder-1b -# HF: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5 -# HF: The generation config instance is invalid -- `.validate(): lmsys/vicuna-7b-v1.5 -# HF: The generation config instance is invalid -- `.validate(): lmsys/longchat-7b-v1.5-32k \ No newline at end of file +# Xeon: openchat/openchat_3.5 +lmsys/vicuna-7b-v1.5 +lmsys/longchat-7b-v1.5-32k \ No newline at end of file From 1aa23fe86d200b4f92a4be22842cfdb104844958 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 22 May 2024 15:12:55 +0200 Subject: [PATCH 02/32] Updated tokenizers --- .../python/tests/models/real_models | 16 ++++++++-------- .../python/tests/requirements.txt | 7 ++++++- thirdparty/openvino_tokenizers | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 44d2897b1..e9ae2f441 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -2,7 +2,7 @@ EleutherAI/pythia-160m bigscience/bloomz-1b7 bigscience/bloomz-560m -# XEON: databricks/dolly-v2-3b +databricks/dolly-v2-3b tiiuae/falcon-rw-7b bigcode/starcoder2-3b openbmb/MiniCPM-2B-sft-bf16 @@ -25,6 +25,8 @@ facebook/blenderbot-3B # # Set of models, failed because of CPU limitation # CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b +# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b +# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 # # Set of failed models, because of PA: BAAI/AquilaChat2-7B @@ -57,8 +59,8 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat # big model, not tried: xverse/XVERSE-MoE-A4.2B # # Set of passed models: -# XEON: 
microsoft/phi-2 -# XEON: microsoft/phi-1_5 +microsoft/phi-2 +microsoft/phi-1_5 EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B @@ -73,14 +75,12 @@ openai-community/gpt2-xl gpt2 gpt2-xl nomic-ai/gpt4all-j -# Xeon: stabilityai/stablelm-3b-4e1t -# Xeon: stabilityai/stablelm-2-zephyr-1_6b +stabilityai/stablelm-3b-4e1t +stabilityai/stablelm-2-zephyr-1_6b meta-llama/Llama-2-7b-hf meta-llama/Meta-Llama-3-8B-Instruct meta-llama/CodeLlama-7b-hf lmsys/vicuna-7b-v1.3 -openlm-research/open_llama_3b -openlm-research/open_llama_3b_v2 mistralai/Mistral-7B-v0.1 mistralai/Mistral-7B-Instruct-v0.1 allenai/OLMo-1B-hf @@ -93,6 +93,6 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# Xeon: openchat/openchat_3.5 +openchat/openchat_3.5 lmsys/vicuna-7b-v1.5 lmsys/longchat-7b-v1.5-32k \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index d49f1b043..4a94dad33 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -20,6 +20,7 @@ bitsandbytes # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat # - mosaicml/mpt-7b +# - internlm/internlm2-7b einops # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat @@ -27,4 +28,8 @@ transformers_stream_generator # - openbmb/MiniCPM-V-2 torchvision # - openbmb/MiniCPM-V-2 -timm \ No newline at end of file +timm +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - Salesforce/xgen-7b-8k-base +tiktoken \ No newline at end of file diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index c75450346..200cffc10 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1 +Subproject commit 200cffc10e3479b00006b613dc3c9fa48301177d From a44e0aa16a62179adab5cd6c2be561bb291a762d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 May 2024 16:53:42 +0200 Subject: [PATCH 03/32] Endless loop fix. 
--- .../src/continuous_batching_pipeline.cpp | 24 +++++++- .../library/src/scheduler.hpp | 18 ++++-- .../library/src/sequence_group.hpp | 54 +++++++++++++++++- .../python/tests/common.py | 6 +- .../python/tests/test_preemption.py | 55 ++++++++++++++++--- 5 files changed, 141 insertions(+), 16 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 3bfd2dabf..40b1d0223 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -147,6 +147,14 @@ class ContinuousBatchingPipeline::Impl { timer.end(); } + // if no tokens were scheduled, we are out of memory + if (scheduler_output.m_total_num_scheduled_tokens == 0) { + for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { + m_requests[sequence_group_id]->set_out_of_memory(); + } + return {}; + } + ov::Tensor logits; { static ManualTimer timer("forward"); @@ -194,7 +202,6 @@ class ContinuousBatchingPipeline::Impl { } // perform post-processing of current step - std::vector currently_finished_requests; { static ManualTimer timer("create finished results"); @@ -221,6 +228,14 @@ class ContinuousBatchingPipeline::Impl { return !m_requests.empty(); } + bool out_of_memory() const { + for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { + if (m_requests[sequence_group_id]->out_of_memory()) + return true; + } + return false; + } + std::vector generate(const std::vector prompts, std::vector sampling_params) { OPENVINO_ASSERT(!has_running_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(prompts.size() == sampling_params.size()); @@ -232,11 +247,14 @@ class ContinuousBatchingPipeline::Impl { std::vector results; results.reserve(m_requests.size()); - while (has_running_requests()) { + while (has_running_requests() && !out_of_memory()) { std::vector partial_results = step(); - results.insert(results.end(), partial_results.begin(), partial_results.end()); + if (partial_results.size() > 0) + results.insert(results.end(), partial_results.begin(), partial_results.end()); } + OPENVINO_ASSERT(!out_of_memory(), "Not enough memory for processing the requests."); + // sort results according to request_id to return results in order of initial prompts std::sort(results.begin(), results.end(), [] (const GenerationResult& r1, const GenerationResult& r2) -> bool { return r1.m_request_id < r2.m_request_id; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index ff1f9b84a..1cc357f7a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -55,6 +55,8 @@ class Scheduler { } } + _clear_waiting_sequences(sequence_groups); + return scheduler_output; } @@ -104,9 +106,10 @@ class Scheduler { m_block_manager.free_sequence(seq_id); } sequence_group->reset(); + sequence_group->set_waiting(); return m_block_manager.num_free_blocks() > prev_blocks_count; } - + // currently partial preemtion is enabled only for single running sequence case // TODO: implement partial preemption for case with muliple sequences in group for (size_t s = 0; s < num_running_sequences; ++s) { @@ -150,6 +153,7 @@ class Scheduler { m_block_manager.free_sequence(seq_id); } sequence_group->preempt_tokens(preempted_tokens); + sequence_group->set_waiting(); return total_num_released_blocks > 0; } @@ -197,7 +201,7 @@ class Scheduler { for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; - if (!sequence_group->can_generate_tokens()) { + if (!sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { size_t num_running_seqs = sequence_group->num_running_seqs(); // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); @@ -249,7 +253,7 @@ class Scheduler { // Question: do we need to schedule preeempted first as it's done in vLLM? // Answer: preempted sequences have low priority, so they should be after "running" ones. 
So, here we // keep latencies for sequence groups of high priority - if (sequence_group->can_generate_tokens()) { + if (sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { OPENVINO_ASSERT(!sequence_group->has_finished()); size_t num_running_seqs = sequence_group->num_running_seqs(); size_t num_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; @@ -322,7 +326,7 @@ class Scheduler { for (size_t sequence_group_id = 0, num_scheduled_tokens = 0, max_sequence_len = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; - if (!sequence_group->can_generate_tokens()) { + if (!sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { size_t num_running_seqs = sequence_group->num_running_seqs(); // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); @@ -381,4 +385,10 @@ class Scheduler { } } } + + void _clear_waiting_sequences(const std::vector& sequence_groups) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + sequence_groups[sequence_group_id]->clear_waiting_sequences(); + } + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 29b0af513..c49a88a5a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -10,7 +10,9 @@ enum class SequenceStatus { RUNNING = 0, - FINISHED = 1 + FINISHED = 1, + OUT_OF_MEMORY = 2, + WAITING = 3 }; using TokenIds = std::vector; @@ -65,6 +67,14 @@ class Sequence { return m_status == SequenceStatus::RUNNING; } + bool out_of_memory() const { + return m_status == SequenceStatus::OUT_OF_MEMORY; + } + + bool is_waiting() const { + return m_status == SequenceStatus::WAITING; + } + void set_status(SequenceStatus status) { m_status = status; } @@ -279,6 +289,14 @@ class SequenceGroup { clear_scheduled_tokens(); } + void clear_waiting_sequences() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + m_sequences[seq_id]->set_status(SequenceStatus::RUNNING); + } + } + } + const TokenIds& get_prompt_ids() const { return m_prompt_ids; } @@ -321,4 +339,38 @@ class SequenceGroup { return false; return true; } + + void set_out_of_memory() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::OUT_OF_MEMORY); + } + } + } + + void set_waiting() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::WAITING); + } + } + } + + bool out_of_memory() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->out_of_memory()) { + return true; + } + } + return false; + } + + bool is_waiting() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + return true; + } + } + return false; + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 724c7cf71..e9b483b31 100644 --- 
a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -237,10 +237,14 @@ def generate_and_compare_with_reference_text(model_path: Path, prompts: List[str for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): assert ref_text == ov_text -def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None): +def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) + if generation_config is not None: + generation_config.rng_seed = 0 + generation_configs = [generation_config] * len(prompts) + _generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index 781d016d3..e0ba5b6ad 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -1,13 +1,54 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + import pytest +from dataclasses import dataclass +from py_continuous_batching import GenerationConfig, GenerationResult +from typing import List -from common import run_test_pipeline, get_models_list +from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ + DEFAULT_SCHEDULER_CONFIG, get_scheduler_config, run_test_pipeline, get_models_list, get_beam_search, get_greedy, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from test_sampling import RandomSamplingTestStruct -scheduler_params_list = [{"num_kv_blocks": 300, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, - {"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, # test preemption for dynamic_split_fuse - {"num_kv_blocks": 40, "block_size": 4, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}] # test preemption for vllm -@pytest.mark.parametrize("scheduler_params", scheduler_params_list) +scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), # output text does not match due to <\s> symbols problem + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] # output text does not match due to <\s> symbols problem +@pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit -def test_preemption(tmp_path, scheduler_params): - run_test_pipeline(tmp_path, "facebook/opt-125m", scheduler_params) \ No newline at end of file +def test_preemption(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], 
params[1]) + + +@pytest.mark.precommit +def test_out_of_memory(tmp_path): + with pytest.raises(RuntimeError) as excinfo: + run_test_pipeline(tmp_path, "facebook/opt-125m", {"num_kv_blocks": 1}) + assert "Not enough memory for processing the requests." in str(excinfo.value) + +multinomial_params = RandomSamplingTestStruct(generation_config=[get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k()], + prompts=["What is OpenVINO?", + "How are you?", + "Tell me something about Canada?", + ], + ref_texts=[ ["\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is"], + [" You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far"], + ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) + +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params.generation_config + for config in generation_configs: + config.rng_seed = 0 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) From e0815cf5401f173f670731d9512f7963985a6769 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 May 2024 17:07:26 +0200 Subject: [PATCH 04/32] Minor correction. 
--- .../library/src/continuous_batching_pipeline.cpp | 1 + .../causal_lm/cpp/continuous_batching/library/src/scheduler.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 40b1d0223..e2919c5d2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -202,6 +202,7 @@ class ContinuousBatchingPipeline::Impl { } // perform post-processing of current step + std::vector currently_finished_requests; { static ManualTimer timer("create finished results"); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index 1cc357f7a..e3bdb0c2a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -109,7 +109,7 @@ class Scheduler { sequence_group->set_waiting(); return m_block_manager.num_free_blocks() > prev_blocks_count; } - + // currently partial preemtion is enabled only for single running sequence case // TODO: implement partial preemption for case with muliple sequences in group for (size_t s = 0; s < num_running_sequences; ++s) { From 02045f51f7cc4ea2012b5fd4e1c4404648951bcf Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 22 May 2024 20:15:49 +0200 Subject: [PATCH 05/32] Updated list of models --- .../python/tests/common.py | 2 +- .../python/tests/models/real_models | 61 ++++++++++--------- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 724c7cf71..3f2187486 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -191,7 +191,7 @@ def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, add_special_tokens=False) serialize(tokenizer, model_path / "openvino_tokenizer.xml") serialize(detokenizer, model_path / "openvino_detokenizer.xml") diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index e9ae2f441..0a11e4f39 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -11,52 +11,60 @@ Qwen/Qwen-7B Qwen/Qwen-7B-Chat Qwen/Qwen1.5-0.5B Qwen/Qwen1.5-7B-Chat -rinna/bilingual-gpt-neox-4b -google/pegasus-big_patent -google/pegasus-large -# -# Set of models, which require support in optimum-intel: internlm/internlm-chat-7b BAAI/Aquila-7B internlm/internlm2-7b -Salesforce/codegen2-1b -Salesforce/xgen-7b-8k-base -facebook/blenderbot-3B -# -# Set of models, failed because of CPU limitation -# CPU: head 
size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b -# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b -# CPU: head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 -# -# Set of failed models, because of PA: +openchat/openchat_3.5 +lmsys/vicuna-7b-v1.5 +lmsys/longchat-7b-v1.5-32k BAAI/AquilaChat2-7B BAAI/AquilaChat-7B baichuan-inc/Baichuan-7B tiiuae/falcon-7b microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct -bigcode/starcoderbase-3b -bigcode/gpt_bigcode-santacoder +microsoft/Phi-3-mini-4k-instruct# nomic-ai/gpt4all-mpt mosaicml/mpt-7b -facebook/opt-350m +# Set of models, failed because of C++ Cont. Batching +# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B +# +# Set of models, which require support in optimum-intel / transformers / models repositories: +# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base +# Trying to export a jais model, that is a custom or unsupported architecture: core42/jais-13b-chat +# IndexError: tuple index out of range: facebook/blenderbot-3B +# `pip install flash_attn`: OrionStarAI/Orion-14B-Base +# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-1B-hf +# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-7B-hf +# +# Set of models, failed because of CPU limitation +# head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b +# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b +# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 +# +# Set of failed models, because of PA: +# 'start' input is not a scalar: google/pegasus-big_patent +# 'start' input is not a scalar: google/pegasus-large +# 'stop' input is not a scalar: Salesforce/codegen2-1b +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/starcoderbase-3b +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/gpt_bigcode-santacoder +# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: -# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B +# https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b # big model, not tried: core42/jais-13b -core42/jais-13b-chat +# see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 # big model, not tried: mistralai/Mixtral-8x7B-Instruct-v0.1 # big model, not tried: mosaicml/mpt-30b -OrionStarAI/Orion-14B-Base +# see optimum: OrionStarAI/Orion-14B-Base # big model, not tried: OrionStarAI/Orion-14B-Chat # big model, not tried: Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat -# big model, not tried: xverse/XVERSE-MoE-A4.2B +xverse/XVERSE-MoE-A4.2B # # Set of passed models: microsoft/phi-2 @@ -83,8 +91,6 @@ meta-llama/CodeLlama-7b-hf lmsys/vicuna-7b-v1.3 mistralai/Mistral-7B-v0.1 mistralai/Mistral-7B-Instruct-v0.1 -allenai/OLMo-1B-hf -allenai/OLMo-7B-hf 01-ai/Yi-6B Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl @@ -92,7 +98,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov # # Set of invalid models, because of HF: -# HF: Exception: data did not match 
any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -openchat/openchat_3.5 -lmsys/vicuna-7b-v1.5 -lmsys/longchat-7b-v1.5-32k \ No newline at end of file +# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat \ No newline at end of file From 6f60f9156e0b7a484962cede253ef1ffd41612aa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 22 May 2024 20:49:41 +0200 Subject: [PATCH 06/32] Proper flag to skip special tokens --- .../causal_lm/cpp/continuous_batching/python/tests/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 3f2187486..df27e6d4a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -191,7 +191,7 @@ def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, add_special_tokens=False) + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) serialize(tokenizer, model_path / "openvino_tokenizer.xml") serialize(detokenizer, model_path / "openvino_detokenizer.xml") From ec3ac962f9ee375f7c2f356165255bcc8b3732b6 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 23 May 2024 11:21:21 +0200 Subject: [PATCH 07/32] Updated models list --- .../python/tests/models/real_models | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 0a11e4f39..cf7785e30 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -22,9 +22,10 @@ BAAI/AquilaChat-7B baichuan-inc/Baichuan-7B tiiuae/falcon-7b microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct# +microsoft/Phi-3-mini-4k-instruct nomic-ai/gpt4all-mpt mosaicml/mpt-7b +mosaicml/mpt-7b-chat # Set of models, failed because of C++ Cont. 
Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # @@ -58,11 +59,13 @@ EleutherAI/gpt-neox-20b # see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 -# big model, not tried: mistralai/Mixtral-8x7B-Instruct-v0.1 -# big model, not tried: mosaicml/mpt-30b +mistralai/Mixtral-8x7B-Instruct-v0.1 +mosaicml/mpt-30b # see optimum: OrionStarAI/Orion-14B-Base # big model, not tried: OrionStarAI/Orion-14B-Chat -# big model, not tried: Qwen/Qwen1.5-MoE-A2.7B +CohereForAI/c4ai-command-r-v01 +openlm-research/open_llama_13b +Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat xverse/XVERSE-MoE-A4.2B # @@ -98,4 +101,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov # # Set of invalid models, because of HF: -# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat \ No newline at end of file +# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat From a553d49f9d56cb0206762f1218563304922a702f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 12:18:36 +0200 Subject: [PATCH 08/32] PyTest config. --- .../cpp/continuous_batching/python/tests/.pytest.ini | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini b/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini new file mode 100644 index 000000000..7bc73fe85 --- /dev/null +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini @@ -0,0 +1,5 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +[pytest] +addopts = -m precommit \ No newline at end of file From 96bf758e5cb57cf004804860bb4958116177879c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 12:24:37 +0200 Subject: [PATCH 09/32] Mark for real_models. --- .../cpp/continuous_batching/python/tests/test_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 50e917253..14a2e8295 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -22,7 +22,7 @@ def test_sampling_precommit(tmp_path, model_id): def test_sampling_nightly(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) - +@pytest.mark.real_models @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) def test_real_models(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) From fba8ac6b3c7aeed0901c38834953babae850d7c2 Mon Sep 17 00:00:00 2001 From: Sergey Lyalin Date: Thu, 23 May 2024 12:11:09 +0000 Subject: [PATCH 10/32] Fix for models where k and v merged into a single variable per decoder layer. 
--- .../library/src/paged_attention_transformations.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp index ad02f279d..887cdbd38 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp @@ -12,15 +12,14 @@ void apply_paged_attention_transformations(std::shared_ptr model, Dev const ov::op::util::VariableVector& variables = model->get_variables(); OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful"); - // number of variables is 2 (K and V) multiplied by number of decoder layers - size_t num_layers = variables.size() >> 1; - - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(model); + ov::pass::SDPAToPagedAttention().run_on_model(model); const ov::ParameterVector& parameters = model->get_parameters(); + size_t num_layers = std::count_if(parameters.begin(), parameters.end(), [](std::shared_ptr parameter) { + return parameter->get_friendly_name().find("key_cache.") == 0; + }); + // extract num_kv_heads and head_size size_t kv_caches_inputs_offset = 2; ov::PartialShape k_shape = parameters[kv_caches_inputs_offset]->get_partial_shape(); From d79d7fd74bfe9a1ec37ac64325672f0e44f1603f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 May 2024 14:11:51 +0200 Subject: [PATCH 11/32] Tests correction. --- .../cpp/continuous_batching/python/tests/test_preemption.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index e0ba5b6ad..5300296d2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -13,8 +13,8 @@ scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), # output text does not match due to <\s> symbols problem - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] # output text does not match due to <\s> symbols problem + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): @@ -36,7 +36,7 @@ def test_out_of_memory(tmp_path): ], ref_texts=[ ["\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is"], [" You're getting much better results from doing this, 
than you are by not doing this. I have a BH and I was so far"], - ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) + ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) @pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit From 39916cafbcba0c7d25ab6c5a9a9361d2ef6eb864 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 23 May 2024 22:30:15 +0200 Subject: [PATCH 12/32] Adjust real models list --- .../python/tests/models/real_models | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index cf7785e30..a7c637a3b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -26,16 +26,16 @@ microsoft/Phi-3-mini-4k-instruct nomic-ai/gpt4all-mpt mosaicml/mpt-7b mosaicml/mpt-7b-chat +bigcode/starcoderbase-3b +bigcode/gpt_bigcode-santacoder +allenai/OLMo-1B-hf +allenai/OLMo-7B-hf # Set of models, failed because of C++ Cont. Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # # Set of models, which require support in optimum-intel / transformers / models repositories: -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base -# Trying to export a jais model, that is a custom or unsupported architecture: core42/jais-13b-chat # IndexError: tuple index out of range: facebook/blenderbot-3B # `pip install flash_attn`: OrionStarAI/Orion-14B-Base -# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-1B-hf -# Xeon only: reports IP_ADDRESS on Optimum inference: allenai/OLMo-7B-hf # # Set of models, failed because of CPU limitation # head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b @@ -46,8 +46,8 @@ mosaicml/mpt-7b-chat # 'start' input is not a scalar: google/pegasus-big_patent # 'start' input is not a scalar: google/pegasus-large # 'stop' input is not a scalar: Salesforce/codegen2-1b -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/starcoderbase-3b -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): bigcode/gpt_bigcode-santacoder +# 'stop' input is not a scalar: core42/jais-13b +# 'stop' input is not a scalar: core42/jais-13b-chat # Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m # # Set of models, failed because of OpenVINO Tokenizers: @@ -55,8 +55,6 @@ mosaicml/mpt-7b-chat # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b -# big model, not tried: core42/jais-13b -# see optimum: core42/jais-13b-chat # big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -73,7 +71,6 @@ xverse/XVERSE-MoE-A4.2B microsoft/phi-2 microsoft/phi-1_5 EleutherAI/gpt-neo-125m -EleutherAI/gpt-neo-125m EleutherAI/gpt-neo-1.3B EleutherAI/gpt-j-6b baichuan-inc/Baichuan2-7B-Chat @@ -102,3 +99,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat +# 
https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base From 41cc93011c49bac48e15f2aab7b5c6f98dc4e600 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 24 May 2024 12:36:01 +0200 Subject: [PATCH 13/32] Added mre models --- .../python/tests/models/real_models | 28 ++++++++++++++++++- .../python/tests/requirements.txt | 6 +++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index a7c637a3b..40c261fe7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -30,12 +30,32 @@ bigcode/starcoderbase-3b bigcode/gpt_bigcode-santacoder allenai/OLMo-1B-hf allenai/OLMo-7B-hf +PygmalionAI/pygmalion-6b +stabilityai/stable-code-3b +berkeley-nest/Starling-LM-7B-alpha +EleutherAI/gpt-neo-2.7B +databricks/dolly-v1-6b +openai-community/gpt2-large +openai-community/gpt2-medium +bigscience/bloom-7b1 +facebook/opt-1.3b +facebook/opt-2.7b +GAIR/Abel-7B-002 +google/gemma-1.1-7b-it +google/gemma-2b-it +microsoft/DialoGPT-large +microsoft/DialoGPT-medium +Qwen/Qwen1.5-1.8B +microsoft/Orca-2-7b # Set of models, failed because of C++ Cont. Batching # RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B # # Set of models, which require support in optimum-intel / transformers / models repositories: # IndexError: tuple index out of range: facebook/blenderbot-3B # `pip install flash_attn`: OrionStarAI/Orion-14B-Base +# ValueError: Trying to export a fuyu model, that is a custom or unsupported architecture: adept/fuyu-8b +# ValueError: Trying to export a mamba model, that is a custom or unsupported architecture: state-spaces/mamba-130m-hf +# ValueError: Trying to export a xlnet model, that is a custom or unsupported architecture: xlnet/xlnet-base-cased # # Set of models, failed because of CPU limitation # head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b @@ -52,6 +72,9 @@ allenai/OLMo-7B-hf # # Set of models, failed because of OpenVINO Tokenizers: # https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b +# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-33b-instruct +# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-6.7b-instruct +# Tokenizer type is not supported: : microsoft/biogpt # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b @@ -66,6 +89,9 @@ openlm-research/open_llama_13b Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat xverse/XVERSE-MoE-A4.2B +cerebras/Cerebras-GPT-13B +WizardLMTeam/WizardCoder-15B-V1.0 +TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ # # Set of passed models: microsoft/phi-2 @@ -99,4 +125,4 @@ togethercomputer/RedPajama-INCITE-Chat-3B-v1 # # Set of invalid models, because of HF: # HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base +# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt 
b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 4a94dad33..568b6886b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -32,4 +32,8 @@ timm # - Qwen/Qwen-7B # - Qwen/Qwen-7B-Chat # - Salesforce/xgen-7b-8k-base -tiktoken \ No newline at end of file +tiktoken +# - microsoft/biogpt +sacremoses +# - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ +auto-gptq \ No newline at end of file From 0c2e335be171f3a522aeb0befe7f7a4b0360eeeb Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 24 May 2024 13:38:06 +0200 Subject: [PATCH 14/32] Drop koala model --- .../cpp/continuous_batching/python/tests/models/real_models | 1 - 1 file changed, 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 40c261fe7..b179c6058 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -78,7 +78,6 @@ microsoft/Orca-2-7b # # Set of 13B, 30B abd 70B models: EleutherAI/gpt-neox-20b -# big model, not tried: young-geng/koala mistralai/Mixtral-8x7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 mosaicml/mpt-30b From d717e01eefdb1d8ff47b26199c5d3fbc0ed4b2ea Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 16:49:54 +0200 Subject: [PATCH 15/32] Added generation statuses. --- .../include/continuous_batching_pipeline.hpp | 9 ++++ .../src/continuous_batching_pipeline.cpp | 43 ++++++++++++------- .../library/src/sequence_group.hpp | 6 +-- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp index 5afd0e715..b3701d436 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp @@ -9,6 +9,12 @@ #include "tokenizer.hpp" #include "generation_config.hpp" +enum class GenerationResultStatus { + FINISHED = 0, + IGNORED = 1, + ABORTED = 2 // Currently not used, TODO: implement abort functionality +}; + struct GenerationResult { // request ID uint64_t m_request_id; @@ -18,6 +24,9 @@ struct GenerationResult { std::vector m_generation_ids; // scores std::vector m_scores; + + // Status of generation + GenerationResultStatus m_status; }; class ContinuousBatchingPipeline { diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index e2919c5d2..667eeacc0 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -19,12 +19,11 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque std::vector finished_sequences = sequence_group->get_finished_sequences(); - OPENVINO_ASSERT(finished_sequences.size() == sequence_group->num_total_seqs() && sequence_group->has_finished()); + OPENVINO_ASSERT(finished_sequences.size() == 
sequence_group->num_total_seqs()); for (size_t sequence_id = 0; sequence_id < finished_sequences.size(); ++sequence_id) { Sequence::CPtr sequence = finished_sequences[sequence_id]; result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); - { static ManualTimer timer("detokenize"); timer.start(); @@ -34,6 +33,15 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque } } + if (sequence_group->has_finished()) { + result.m_status = GenerationResultStatus::FINISHED; + } + else if (sequence_group->out_of_memory()) { + result.m_status = GenerationResultStatus::IGNORED; + } + else { + result.m_status = GenerationResultStatus::ABORTED; + } return result; } @@ -74,6 +82,10 @@ class ContinuousBatchingPipeline::Impl { m_requests.erase(new_end, m_requests.end()); } + void _free_all_requests() { + m_requests.erase(m_requests.begin(), m_requests.end()); + } + public: Impl(const std::string& models_path, const SchedulerConfig& scheduler_config) { ov::Core core; @@ -152,7 +164,17 @@ class ContinuousBatchingPipeline::Impl { for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { m_requests[sequence_group_id]->set_out_of_memory(); } - return {}; + + // return partial results + std::vector pertial_results; + + for (size_t i = 0; i < m_requests.size(); ++i) { + SequenceGroup::CPtr sequence_group = m_requests[i]; + pertial_results.push_back(from_sequence_group(m_tokenizer, sequence_group)); + } + + _free_all_requests(); + return pertial_results; } ov::Tensor logits; @@ -229,14 +251,6 @@ class ContinuousBatchingPipeline::Impl { return !m_requests.empty(); } - bool out_of_memory() const { - for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { - if (m_requests[sequence_group_id]->out_of_memory()) - return true; - } - return false; - } - std::vector generate(const std::vector prompts, std::vector sampling_params) { OPENVINO_ASSERT(!has_running_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(prompts.size() == sampling_params.size()); @@ -248,14 +262,11 @@ class ContinuousBatchingPipeline::Impl { std::vector results; results.reserve(m_requests.size()); - while (has_running_requests() && !out_of_memory()) { + while (has_running_requests()) { std::vector partial_results = step(); - if (partial_results.size() > 0) - results.insert(results.end(), partial_results.begin(), partial_results.end()); + results.insert(results.end(), partial_results.begin(), partial_results.end()); } - OPENVINO_ASSERT(!out_of_memory(), "Not enough memory for processing the requests."); - // sort results according to request_id to return results in order of initial prompts std::sort(results.begin(), results.end(), [] (const GenerationResult& r1, const GenerationResult& r2) -> bool { return r1.m_request_id < r2.m_request_id; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index c49a88a5a..3bea9d37f 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -199,7 +199,7 @@ class SequenceGroup { std::vector get_finished_sequences() const { std::vector finished_seqs; for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { - if (m_sequences[seq_id]->has_finished()) { + if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory()) { finished_seqs.push_back(m_sequences[seq_id]); } } @@ -356,7 +356,7 @@ class SequenceGroup { } } - bool out_of_memory() { + bool out_of_memory() const { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->out_of_memory()) { return true; @@ -365,7 +365,7 @@ class SequenceGroup { return false; } - bool is_waiting() { + bool is_waiting() const { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { return true; From 499083c16bed6431ae3440014e69f641f380892b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:38:29 +0200 Subject: [PATCH 16/32] Accuracy sample corrected. 
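
For reference, a compact, self-contained version of the status handling this patch adds to the sample, usable by clients that link the library directly. The helper name and the way partial outputs are reported are illustrative assumptions, not part of this change; only the GenerationResult fields and the GenerationResultStatus values come from the patch above.

    #include <cstddef>
    #include <iostream>
    #include "continuous_batching_pipeline.hpp"

    void report_result(const GenerationResult& result) {
        switch (result.m_status) {
        case GenerationResultStatus::FINISHED:
            // all sequences of the request completed normally
            for (std::size_t i = 0; i < result.m_generation_ids.size(); ++i)
                std::cout << result.m_generation_ids[i] << " (score " << result.m_scores[i] << ")" << std::endl;
            break;
        case GenerationResultStatus::IGNORED:
            // the request was dropped (e.g. the KV cache could not hold it); outputs, if any, are partial
            std::cout << "Request " << result.m_request_id << " ignored, "
                      << result.m_generation_ids.size() << " partial sequence(s) returned" << std::endl;
            break;
        case GenerationResultStatus::ABORTED:
            // reserved for an explicit abort API that this series does not implement yet
            std::cout << "Request " << result.m_request_id << " aborted" << std::endl;
            break;
        }
    }
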
--- .../apps/accuracy_sample.cpp | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index e7cdaa7f3..0a7dfd3af 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -6,6 +6,12 @@ #include "continuous_batching_pipeline.hpp" +void print_sequence(const GenerationResult& generation_result) { + for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { + std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; + } +} + int main(int argc, char* argv[]) try { // Command line options @@ -80,8 +86,27 @@ int main(int argc, char* argv[]) try { const GenerationResult & generation_result = generation_results[request_id]; std::cout << "Question: " << prompts[request_id] << std::endl; - for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { - std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; + switch (generation_result.m_status) + { + case GenerationResultStatus::FINISHED: + print_sequence(generation_result); + break; + case GenerationResultStatus::IGNORED: + std::cout << "Sequence was ignored." < 0) { + std::cout << "Partial result:" << std::endl; + print_sequence(generation_result); + } + break; + case GenerationResultStatus::ABORTED: + std::cout << "Sequence was aborted." < 0) { + std::cout << "Partial result:" << std::endl; + print_sequence(generation_result); + } + break; + default: + break; } std::cout << std::endl; } From cfab5bfa0d42c9e90af261d3132dc79f3d7cf346 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:42:33 +0200 Subject: [PATCH 17/32] Minor correction. --- .../cpp/continuous_batching/apps/accuracy_sample.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp index 0a7dfd3af..9de48508e 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp @@ -6,7 +6,7 @@ #include "continuous_batching_pipeline.hpp" -void print_sequence(const GenerationResult& generation_result) { +void print_generation_result(const GenerationResult& generation_result) { for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; } @@ -89,20 +89,20 @@ int main(int argc, char* argv[]) try { switch (generation_result.m_status) { case GenerationResultStatus::FINISHED: - print_sequence(generation_result); + print_generation_result(generation_result); break; case GenerationResultStatus::IGNORED: - std::cout << "Sequence was ignored." 
< 0) { std::cout << "Partial result:" << std::endl; - print_sequence(generation_result); + print_generation_result(generation_result); } break; case GenerationResultStatus::ABORTED: - std::cout << "Sequence was aborted." < 0) { std::cout << "Partial result:" << std::endl; - print_sequence(generation_result); + print_generation_result(generation_result); } break; default: From 41fbdcf37b87e0f1eecae12225075a69837a7e22 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:55:44 +0200 Subject: [PATCH 18/32] Minor correction. --- .../continuous_batching/python/tests/test_preemption.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py index 5300296d2..078e7137a 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py @@ -20,13 +20,6 @@ def test_preemption(tmp_path, params): run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) - -@pytest.mark.precommit -def test_out_of_memory(tmp_path): - with pytest.raises(RuntimeError) as excinfo: - run_test_pipeline(tmp_path, "facebook/opt-125m", {"num_kv_blocks": 1}) - assert "Not enough memory for processing the requests." in str(excinfo.value) - multinomial_params = RandomSamplingTestStruct(generation_config=[get_multinomial_temperature(), get_multinomial_temperature_and_top_p(), get_multinomial_temperature_and_top_k()], From 7d18ebec006e29565ee5a08ad71f0324fbd16c39 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 24 May 2024 17:59:08 +0200 Subject: [PATCH 19/32] Minor correction. 
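
The test_out_of_memory case dropped in PATCH 18 relied on generate() raising a RuntimeError when the KV cache was exhausted; with the status-based reporting that behaviour is gone, and an undersized cache now surfaces as results flagged IGNORED. A sketch of the equivalent check on the C++ side; it assumes a converted model at models_path and a default-constructible GenerationConfig (as in the Python tests), and is not part of this patch series:

    #include <string>
    #include <vector>
    #include "continuous_batching_pipeline.hpp"

    bool oom_is_reported_as_ignored(const std::string& models_path) {
        SchedulerConfig scheduler_config {
            .max_num_batched_tokens = 32,
            .num_kv_blocks = 1,        // deliberately too small to hold the whole generation
            .block_size = 32,
            .dynamic_split_fuse = true,
            .max_num_seqs = 2,
        };
        ContinuousBatchingPipeline pipe(models_path, scheduler_config);
        std::vector<GenerationResult> results =
            pipe.generate({"What is OpenVINO?"}, {GenerationConfig{}});
        return !results.empty() &&
               results.front().m_status == GenerationResultStatus::IGNORED;
    }
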
--- .../library/src/continuous_batching_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 667eeacc0..54b3e9f53 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -24,6 +24,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque Sequence::CPtr sequence = finished_sequences[sequence_id]; result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); + { static ManualTimer timer("detokenize"); timer.start(); From d087295f9eb4d205460842a72090063065735616 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 24 May 2024 20:09:46 +0200 Subject: [PATCH 20/32] Fix sorting in the temperature transform --- .../cpp/continuous_batching/library/src/sampler.hpp | 7 +++++-- .../library/src/tests/logit_filtering.cpp | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index 126020e32..3bb9566ff 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -226,7 +226,8 @@ class TopPFilter: public IProbabilityFilter { nucleus_size += 1; if (probability_sum > m_top_p) break; } - return std::vector(tmp.begin(), tmp.begin() + nucleus_size); + tmp.resize(nucleus_size); + return tmp; } private: @@ -241,7 +242,8 @@ class TopKFilter: public IProbabilityFilter { std::vector tmp(input_probs); std::sort(tmp.begin(), tmp.end(), [](const ProbabilityWithIdx& lhs, const ProbabilityWithIdx& rhs) {return lhs.first > rhs.first; }); size_t top_k = input_probs.size() >= m_top_k ? 
m_top_k : input_probs.size(); - return std::vector(tmp.begin(), tmp.begin() + top_k); + tmp.resize(top_k); + return tmp; } private: @@ -256,6 +258,7 @@ class TemperatureLogitTransform { std::vector apply(const std::vector& input_logits) { std::vector output(input_logits.begin(), input_logits.end()); + std::sort(output.begin(), output.end(), [](const ProbabilityWithIdx& lhs, const ProbabilityWithIdx& rhs) {return lhs.first > rhs.first; }); float max_logit = output[0].first; std::for_each(output.begin(), output.end(), [max_logit, this](ProbabilityWithIdx& val) {val.first = expf((val.first - max_logit) / this->m_temperature);}); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp index 6eba8cfe4..7aa982553 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp @@ -29,8 +29,9 @@ TEST_P(TemperatureTransformTest, TransformResultEqualToReference) { const std::vector TEMPERATURE_TRANSFORM_TEST_CASES = { - {1.0f, { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, { {0.090031, 0}, {0.244728, 1}, {0.665241, 2} } }, - {2.0f, { {1.0f, 2}, {2.0f, 1}, {3.0f, 0} }, { {0.186323, 2}, {0.307195, 1}, {0.506480, 0} } } + {1.0f, { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, { {0.665241, 2}, {0.244728, 1}, {0.090031, 0} } }, + {2.0f, { {1.0f, 2}, {2.0f, 1}, {3.0f, 0} }, { {0.506480, 0}, {0.307195, 1}, {0.186323, 2} } }, + {1.0f, { {3.0f, 0}, {1.0f, 1}, {2.0f, 2} }, { {0.665241, 0}, {0.244728, 2}, {0.090031, 1} } }, }; INSTANTIATE_TEST_SUITE_P(VariousInputs, From d22b7557004e575685c2918dcfb24db089f591a5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 24 May 2024 23:23:23 +0200 Subject: [PATCH 21/32] Implement repetition penalty --- .../library/src/sampler.hpp | 62 +++++++++++++++---- .../library/src/sequence_group.hpp | 7 +++ .../library/src/tests/logit_filtering.cpp | 59 ++++++++++++++++++ .../python/tests/common.py | 6 ++ .../python/tests/test_sampling.py | 6 +- 5 files changed, 124 insertions(+), 16 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index 3bb9566ff..f17a803cb 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "openvino/runtime/tensor.hpp" @@ -275,6 +276,37 @@ class TemperatureLogitTransform { double m_temperature; }; +class RepetitionPenaltyTransform { +public: + RepetitionPenaltyTransform(double penalty) : m_penalty(penalty) { + OPENVINO_ASSERT(m_penalty >= 0.0f, "repetition penalty must be a positive value"); + } + + std::vector apply(const std::vector& input_logits, const std::set& unique_input_ids) { + std::vector output(input_logits.begin(), input_logits.end()); + size_t vocab_size = input_logits.size(); + for (auto input_id : unique_input_ids) { + OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); + OPENVINO_ASSERT(input_logits[input_id].second == input_id, "input_logits must have original index order"); + auto logit_value = output[input_id].first; + if (logit_value >= 0) { + output[input_id].first /= m_penalty; + } else { + output[input_id].first *= m_penalty; + }; + } + return 
output; + } + + std::vector apply(const std::vector& input_logits, const TokenIds& input_ids) { + std::set unique_input_ids(input_ids.begin(), input_ids.end()); + return this->apply(input_logits, unique_input_ids); + } +private: + double m_penalty; +}; + + class ProbabilityNormalizeTransform { public: std::vector apply(const std::vector& input_probs) { @@ -288,27 +320,25 @@ class ProbabilityNormalizeTransform { class Sampler { - int64_t _greedy_sample(ov::Tensor logits) const { + std::vector _get_logit_vector(ov::Tensor logits) { ov::Shape logits_shape = logits.get_shape(); size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; OPENVINO_ASSERT(batch_size == 1); - const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - return out_token; - } - int64_t _multinomial_sample(ov::Tensor logits, float temperature, float top_p, size_t top_k) { - ov::Shape logits_shape = logits.get_shape(); - size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; - OPENVINO_ASSERT(batch_size == 1); - - const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; std::vector logit_vector(vocab_size); for (size_t i = 0; i < logit_vector.size(); i++) { logit_vector[i] = LogitWithIdx(logits_data[i], i); } + return logit_vector; + } + + int64_t _greedy_sample(const std::vector& logit_vector) const { + int64_t out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const LogitWithIdx& lhs, const LogitWithIdx& rhs) { return lhs.first < rhs.first; }) - logit_vector.begin(); + return out_token; + } + int64_t _multinomial_sample(const std::vector& logit_vector, float temperature, float top_p, size_t top_k) { auto temperature_transform = TemperatureLogitTransform(temperature); std::vector softmax_vector = temperature_transform.apply(logit_vector); @@ -367,6 +397,12 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); + auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): do we really even need a tensor on the line above? 
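// Note on the transform applied by the added lines just below (numbers taken from the
// new unit tests in this patch rather than from an extra code path): RepetitionPenaltyTransform
// divides positive logits of already-seen token ids by the penalty and multiplies negative
// ones, so any penalty > 1 pushes repeated tokens down:
//   penalty = 1.2, seen ids {0, 2}: logits {1.0, 2.0, 3.0} -> {0.8333, 2.0, 2.5}
//   penalty = 2.0, seen ids {0, 1}: logits {-1.0, 2.0, 3.0} -> {-2.0, 1.0, 3.0}
// while a penalty < 1 boosts them instead (0.5 turns a logit of 2.0 into 4.0). Each unique
// id is penalised exactly once, no matter how often it occurs in the prompt.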
+ + if (sampling_params.repetition_penalty != 1.0f) { + auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); + logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_prompt_ids()); + } if (sequence_group->requires_sampling()) { if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { @@ -375,10 +411,10 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, int64_t sampled_token_id; if (sampling_params.is_greedy_sampling()) { - sampled_token_id = _greedy_sample(sequence_group_logits); + sampled_token_id = _greedy_sample(logit_vector); } else { // .is_multinomial() - sampled_token_id = _multinomial_sample(sequence_group_logits, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); + sampled_token_id = _multinomial_sample(logit_vector, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); } // in case of greedy search we always have a single parent sequence to sample from running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data()[sampled_token_id]); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 29b0af513..6ab10cd90 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include "generation_config.hpp" @@ -104,6 +105,7 @@ class SequenceGroup { GenerationConfig m_sampling_params; std::size_t m_block_size; TokenIds m_prompt_ids; + std::set m_unique_prompt_ids; // amount of processed tokens, e.g. 
prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed @@ -131,6 +133,7 @@ class SequenceGroup { m_prompt_ids.resize(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), m_prompt_ids.begin()); + for (auto id: m_prompt_ids) { m_unique_prompt_ids.insert(id); } } void add_sequence(const Sequence::Ptr & sequence) { @@ -283,6 +286,10 @@ class SequenceGroup { return m_prompt_ids; } + const std::set& get_unique_prompt_ids() const { + return m_unique_prompt_ids; + } + size_t get_num_logical_blocks() const { return (get_context_len() + m_block_size - 1) / m_block_size; } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp index 7aa982553..80df9afc7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp @@ -140,3 +140,62 @@ TEST(TopPFilterInitializationTest, ThrowsForInvalidProbabilities) { EXPECT_THROW(TopPFilter(-0.5), ov::Exception); EXPECT_THROW(TopPFilter(1.1), ov::Exception); } + + +struct RepetitionPenaltyTransformTestStruct { + float penalty; + std::vector input_logits; + TokenIds input_ids; + std::vector expected_output; +}; + +using RepetitionPenaltyTransformTest = testing::TestWithParam; + +TEST_P(RepetitionPenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto transform = RepetitionPenaltyTransform(test_struct.penalty); + auto test_result = transform.apply(test_struct.input_logits, test_struct.input_ids); + ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); + for (size_t i = 0; i < test_result.size(); i++) { + EXPECT_NEAR(test_result[i].first, test_struct.expected_output[i].first, 1e-6); + EXPECT_EQ(test_result[i].second, test_struct.expected_output[i].second); + } +} + + +const std::vector REPETITION_PENALTY_TRANSFORM_TEST_CASES = { + { // basic case, indices are applied, order is left as-is + 1.2f, + { {1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 2, 0 }, + { {0.8333333f, 0}, {2.0f, 1}, {2.5f, 2} } + }, + { // negative scores case + 2.0f, + { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 0, 1 }, + { {-2.0f, 0}, {1.0f, 1}, {3.0f, 2} } + }, + { // repeated tokens in prompt, check that the penalty is only applied once + 0.5f, + { {-1.0f, 0}, {2.0f, 1}, {3.0f, 2} }, + { 1, 1 }, + { {-1.0f, 0}, {4.0f, 1}, {3.0f, 2} } + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + RepetitionPenaltyTransformTest, + testing::ValuesIn(REPETITION_PENALTY_TRANSFORM_TEST_CASES)); + + +TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidPenalties) { + EXPECT_THROW(RepetitionPenaltyTransform(-0.5), ov::Exception); +} + +TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) { + auto transform = RepetitionPenaltyTransform(1.5); + EXPECT_THROW(transform.apply({ {43.0f, 0} }, std::set{1337} ), ov::Exception); + EXPECT_THROW(transform.apply({ {18.0f, 0} }, std::set{0, -1} ), ov::Exception); +} + diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index df27e6d4a..288506fc2 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -18,6 +18,12 
@@ def get_greedy() -> GenerationConfig: generation_config.num_return_sequences = 1 return generation_config +def get_greedy_with_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.repetition_penalty = 2.0 + return generation_config + def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 14a2e8295..82a5e3ec7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -84,8 +84,8 @@ def test_eos_greedy(tmp_path): print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}") compare_results(hf_result, ov_result, generation_config) -@pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search()], - ids=["greedy", "beam"]) +@pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search(), get_greedy_with_repetition_penalty()], + ids=["greedy", "beam", "greedy_with_repetition_penalty"]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): prompts = [ "What is OpenVINO?", From 78f41eadaa6bea8fe730cf940d495ae14cfce9d5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Sat, 25 May 2024 14:29:55 +0200 Subject: [PATCH 22/32] Align with HF for sampling-based algos --- .../continuous_batching/library/src/sampler.hpp | 15 +++++++++------ .../library/src/sequence_group.hpp | 12 ++++++++---- .../continuous_batching/python/tests/common.py | 4 ++-- .../python/tests/test_sampling.py | 5 +++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp index f17a803cb..2d4cc4df4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp @@ -397,15 +397,15 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); - auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): do we 
really even need a tensor on the line above? - - if (sampling_params.repetition_penalty != 1.0f) { - auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); - logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_prompt_ids()); - } if (sequence_group->requires_sampling()) { if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { + auto logit_vector = _get_logit_vector(sequence_group_logits); // TODO (vshampor): should be also applicable to beam search, but need to remove the batch size == 1 limitation + + if (sampling_params.repetition_penalty != 1.0f) { + auto repetition_penalty_transform = RepetitionPenaltyTransform(sampling_params.repetition_penalty); + logit_vector = repetition_penalty_transform.apply(logit_vector, sequence_group->get_unique_generated_ids()); + } std::vector running_sequences = sequence_group->get_running_sequences(); OPENVINO_ASSERT(running_sequences.size() == 1); @@ -416,6 +416,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, else { // .is_multinomial() sampled_token_id = _multinomial_sample(logit_vector, sampling_params.temperature, sampling_params.top_p, sampling_params.top_k); } + + sequence_group->register_generated_token_id(sampled_token_id); + // in case of greedy search we always have a single parent sequence to sample from running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data()[sampled_token_id]); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp index 6ab10cd90..3aae278ec 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp @@ -105,7 +105,7 @@ class SequenceGroup { GenerationConfig m_sampling_params; std::size_t m_block_size; TokenIds m_prompt_ids; - std::set m_unique_prompt_ids; + std::set m_unique_generated_ids; // amount of processed tokens, e.g. 
prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed @@ -133,7 +133,7 @@ class SequenceGroup { m_prompt_ids.resize(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), m_prompt_ids.begin()); - for (auto id: m_prompt_ids) { m_unique_prompt_ids.insert(id); } + for (auto id: m_prompt_ids) { m_unique_generated_ids.insert(id); } } void add_sequence(const Sequence::Ptr & sequence) { @@ -286,8 +286,12 @@ class SequenceGroup { return m_prompt_ids; } - const std::set& get_unique_prompt_ids() const { - return m_unique_prompt_ids; + const std::set& get_unique_generated_ids() const { + return m_unique_generated_ids; + } + + void register_generated_token_id(int64_t token_id) { + m_unique_generated_ids.insert(token_id); } size_t get_num_logical_blocks() const { diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 288506fc2..3c5bdfe81 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -105,13 +105,13 @@ def convert_to_hf( # copy default parameters kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id + kwargs['repetition_penalty'] = generation_config.repetition_penalty if generation_config.num_groups * generation_config.group_size > 1: # beam search case kwargs['num_beam_groups'] = generation_config.num_groups kwargs['num_beams'] = generation_config.num_groups * generation_config.group_size kwargs['diversity_penalty'] = generation_config.diversity_penalty - kwargs['repetition_penalty'] = generation_config.repetition_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences @@ -207,7 +207,7 @@ def get_model_and_tokenizer(model_id: str, use_optimum = True): AutoModelForCausalLM.from_pretrained(model_id) return model, hf_tokenizer -def _generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): +def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): use_optimum = True model_path : Path = tmp_path / model_id model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 82a5e3ec7..3690a27f5 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty +from common import run_test_pipeline, get_models_list, 
get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -84,6 +84,7 @@ def test_eos_greedy(tmp_path): print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}") compare_results(hf_result, ov_result, generation_config) +@pytest.mark.precommit @pytest.mark.parametrize("generation_config", [get_greedy(), get_beam_search(), get_greedy_with_repetition_penalty()], ids=["greedy", "beam", "greedy_with_repetition_penalty"]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): @@ -92,7 +93,7 @@ def test_individual_generation_configs_deterministic(tmp_path, generation_config ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - _generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) @dataclass From 5f595059af28e7c7acd789fc4c370d03d974b0b5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Sat, 25 May 2024 14:38:17 +0200 Subject: [PATCH 23/32] Align with HF for non-beam search cases --- .../python/tests/common.py | 7 ++++ .../python/tests/test_sampling.py | 34 +++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py index 3c5bdfe81..8c4acfa51 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py @@ -61,6 +61,13 @@ def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: generation_config.top_k = 2 return generation_config +def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.repetition_penalty = 2.0 + return generation_config + def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: prompts = [ "What is OpenVINO?", diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py index 3690a27f5..e9a97d1ae 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py @@ -3,7 +3,7 @@ import os import pytest -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, 
generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty from dataclasses import dataclass from py_continuous_batching import GenerationConfig, GenerationResult from pathlib import Path @@ -102,24 +102,28 @@ class RandomSamplingTestStruct: prompts: List[str] ref_texts: List[List[str]] -RANDOM_SAMPLING_TEST_CASES = [RandomSamplingTestStruct(generation_config=get_multinomial_temperature(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_p(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_top_p_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers"] ]), - ] +RANDOM_SAMPLING_TEST_CASES = [ + RandomSamplingTestStruct(generation_config=get_multinomial_temperature(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_p(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_top_p_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers"] ]), + RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_repetition_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ ["\nOpen Vino's are a new and improved way to find cheap, fast-investment frozen vegetables that have no waste or calories. 
They're"] ]), +] @pytest.mark.precommit @pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, - ids=["multinomial_temperature", "multinomial_temperature_and_top_p", "multinomial_temperature_and_top_k", "multinomial_temperature_top_p_and_top_k"]) + ids=["multinomial_temperature", "multinomial_temperature_and_top_p", "multinomial_temperature_and_top_k", "multinomial_temperature_top_p_and_top_k", "multinomial_temperature_and_repetition_penalty"]) def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): generation_config = test_struct.generation_config From cc9e4a069e2059573bb794b08dc9b2a44d466894 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 26 May 2024 01:22:13 +0200 Subject: [PATCH 24/32] Updated models list --- .../python/tests/models/real_models | 209 +++++++++--------- .../python/tests/requirements.txt | 4 +- 2 files changed, 107 insertions(+), 106 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index b179c6058..4fe917605 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -1,127 +1,126 @@ -# Set of models with accuracy issues, because of PA +01-ai/Yi-6B +BAAI/Aquila-7B +BAAI/AquilaChat-7B +BAAI/AquilaChat2-7B +# CohereForAI/c4ai-command-r-v01: restricted and you are not in the authorized list +EleutherAI/gpt-j-6B +EleutherAI/gpt-j-6b +EleutherAI/gpt-neo-1.3B +EleutherAI/gpt-neo-125m +EleutherAI/gpt-neo-2.7B +EleutherAI/gpt-neox-20b EleutherAI/pythia-160m -bigscience/bloomz-1b7 -bigscience/bloomz-560m -databricks/dolly-v2-3b -tiiuae/falcon-rw-7b -bigcode/starcoder2-3b -openbmb/MiniCPM-2B-sft-bf16 -openbmb/MiniCPM-2B-dpo-bf16 +GAIR/Abel-7B-002 +# OrionStarAI/Orion-14B-Base: pip install flash_attn +PygmalionAI/pygmalion-6b Qwen/Qwen-7B Qwen/Qwen-7B-Chat Qwen/Qwen1.5-0.5B +Qwen/Qwen1.5-1.8B +Qwen/Qwen1.5-7B Qwen/Qwen1.5-7B-Chat -internlm/internlm-chat-7b -BAAI/Aquila-7B -internlm/internlm2-7b -openchat/openchat_3.5 -lmsys/vicuna-7b-v1.5 -lmsys/longchat-7b-v1.5-32k -BAAI/AquilaChat2-7B -BAAI/AquilaChat-7B -baichuan-inc/Baichuan-7B -tiiuae/falcon-7b -microsoft/Phi-3-mini-128k-instruct -microsoft/Phi-3-mini-4k-instruct -nomic-ai/gpt4all-mpt -mosaicml/mpt-7b -mosaicml/mpt-7b-chat -bigcode/starcoderbase-3b -bigcode/gpt_bigcode-santacoder +Qwen/Qwen1.5-MoE-A2.7B +Qwen/Qwen1.5-MoE-A2.7B-Chat +Salesforce/codegen-350M-multi +Salesforce/codegen-350M-nl +# Salesforce/codegen2-1b: PA - 'stop' input is not a scalar +# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable +THUDM/chatglm2-6b +THUDM/chatglm3-6b +TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ +TinyLlama/TinyLlama-1.1B-Chat-v0.6 +TinyLlama/TinyLlama-1.1B-Chat-v1.0 +TitanML/tiny-mixtral +WizardLMTeam/WizardCoder-15B-V1.0 +# adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf +# allenai/OLMo-7B: pip install hf_olmo allenai/OLMo-7B-hf -PygmalionAI/pygmalion-6b -stabilityai/stable-code-3b +baichuan-inc/Baichuan-7B +baichuan-inc/Baichuan2-7B-Base +baichuan-inc/Baichuan2-7B-Chat berkeley-nest/Starling-LM-7B-alpha -EleutherAI/gpt-neo-2.7B -databricks/dolly-v1-6b -openai-community/gpt2-large -openai-community/gpt2-medium +bigcode/gpt_bigcode-santacoder +bigcode/starcoder2-3b +bigcode/starcoder2-7b 
+bigcode/starcoderbase-3b +bigscience/bloom-560m bigscience/bloom-7b1 +bigscience/bloomz-1b7 +bigscience/bloomz-560m +bigscience/bloomz-7b1 +cerebras/Cerebras-GPT-13B +# core42/jais-13b: PA - 'stop' input is not a scalar +# core42/jais-13b-chat: PA - 'stop' input is not a scalar +databricks/dolly-v1-6b +databricks/dolly-v2-3b +# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture +# facebook/blenderbot-3B: optimum - IndexError: tuple index out of range +# facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information facebook/opt-1.3b +facebook/opt-125m facebook/opt-2.7b -GAIR/Abel-7B-002 +# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) +facebook/opt-6.7b google/gemma-1.1-7b-it +google/gemma-2b google/gemma-2b-it +google/gemma-7b +# google/pegasus-big_patent: PA - 'start' input is not a scalar +# google/pegasus-large: PA - 'start' input is not a scalar +gpt2 +gpt2-xl +internlm/internlm-chat-7b +internlm/internlm2-7b +lmsys/longchat-7b-v1.5-32k +lmsys/vicuna-7b-v1.3 +lmsys/vicuna-7b-v1.5 +meta-llama/CodeLlama-7b-hf +meta-llama/Llama-2-7b-chat-hf +meta-llama/Llama-2-7b-hf +meta-llama/Meta-Llama-3-8B-Instruct microsoft/DialoGPT-large microsoft/DialoGPT-medium -Qwen/Qwen1.5-1.8B microsoft/Orca-2-7b -# Set of models, failed because of C++ Cont. Batching -# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B -# -# Set of models, which require support in optimum-intel / transformers / models repositories: -# IndexError: tuple index out of range: facebook/blenderbot-3B -# `pip install flash_attn`: OrionStarAI/Orion-14B-Base -# ValueError: Trying to export a fuyu model, that is a custom or unsupported architecture: adept/fuyu-8b -# ValueError: Trying to export a mamba model, that is a custom or unsupported architecture: state-spaces/mamba-130m-hf -# ValueError: Trying to export a xlnet model, that is a custom or unsupported architecture: xlnet/xlnet-base-cased -# -# Set of models, failed because of CPU limitation -# head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b -# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b -# head size must be multiple of 16, current: 100: openlm-research/open_llama_3b_v2 -# -# Set of failed models, because of PA: -# 'start' input is not a scalar: google/pegasus-big_patent -# 'start' input is not a scalar: google/pegasus-large -# 'stop' input is not a scalar: Salesforce/codegen2-1b -# 'stop' input is not a scalar: core42/jais-13b -# 'stop' input is not a scalar: core42/jais-13b-chat -# Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]): facebook/opt-350m -# -# Set of models, failed because of OpenVINO Tokenizers: -# https://jira.devtools.intel.com/browse/CVS-142063: rinna/bilingual-gpt-neox-4b -# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-33b-instruct -# Cannot convert tokenizer of this type without `.model` file: deepseek-ai/deepseek-coder-6.7b-instruct -# Tokenizer type is not supported: : microsoft/biogpt -# -# Set of 13B, 30B abd 70B models: -EleutherAI/gpt-neox-20b 
-mistralai/Mixtral-8x7B-v0.1 +microsoft/Phi-3-mini-128k-instruct +microsoft/Phi-3-mini-4k-instruct +# microsoft/biogpt: OpenVINO Tokenizers - openvino.runtime.exceptions.OVTypeError: Tokenizer type is not supported: +microsoft/phi-1_5 +microsoft/phi-2 +mistralai/Mistral-7B-Instruct-v0.1 +mistralai/Mistral-7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 +mistralai/Mixtral-8x7B-v0.1 +# mosaicml/mpt-1b-redpajama-200b: optimum - Trying to export a mosaic-gpt model, that is a custom or unsupported architecture mosaicml/mpt-30b -# see optimum: OrionStarAI/Orion-14B-Base -# big model, not tried: OrionStarAI/Orion-14B-Chat -CohereForAI/c4ai-command-r-v01 -openlm-research/open_llama_13b -Qwen/Qwen1.5-MoE-A2.7B -Qwen/Qwen1.5-MoE-A2.7B-Chat -xverse/XVERSE-MoE-A4.2B -cerebras/Cerebras-GPT-13B -WizardLMTeam/WizardCoder-15B-V1.0 -TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -# -# Set of passed models: -microsoft/phi-2 -microsoft/phi-1_5 -EleutherAI/gpt-neo-125m -EleutherAI/gpt-neo-1.3B -EleutherAI/gpt-j-6b -baichuan-inc/Baichuan2-7B-Chat -THUDM/chatglm2-6b -THUDM/chatglm3-6b -google/gemma-2b -google/gemma-7b +mosaicml/mpt-7b +mosaicml/mpt-7b-chat +nomic-ai/gpt4all-j +nomic-ai/gpt4all-mpt openai-community/gpt2 +openai-community/gpt2-large +openai-community/gpt2-medium openai-community/gpt2-xl -gpt2 -gpt2-xl -nomic-ai/gpt4all-j -stabilityai/stablelm-3b-4e1t +openbmb/MiniCPM-2B-dpo-bf16 +openbmb/MiniCPM-2B-sft-bf16 +# openbmb/MiniCPM-V-2: optimum - Trying to export a minicpmv model, that is a custom or unsupported architecture +openchat/openchat_3.5 +openlm-research/open_llama_13b +# openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 +# openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 +# replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' +# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output +rinna/youri-7b-chat +stabilityai/stable-code-3b +stabilityai/stable-zephyr-3b stabilityai/stablelm-2-zephyr-1_6b -meta-llama/Llama-2-7b-hf -meta-llama/Meta-Llama-3-8B-Instruct -meta-llama/CodeLlama-7b-hf -lmsys/vicuna-7b-v1.3 -mistralai/Mistral-7B-v0.1 -mistralai/Mistral-7B-Instruct-v0.1 -01-ai/Yi-6B -Salesforce/codegen-350M-multi -Salesforce/codegen-350M-nl +stabilityai/stablelm-3b-4e1t +# state-spaces/mamba-130m-hf: optimum - Trying to export a mamba model, that is a custom or unsupported architecture +tiiuae/falcon-7b +tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 -# passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov -# -# Set of invalid models, because of HF: -# HF: Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3: xverse/XVERSE-7B-Chat -# https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32: Salesforce/xgen-7b-8k-base \ No newline at end of file +# xlnet/xlnet-base-cased: optimum - Trying to export a xlnet model, that is a custom or unsupported architecture +# xverse/XVERSE-7B-Chat: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 +# xverse/XVERSE-MoE-A4.2B: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 568b6886b..0c803412e 
100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -36,4 +36,6 @@ tiktoken # - microsoft/biogpt sacremoses # - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -auto-gptq \ No newline at end of file +auto-gptq +# - allenai/OLMo-7B +hf_olmo \ No newline at end of file From 66006de68270a643649258ae8baabf4deca78952 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 26 May 2024 17:28:28 +0200 Subject: [PATCH 25/32] Small updates --- .../python/tests/models/real_models | 10 ++++------ .../continuous_batching/python/tests/requirements.txt | 4 +--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 4fe917605..dbb50dade 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -2,7 +2,7 @@ BAAI/Aquila-7B BAAI/AquilaChat-7B BAAI/AquilaChat2-7B -# CohereForAI/c4ai-command-r-v01: restricted and you are not in the authorized list +CohereForAI/c4ai-command-r-v01 EleutherAI/gpt-j-6B EleutherAI/gpt-j-6b EleutherAI/gpt-neo-1.3B @@ -11,7 +11,7 @@ EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neox-20b EleutherAI/pythia-160m GAIR/Abel-7B-002 -# OrionStarAI/Orion-14B-Base: pip install flash_attn +# OrionStarAI/Orion-14B-Base: pip install flash_attn (https://github.com/huggingface/transformers/pull/30954) PygmalionAI/pygmalion-6b Qwen/Qwen-7B Qwen/Qwen-7B-Chat @@ -24,7 +24,7 @@ Qwen/Qwen1.5-MoE-A2.7B-Chat Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl # Salesforce/codegen2-1b: PA - 'stop' input is not a scalar -# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable +# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32) THUDM/chatglm2-6b THUDM/chatglm3-6b TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ @@ -34,7 +34,6 @@ TitanML/tiny-mixtral WizardLMTeam/WizardCoder-15B-V1.0 # adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf -# allenai/OLMo-7B: pip install hf_olmo allenai/OLMo-7B-hf baichuan-inc/Baichuan-7B baichuan-inc/Baichuan2-7B-Base @@ -105,13 +104,12 @@ openai-community/gpt2-medium openai-community/gpt2-xl openbmb/MiniCPM-2B-dpo-bf16 openbmb/MiniCPM-2B-sft-bf16 -# openbmb/MiniCPM-V-2: optimum - Trying to export a minicpmv model, that is a custom or unsupported architecture openchat/openchat_3.5 openlm-research/open_llama_13b # openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 # openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 # replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' -# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output +# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output (https://jira.devtools.intel.com/browse/CVS-142063) rinna/youri-7b-chat stabilityai/stable-code-3b stabilityai/stable-zephyr-3b diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt index 
0c803412e..568b6886b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt @@ -36,6 +36,4 @@ tiktoken # - microsoft/biogpt sacremoses # - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -auto-gptq -# - allenai/OLMo-7B -hf_olmo \ No newline at end of file +auto-gptq \ No newline at end of file From 8d4d5a65e48bc7c89e01ea1ca6969ace1115cdaa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 27 May 2024 10:55:13 +0200 Subject: [PATCH 26/32] Updated list --- .../python/tests/models/real_models | 20 ++++++++----------- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index dbb50dade..28a1cb6dd 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -23,7 +23,7 @@ Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl -# Salesforce/codegen2-1b: PA - 'stop' input is not a scalar +Salesforce/codegen2-1b # Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32) THUDM/chatglm2-6b THUDM/chatglm3-6b @@ -32,7 +32,6 @@ TinyLlama/TinyLlama-1.1B-Chat-v0.6 TinyLlama/TinyLlama-1.1B-Chat-v1.0 TitanML/tiny-mixtral WizardLMTeam/WizardCoder-15B-V1.0 -# adept/fuyu-8b: optimum - Trying to export a fuyu model, that is a custom or unsupported architecture allenai/OLMo-1B-hf allenai/OLMo-7B-hf baichuan-inc/Baichuan-7B @@ -49,26 +48,26 @@ bigscience/bloomz-1b7 bigscience/bloomz-560m bigscience/bloomz-7b1 cerebras/Cerebras-GPT-13B -# core42/jais-13b: PA - 'stop' input is not a scalar -# core42/jais-13b-chat: PA - 'stop' input is not a scalar +# core42/jais-13b: optimum - no SDPA +# core42/jais-13b-chat: optimum - no SDPA databricks/dolly-v1-6b databricks/dolly-v2-3b -# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file -# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file # deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture # facebook/blenderbot-3B: optimum - IndexError: tuple index out of range # facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information facebook/opt-1.3b facebook/opt-125m facebook/opt-2.7b -# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) +facebook/opt-350m facebook/opt-6.7b google/gemma-1.1-7b-it google/gemma-2b google/gemma-2b-it google/gemma-7b -# google/pegasus-big_patent: PA - 'start' input is not a scalar -# google/pegasus-large: PA - 'start' input is not a scalar +google/pegasus-big_patent +google/pegasus-large gpt2 gpt2-xl internlm/internlm-chat-7b @@ -92,7 +91,6 @@ 
mistralai/Mistral-7B-Instruct-v0.1 mistralai/Mistral-7B-v0.1 mistralai/Mixtral-8x7B-Instruct-v0.1 mistralai/Mixtral-8x7B-v0.1 -# mosaicml/mpt-1b-redpajama-200b: optimum - Trying to export a mosaic-gpt model, that is a custom or unsupported architecture mosaicml/mpt-30b mosaicml/mpt-7b mosaicml/mpt-7b-chat @@ -115,10 +113,8 @@ stabilityai/stable-code-3b stabilityai/stable-zephyr-3b stabilityai/stablelm-2-zephyr-1_6b stabilityai/stablelm-3b-4e1t -# state-spaces/mamba-130m-hf: optimum - Trying to export a mamba model, that is a custom or unsupported architecture tiiuae/falcon-7b tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 -# xlnet/xlnet-base-cased: optimum - Trying to export a xlnet model, that is a custom or unsupported architecture # xverse/XVERSE-7B-Chat: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 # xverse/XVERSE-MoE-A4.2B: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 \ No newline at end of file diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 200cffc10..0b406fd60 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 200cffc10e3479b00006b613dc3c9fa48301177d +Subproject commit 0b406fd6080f930a0d4a7c068dae7372046daa9d From bf7b8bcb6744649aa473b713f9454aff9b89a704 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 27 May 2024 10:56:54 +0200 Subject: [PATCH 27/32] No PA for 350m --- .../cpp/continuous_batching/python/tests/models/real_models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models index 28a1cb6dd..94defc857 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models +++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models @@ -60,7 +60,7 @@ databricks/dolly-v2-3b facebook/opt-1.3b facebook/opt-125m facebook/opt-2.7b -facebook/opt-350m +# facebook/opt-350m: PA - Model references undeclared parameters: opset1::Parameter attention_mask () -> (i64[?,?]) facebook/opt-6.7b google/gemma-1.1-7b-it google/gemma-2b From e8f9f973cc8ca2549a680832493dd51ac796dd42 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 27 May 2024 13:40:47 +0200 Subject: [PATCH 28/32] Applied comments. 
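
This patch moves out-of-memory detection into Scheduler::schedule(): when a pass schedules zero tokens, the scheduler flags every running group, the pipeline returns them as IGNORED results carrying whatever text was already produced, and _free_non_running_requests() drops both finished and out-of-memory groups. A sketch of how a caller can split the returned vector under that contract; the helper name is an assumption for illustration:

    #include <utility>
    #include <vector>
    #include "continuous_batching_pipeline.hpp"

    // first: fully finished requests, second: requests dropped because the KV cache could not hold them
    std::pair<std::vector<GenerationResult>, std::vector<GenerationResult>>
    split_by_status(std::vector<GenerationResult> results) {
        std::pair<std::vector<GenerationResult>, std::vector<GenerationResult>> out;
        for (auto& result : results) {
            if (result.m_status == GenerationResultStatus::FINISHED)
                out.first.push_back(std::move(result));
            else
                out.second.push_back(std::move(result));  // IGNORED (ABORTED is still unused)
        }
        return out;
    }
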
--- .../src/continuous_batching_pipeline.cpp | 17 +++++------------ .../library/src/scheduler.hpp | 7 +++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 54b3e9f53..1817edbc6 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -26,7 +26,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); { - static ManualTimer timer("detokenize"); + static ManualTimer timer("detokenize"); timer.start(); std::string output_text = tokenizer->decode(sequence->get_generated_ids()); timer.end(); @@ -76,17 +76,13 @@ class ContinuousBatchingPipeline::Impl { // current requests to process std::vector m_requests; - void _free_finished_requests() { + void _free_non_running_requests() { auto new_end = std::remove_if(m_requests.begin(), m_requests.end(), [] (SequenceGroup::CPtr seq_group) -> bool { - return seq_group->has_finished(); + return seq_group->has_finished() || seq_group->out_of_memory(); }); m_requests.erase(new_end, m_requests.end()); } - void _free_all_requests() { - m_requests.erase(m_requests.begin(), m_requests.end()); - } - public: Impl(const std::string& models_path, const SchedulerConfig& scheduler_config) { ov::Core core; @@ -162,9 +158,6 @@ class ContinuousBatchingPipeline::Impl { // if no tokens were scheduled, we are out of memory if (scheduler_output.m_total_num_scheduled_tokens == 0) { - for (size_t sequence_group_id = 0; sequence_group_id < m_requests.size(); ++sequence_group_id) { - m_requests[sequence_group_id]->set_out_of_memory(); - } // return partial results std::vector pertial_results; @@ -174,7 +167,7 @@ class ContinuousBatchingPipeline::Impl { pertial_results.push_back(from_sequence_group(m_tokenizer, sequence_group)); } - _free_all_requests(); + _free_non_running_requests(); return pertial_results; } @@ -239,7 +232,7 @@ class ContinuousBatchingPipeline::Impl { } } - _free_finished_requests(); + _free_non_running_requests(); timer.end(); } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp index 8ae3cb721..fd7ff7185 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp @@ -57,6 +57,13 @@ class Scheduler { _clear_waiting_sequences(sequence_groups); + + // if no tokens were scheduled, we are out of memory + if (scheduler_output.m_total_num_scheduled_tokens == 0) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + sequence_groups[sequence_group_id]->set_out_of_memory(); + } + } return scheduler_output; } From 0965791dff32cd1081c9d36d390c322fd9dcac7a Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 27 May 2024 13:42:26 +0200 Subject: [PATCH 29/32] Minor correction. 
--- .../library/src/continuous_batching_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index 1817edbc6..f6b224197 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -26,7 +26,7 @@ GenerationResult from_sequence_group(std::shared_ptr tokenizer, Seque result.m_scores.push_back(sequence->get_beam_search_score(sequence_group->get_sampling_parameters())); { - static ManualTimer timer("detokenize"); + static ManualTimer timer("detokenize"); timer.start(); std::string output_text = tokenizer->decode(sequence->get_generated_ids()); timer.end(); From 302e638415a4fdf6190211eab2e0907ffaf63137 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 10:52:07 +0200 Subject: [PATCH 30/32] Added cache_size field in SchedulerConfig. --- .../library/CMakeLists.txt | 2 +- .../library/include/scheduler_config.hpp | 6 +++-- .../library/src/cache_manager.hpp | 9 ++++++- .../src/continuous_batching_pipeline.cpp | 10 ++++++-- .../library/src/device_config.hpp | 24 +++++++++++++++++-- 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt b/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt index 7f2f73dcf..129f770cc 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt @@ -67,7 +67,7 @@ FetchContent_MakeAvailable(googletest) set(TEST_TARGET_NAME "tests_continuous_batching") -add_executable(${TEST_TARGET_NAME} "src/tests/scheduler.cpp" "src/tests/block_manager.cpp" "src/tests/logit_filtering.cpp") +add_executable(${TEST_TARGET_NAME} "src/tests/scheduler.cpp" "src/tests/block_manager.cpp" "src/tests/logit_filtering.cpp" "src/tests/cache_manager.cpp") target_link_libraries(${TEST_TARGET_NAME} PUBLIC ${TARGET_NAME} openvino::runtime gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp index 5bdf163e7..ac7739cb4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp @@ -11,9 +11,11 @@ struct SchedulerConfig { // TODO: benchmark this value and understand a required value to ensure inference is not memory bound std::size_t max_num_batched_tokens = 16; - // TODO: specify size in GBs instead of number of KV blocks // total number of KV blocks available to scheduler logic - std::size_t num_kv_blocks = 500; + std::size_t num_kv_blocks = 0; + + // total size of KV cache in GB + std::size_t cache_size = 0; // block size for KV cache std::size_t block_size = 32; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp index 11e4dbb38..aa465421c 100644 --- 
a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp @@ -13,21 +13,24 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; + size_t m_allocated_bytes; public: explicit CacheManager(const DeviceConfig& device_config) : m_device_config(device_config) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); + m_allocated_bytes = 0; // Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); - + // force allocation std::memset(key_cache.data(), 0, key_cache.get_byte_size()); std::memset(value_cache.data(), 0, value_cache.get_byte_size()); + m_allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); @@ -81,4 +84,8 @@ class CacheManager { } } } + + size_t get_total_allocated_bytes() const { + return m_allocated_bytes; + } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp index f6b224197..68bf676ff 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp @@ -104,9 +104,15 @@ class ContinuousBatchingPipeline::Impl { infer_request.set_input_tensor(2 + decoder_layer_id * 2 + 1, m_cache_manager->get_value_cache(decoder_layer_id)); } - m_scheduler = std::make_shared(scheduler_config); + SchedulerConfig updated_config = scheduler_config; + // update KV number in scheduler config + if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) { + updated_config.num_kv_blocks = device_config.get_num_kv_blocks(); + } + + m_scheduler = std::make_shared(updated_config); // and finally create model runner - m_model_runner = std::make_shared(infer_request, scheduler_config); + m_model_runner = std::make_shared(infer_request, updated_config); m_sampler = std::make_shared(); m_sampler->set_seed(m_generation_config.rng_seed); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp index ac92c275f..240be4d9e 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp @@ -13,7 +13,9 @@ class DeviceConfig { ov::element::Type m_kv_cache_type; ov::Shape m_key_cache_shape, m_value_cache_shape; ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; - size_t m_num_kv_blocks, m_block_size; + size_t m_num_kv_blocks = 0; + size_t m_block_size = 0; + size_t m_cache_size = 0; std::string m_device; public: @@ -21,7 +23,6 @@ class DeviceConfig { m_device = device; // keep information about blocsk - m_num_kv_blocks = scheduling_config.num_kv_blocks; m_block_size = scheduling_config.block_size; if (m_device == "CPU") { @@ -32,6 +33,15 @@ class DeviceConfig { } else { 
OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } + + OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be specified."); + if (scheduling_config.num_kv_blocks > 0) { + m_num_kv_blocks = scheduling_config.num_kv_blocks; + } + else { + m_cache_size = scheduling_config.cache_size; + + } } void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) { @@ -39,6 +49,12 @@ class DeviceConfig { m_head_size = head_size; m_num_decoder_layers = num_decoder_layers; + if (m_num_kv_blocks == 0) { + OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be specified."); + size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; + m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); + } + m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks, m_num_kv_heads, m_block_size, @@ -66,4 +82,8 @@ class DeviceConfig { OPENVINO_ASSERT(!m_value_cache_shape.empty()); return m_value_cache_shape; } + + size_t get_num_kv_blocks() const { + return m_num_kv_blocks; + } }; From bb7eea0b4fa079ac8c285a8874dfceee93937d67 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 11:02:42 +0200 Subject: [PATCH 31/32] Removed not needed code. --- .../library/src/cache_manager.hpp | 9 +---- .../library/src/tests/cache_manager.cpp | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp index aa465421c..11e4dbb38 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp @@ -13,24 +13,21 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; - size_t m_allocated_bytes; public: explicit CacheManager(const DeviceConfig& device_config) : m_device_config(device_config) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); - m_allocated_bytes = 0; // Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); - + // force allocation std::memset(key_cache.data(), 0, key_cache.get_byte_size()); std::memset(value_cache.data(), 0, value_cache.get_byte_size()); - m_allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); @@ -84,8 +81,4 @@ class CacheManager { } } } - - size_t get_total_allocated_bytes() const { - return m_allocated_bytes; - } }; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp new file mode 100644 index 000000000..2fa479093 --- /dev/null +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp @@ -0,0 +1,36 @@ +// Copyright 
(C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include "device_config.hpp" +#include "cache_manager.hpp" + +TEST(TestCacheManager, general_test) { + ov::Core core; + SchedulerConfig scheduler_config = { + .max_num_batched_tokens = 32, + .num_kv_blocks = 0, + .cache_size = 2, + .block_size = 32, + .max_num_seqs = 2, + }; + + const std::string device = "CPU"; + DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + device_config.set_model_params(12, 64, num_decoder_layers); + + auto cache_manager = std::make_shared(device_config); + + size_t allocated_bytes = 0; + for (size_t i = 0; i < num_decoder_layers; i++) { + auto key_cache = cache_manager->get_key_cache(i); + auto value_cache = cache_manager->get_value_cache(i); + allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + } + + ASSERT_EQ(allocated_bytes, 2146959360); +} From 1703dbd6929a45e0a3d399e3c649b0652b5b2a64 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 May 2024 11:10:35 +0200 Subject: [PATCH 32/32] Minor correction. --- .../cpp/continuous_batching/library/src/device_config.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp index 240be4d9e..010d9b2ba 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp +++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp @@ -34,7 +34,7 @@ class DeviceConfig { OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } - OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be specified."); + OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); if (scheduling_config.num_kv_blocks > 0) { m_num_kv_blocks = scheduling_config.num_kv_blocks; } @@ -50,7 +50,7 @@ class DeviceConfig { m_num_decoder_layers = num_decoder_layers; if (m_num_kv_blocks == 0) { - OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be specified."); + OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); }
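
For reference, a quick way to sanity-check the expected value in the new cache_manager test: with the block-count formula PATCH 30 adds to device_config.hpp, cache_size = 2 GB, 12 decoder layers, and the model parameters the test passes to set_model_params (12 KV heads, head size 64, block size 32), plus an assumed 4-byte (f32) KV-cache element type on CPU (the precision itself is not visible in these hunks, but it is the only element size consistent with the asserted value), the total allocation works out to the asserted 2146959360 bytes. A standalone sketch under those assumptions:

#include <cstddef>
#include <cstdio>

int main() {
    // Scheduler / model parameters taken from the cache_manager test above.
    const std::size_t cache_size_gb = 2, num_decoder_layers = 12, num_kv_heads = 12;
    const std::size_t head_size = 64, block_size = 32;
    const std::size_t elem_size = 4;  // assumed f32 KV-cache precision on CPU

    // Bytes consumed by one KV block across all layers (key + value caches).
    const std::size_t bytes_per_block =
        num_decoder_layers * 2 * num_kv_heads * block_size * head_size * elem_size;  // 2359296

    // Same computation as DeviceConfig::set_model_params in PATCH 30.
    const std::size_t size_in_bytes = cache_size_gb * 1024 * 1024 * 1024;            // 2147483648
    const std::size_t num_kv_blocks = size_in_bytes / bytes_per_block;               // 910
    const std::size_t allocated     = num_kv_blocks * bytes_per_block;               // 2146959360

    std::printf("blocks: %zu, allocated bytes: %zu\n", num_kv_blocks, allocated);
}

The integer division rounds the block count down to 910, so the cache manager allocates slightly less than the requested 2 GB, which is exactly what ASSERT_EQ(allocated_bytes, 2146959360) encodes.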