
Commit

Apply comments
olpipi committed Jul 3, 2024
1 parent 751229b commit dd2ed1d
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/cpp/src/llm_pipeline.cpp
@@ -116,6 +116,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
std::string& prompt = *input_prompt;

if (is_chat_conversation) {
// The KV cache in the model already contains the prompts and answers from previous iterations,
// so only the new prompt, wrapped into the chat template, needs to be sent to the model. The tokenizer always returns
// token_ids = {<bos token>, ...<valuable tokens>}, so if the tokenizer were applied only to the new prompt,
// a <bos token> would be inserted on every iteration.
// Instead, the pipeline computes input_ids for the whole chat history and for the whole chat history without the new prompt,
// and takes only the difference between them.
// The chat history cannot be stored as already encoded tokens because the generate call doesn't return the <eos> token,
// while the KV cache contains it. So we would have to add it manually or obtain it by tokenizing the whole chat history.

m_history.push_back({{"role", "user"}, {"content", prompt}});
constexpr bool add_generation_prompt = true;
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
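For illustration, below is a minimal standalone sketch of the difference-based tokenization described in the comment above: tokenize the whole templated history and the history without the new prompt, then keep only the trailing difference. The tokenize helper here is a toy stand-in, not the pipeline's real ov::genai::Tokenizer API; names and signatures are assumptions for this sketch only.

#include <cstdint>
#include <string>
#include <vector>

// Toy stand-in for a real tokenizer (one token per character); assumed for this sketch only.
std::vector<int64_t> tokenize(const std::string& text) {
    return std::vector<int64_t>(text.begin(), text.end());
}

// Tokenize the whole templated chat history and the history without the new prompt,
// then keep only the trailing difference. This yields the token ids of the new turn
// without inserting an extra <bos> token on every iteration.
std::vector<int64_t> new_turn_token_ids(const std::string& history_without_new_prompt,
                                        const std::string& full_history) {
    std::vector<int64_t> prev_ids = tokenize(history_without_new_prompt);
    std::vector<int64_t> full_ids = tokenize(full_history);
    // The previous history is a prefix of the full history, so the new turn's
    // tokens are the suffix that follows it.
    return std::vector<int64_t>(full_ids.begin() + prev_ids.size(), full_ids.end());
}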
