Fix chat template
Add test for chat scenario
olpipi committed Jun 10, 2024
1 parent 1ee4f38 commit 85a5d95
Showing 3 changed files with 107 additions and 43 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/causal_lm_cpp.yml
@@ -487,3 +487,66 @@ jobs:
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Alan Turing was a" passed
cpp-chat_sample-ubuntu:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install OpenVINO
run: |
mkdir ./ov/
curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Download, convert and build
run: |
source ./ov/setupvars.sh
python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Compare
run: |
source ./ov/setupvars.sh
printf "What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\nStop.\n" > ./input.txt
timeout 30s cat input.txt | ./build/text_generation/causal_lm/cpp/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ > ./pred.txt
cat ./pred.txt
python -c "
from transformers import LlamaTokenizer, LlamaForCausalLM
with open('pred.txt', 'r') as file:
predictions = file.read()
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id)
prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
def gen_prompt(prompt):
return {'role': 'user', 'content': prompt}
def gen_answer(answer):
return {'role': 'assistant', 'content': answer}
chat_history = []
chat_prompt = ''
for prompt in prompts:
print('prompt:', prompt)
chat_history.append(gen_prompt(prompt))
chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
print('prompt after chat template:', chat_prompt)
tokenized = tokenizer(chat_prompt, return_tensors='pt')
print('tokens:', tokenized['input_ids'])
answer = model.generate(**tokenized, max_length=1000, do_sample=False)
answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
chat_history.append(gen_answer(answer_str))
print('answer:', answer_str)
idx = predictions.find(answer_str)
if -1 == idx:
raise RuntimeError(f'Missing "{answer_str=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(answer_str):]
"
echo "Chat sample?" passed
4 changes: 2 additions & 2 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -17,9 +17,9 @@ int main(int argc, char* argv[]) try {
pipe.start_chat();
for (;;) {
std::cout << "question:\n";

std::getline(std::cin, prompt);
if (prompt == "Stop!")
if (prompt == "Stop.")
break;

pipe.generate(prompt, config, streamer);
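With this change the sample stops on "Stop.", which matches the last line of the input.txt written by the CI job above. A minimal way to try it interactively, assuming the same build layout and exported model directory as in that job (paths may differ locally):

source ./ov/setupvars.sh
./build/text_generation/causal_lm/cpp/chat_sample ./TinyLlama-1.1B-Chat-v1.0/
# type a question at each "question:" prompt; enter "Stop." to end the chat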
83 changes: 42 additions & 41 deletions src/cpp/src/llm_pipeline.cpp
@@ -99,6 +99,15 @@ class LLMPipeline::LLMPipelineImpl {
std::string m_chat_template = "";
bool is_chat_conversation = false;
bool m_is_cache_empty = true;
std::string m_templated_chat_history = "";

enum class chat_role { user, assistant, system };
using ChatHistory = std::vector<std::pair<std::string, chat_role>>;
ChatHistory m_chat_history;

std::map<chat_role, std::string> chat_role_str = {{ chat_role::user, "user" },
{ chat_role::assistant, "assistant" },
{ chat_role::system, "system" }};

LLMPipelineImpl(
const ov::InferRequest& request,
@@ -136,22 +145,27 @@ class LLMPipeline::LLMPipelineImpl {

std::string text = *input_str;
// todo: make for batched inputs as well
if (is_chat_conversation)
text = apply_chat_template(text);
if (is_chat_conversation) {
m_chat_history.push_back(std::make_pair(text, chat_role::user));
auto new_templated_chat_history = apply_chat_template(m_chat_history);
text = new_templated_chat_history.substr(m_templated_chat_history.size());
std::cout << "templayed prompt: " << text << "\n";
m_templated_chat_history = new_templated_chat_history;
}

// previous prompt generation in chat dialog stops with the end of sentence token,
// need to append this token to the current prompt
if (is_chat_conversation && !m_is_cache_empty)
text = m_tokenizer.get_eos_token() + text;
// if (is_chat_conversation && !m_is_cache_empty)
// text = m_tokenizer.get_eos_token() + text;

auto res = m_tokenizer.encode(text);
auto input_ids = res.input_ids;
auto attention_mask = res.attention_mask;

// todo: W/A If sentence begins with special tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
// todo: W/A If a sentence begins with special tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
// but HF does not do that. Moreover, openvino_tokenizer always inserts <bos>, but in a chat scenario HF does not because skip_special_tokens=True.
// Need to remove both of those tokens manually to get exact token-by-token alignment with HF
auto size = input_ids.get_shape();
// auto size = input_ids.get_shape();
int64_t* inputs_data = input_ids.data<int64_t>();
std::vector<int64_t> tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1

@@ -169,12 +183,17 @@ class LLMPipeline::LLMPipelineImpl {
attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()});
std::copy(tmp_ids.begin(), tmp_ids.end(), input_ids.data<int64_t>());
std::copy(tmp_attn_mask.begin(), tmp_attn_mask.end(), attention_mask.data<int64_t>());

encoded_input = TokenizedInputs{input_ids, attention_mask};
}

auto encoded_results = generate(encoded_input, config, streamer);
return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
auto answer = m_tokenizer.decode(encoded_results.tokens);
if (is_chat_conversation) {
m_templated_chat_history.append(answer[0]);
m_chat_history.push_back(std::make_pair(answer[0], chat_role::assistant));
}
return {answer, encoded_results.scores};
}

EncodedResults generate(
@@ -242,7 +261,7 @@ class LLMPipeline::LLMPipelineImpl {
return result;
}

std::string apply_chat_template(const std::vector<std::pair<std::string, std::string>>& prompts) const {
std::string apply_chat_template(const ChatHistory& prompts) const {
jinja2::TemplateEnv env;
env.GetSettings().lstripBlocks = true;
env.GetSettings().trimBlocks = true;
@@ -251,7 +270,7 @@

jinja2::ValuesList messages;
for (const auto& [prompt, role] : prompts) {
messages.push_back(jinja2::ValuesMap{{"role", role}, {"content", prompt}});
messages.push_back(jinja2::ValuesMap{{"role", chat_role_str.at(role)}, {"content", prompt}});
}

jinja2::ValuesMap params = {
@@ -263,32 +282,6 @@

return tpl.RenderAsString(params).value();
}

std::string apply_chat_template(std::string prompt, std::string role = "user") const {
jinja2::TemplateEnv env;
env.GetSettings().lstripBlocks = true;
env.GetSettings().trimBlocks = true;
jinja2::Template tpl(&env);
tpl.Load(m_chat_template);

jinja2::ValuesMap message {{"role", role}, {"content", prompt}};
jinja2::ValuesMap params = {
{"messages", jinja2::ValuesList({message})},
{"bos_token", m_tokenizer.get_bos_token()},
{"eos_token", m_tokenizer.get_eos_token()},
{"add_generation_prompt", true},
};

return tpl.RenderAsString(params).value();
}

std::vector<std::string> apply_chat_template(std::vector<std::string>& prompts, std::string role = "user") const {
std::vector<std::string> res;
for (const auto& prompt: prompts) {
res.emplace_back(apply_chat_template(prompt));
}
return res;
}
};

DecodedResults LLMPipeline::generate(
@@ -394,6 +387,19 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(
// If eos_token_id was not provided, take the value from the tokenizer
if (m_generation_config.eos_token_id == -1)
m_generation_config.eos_token_id = m_tokenizer.get_eos_token_id();
// update chat template
if (m_chat_template.size()) {
std::map<std::string, std::string> replace_str_map = {{"\n'}", "\n' }"},
{".strip()", "\"\""}};
for (auto [from, to] : replace_str_map) {
size_t pos = 0;
while ( ( pos = m_chat_template.find( from, pos ) ) != std::string::npos )
{
m_chat_template.replace( pos, from.size(), to );
pos += to.size();
}
}
}
}

ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
@@ -404,11 +410,6 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
return m_pimpl->m_tokenizer;
}

std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const {
return m_pimpl->apply_chat_template(prompt, role);
}


void ov::genai::LLMPipeline::start_chat() {
m_pimpl->is_chat_conversation = true;
if (!m_pimpl->m_is_cache_empty) {
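For reference, a minimal Python sketch (not part of the commit) of the incremental templating scheme that generate() now follows in chat mode: the full history is re-templated on every user turn, and only the suffix beyond the previously templated text is passed on, the earlier turns already being covered by the pipeline's existing state. The model name is simply the one used in the CI job above.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')

history = []      # accumulated user/assistant turns, like m_chat_history
templated = ''    # everything templated so far, like m_templated_chat_history

def user_turn(text):
    # Template the whole history, then keep only the newly added suffix.
    global templated
    history.append({'role': 'user', 'content': text})
    full = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    delta = full[len(templated):]
    templated = full
    return delta

def assistant_turn(text):
    # Record the model's answer so the next delta starts after it.
    global templated
    history.append({'role': 'assistant', 'content': text})
    templated += text

print(user_turn('What is 2 + 2?'))   # first turn: the fully templated prompt
assistant_turn('2 + 2 equals 4.')
print(user_turn('Add 1 to it.'))     # later turns: only the newly templated part, not the whole history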
