Fix chat template
Add test for chat scenario
olpipi committed Jun 10, 2024
1 parent 1ee4f38 commit 85a5d95
Showing 3 changed files with 107 additions and 43 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/causal_lm_cpp.yml
@@ -487,3 +487,66 @@ jobs:
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Alan Turing was a" passed
cpp-chat_sample-ubuntu:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install OpenVINO
run: |
mkdir ./ov/
curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Download, convert and build
run: |
source ./ov/setupvars.sh
python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Compare
run: |
source ./ov/setupvars.sh
printf "What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\nStop.\n" > ./input.txt
timeout 30s cat input.txt | ./build/text_generation/causal_lm/cpp/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ > ./pred.txt
cat ./pred.txt
python -c "
from transformers import LlamaTokenizer, LlamaForCausalLM
with open('pred.txt', 'r') as file:
predictions = file.read()
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id)
prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
def gen_prompt(prompt):
return {'role': 'user', 'content': prompt}
def gen_answer(answer):
return {'role': 'assistant', 'content': answer}
chat_history = []
chat_prompt = ''
for prompt in prompts:
print('prompt:', prompt)
chat_history.append(gen_prompt(prompt))
chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
print('prompt after chat template:', chat_prompt)
tokenized = tokenizer(chat_prompt, return_tensors='pt')
print('tokens:', tokenized['input_ids'])
answer = model.generate(**tokenized, max_length=1000, do_sample=False)
answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
chat_history.append(gen_answer(answer_str))
print('answer:', answer_str)
idx = predictions.find(answer_str)
if -1 == idx:
raise RuntimeError(f'Missing "{answer_str=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(answer_str):]
"
echo "Chat sample?" passed
4 changes: 2 additions & 2 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -17,9 +17,9 @@ int main(int argc, char* argv[]) try {
pipe.start_chat();
for (;;) {
std::cout << "question:\n";

std::getline(std::cin, prompt);
if (prompt == "Stop!")
if (prompt == "Stop.")
break;

pipe.generate(prompt, config, streamer);
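With this change the sample stops on "Stop.", which matches the last line of the input.txt written by the CI job above. A minimal way to try it interactively, assuming the same build layout and exported model directory as in that job (paths may differ locally):

source ./ov/setupvars.sh
./build/text_generation/causal_lm/cpp/chat_sample ./TinyLlama-1.1B-Chat-v1.0/
# type a question at each "question:" prompt; enter "Stop." to end the chat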
83 changes: 42 additions & 41 deletions src/cpp/src/llm_pipeline.cpp
@@ -99,6 +99,15 @@ class LLMPipeline::LLMPipelineImpl {
std::string m_chat_template = "";
bool is_chat_conversation = false;
bool m_is_cache_empty = true;
std::string m_templated_chat_history = "";

enum class chat_role { user, assistant, system };
using ChatHistory = std::vector<std::pair<std::string, chat_role>>;
ChatHistory m_chat_history;

std::map<chat_role, std::string> chat_role_str = {{ chat_role::user, "user" },
{ chat_role::assistant, "assistant" },
{ chat_role::system, "system" }};

LLMPipelineImpl(
const ov::InferRequest& request,
@@ -136,22 +145,27 @@ class LLMPipeline::LLMPipelineImpl {

std::string text = *input_str;
// todo: make for batched inputs as well
if (is_chat_conversation)
text = apply_chat_template(text);
if (is_chat_conversation) {
m_chat_history.push_back(std::make_pair(text, chat_role::user));
auto new_templated_chat_history = apply_chat_template(m_chat_history);
text = new_templated_chat_history.substr(m_templated_chat_history.size());
std::cout << "templayed prompt: " << text << "\n";
m_templated_chat_history = new_templated_chat_history;
}

// previous prompt generation in chat dialog stops with the end of sentence token,
// need to append this token to the current prompt
if (is_chat_conversation && !m_is_cache_empty)
text = m_tokenizer.get_eos_token() + text;
// if (is_chat_conversation && !m_is_cache_empty)
// text = m_tokenizer.get_eos_token() + text;

auto res = m_tokenizer.encode(text);
auto input_ids = res.input_ids;
auto attention_mask = res.attention_mask;

// todo: W/A If sentence begins with special tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
// todo: W/A If a sentence begins with special tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
// but HF does not do that. Moreover, openvino_tokenizer always inserts <bos>, but in a chat scenario HF does not because skip_special_tokens=True.
// Need to remove both of those tokens manually to get exact token-by-token alignment with HF
auto size = input_ids.get_shape();
// auto size = input_ids.get_shape();
int64_t* inputs_data = input_ids.data<int64_t>();
std::vector<int64_t> tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1

@@ -169,12 +183,17 @@ class LLMPipeline::LLMPipelineImpl {
attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()});
std::copy(tmp_ids.begin(), tmp_ids.end(), input_ids.data<int64_t>());
std::copy(tmp_attn_mask.begin(), tmp_attn_mask.end(), attention_mask.data<int64_t>());

encoded_input = TokenizedInputs{input_ids, attention_mask};
}

auto encoded_results = generate(encoded_input, config, streamer);
return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
auto answer = m_tokenizer.decode(encoded_results.tokens);
if (is_chat_conversation) {
m_templated_chat_history.append(answer[0]);
m_chat_history.push_back(std::make_pair(answer[0], chat_role::assistant));
}
return {answer, encoded_results.scores};
}

EncodedResults generate(
@@ -242,7 +261,7 @@ class LLMPipeline::LLMPipelineImpl {
return result;
}

std::string apply_chat_template(const std::vector<std::pair<std::string, std::string>>& prompts) const {
std::string apply_chat_template(const ChatHistory& prompts) const {
jinja2::TemplateEnv env;
env.GetSettings().lstripBlocks = true;
env.GetSettings().trimBlocks = true;
@@ -251,7 +270,7 @@

jinja2::ValuesList messages;
for (const auto& [prompt, role] : prompts) {
messages.push_back(jinja2::ValuesMap{{"role", role}, {"content", prompt}});
messages.push_back(jinja2::ValuesMap{{"role", chat_role_str.at(role)}, {"content", prompt}});
}

jinja2::ValuesMap params = {
@@ -263,32 +282,6 @@

return tpl.RenderAsString(params).value();
}

std::string apply_chat_template(std::string prompt, std::string role = "user") const {
jinja2::TemplateEnv env;
env.GetSettings().lstripBlocks = true;
env.GetSettings().trimBlocks = true;
jinja2::Template tpl(&env);
tpl.Load(m_chat_template);

jinja2::ValuesMap message {{"role", role}, {"content", prompt}};
jinja2::ValuesMap params = {
{"messages", jinja2::ValuesList({message})},
{"bos_token", m_tokenizer.get_bos_token()},
{"eos_token", m_tokenizer.get_eos_token()},
{"add_generation_prompt", true},
};

return tpl.RenderAsString(params).value();
}

std::vector<std::string> apply_chat_template(std::vector<std::string>& prompts, std::string role = "user") const {
std::vector<std::string> res;
for (const auto& prompt: prompts) {
res.emplace_back(apply_chat_template(prompt));
}
return res;
}
};

DecodedResults LLMPipeline::generate(
@@ -394,6 +387,19 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(
// If eos_token_id was not provided, take the value from the tokenizer
if (m_generation_config.eos_token_id == -1)
m_generation_config.eos_token_id = m_tokenizer.get_eos_token_id();
// update chat template
if (m_chat_template.size()) {
std::map<std::string, std::string> replace_str_map = {{"\n'}", "\n' }"},
{".strip()", "\"\""}};
for (auto [from, to] : replace_str_map) {
size_t pos = 0;
while ( ( pos = m_chat_template.find( from, pos ) ) != std::string::npos )
{
m_chat_template.replace( pos, from.size(), to );
pos += to.size();
}
}
}
}

ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
@@ -404,11 +410,6 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
return m_pimpl->m_tokenizer;
}

std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const {
return m_pimpl->apply_chat_template(prompt, role);
}


void ov::genai::LLMPipeline::start_chat() {
m_pimpl->is_chat_conversation = true;
if (!m_pimpl->m_is_cache_empty) {
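For reference, a minimal Python sketch (not part of the commit) of the incremental templating scheme that generate() now follows in chat mode: the full history is re-templated on every user turn, and only the suffix beyond the previously templated text is passed on, the earlier turns already being covered by the pipeline's existing state. The model name is simply the one used in the CI job above.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')

history = []      # accumulated user/assistant turns, like m_chat_history
templated = ''    # everything templated so far, like m_templated_chat_history

def user_turn(text):
    # Template the whole history, then keep only the newly added suffix.
    global templated
    history.append({'role': 'user', 'content': text})
    full = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    delta = full[len(templated):]
    templated = full
    return delta

def assistant_turn(text):
    # Record the model's answer so the next delta starts after it.
    global templated
    history.append({'role': 'assistant', 'content': text})
    templated += text

print(user_turn('What is 2 + 2?'))   # first turn: the fully templated prompt
assistant_turn('2 + 2 equals 4.')
print(user_turn('Add 1 to it.'))     # later turns: only the newly templated part, not the whole history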
