
Commit

map stop_criteria in pybind;
fix genai env manager on Win;
fix failing multibatch tests;
pavel-esir committed May 30, 2024
1 parent 9389930 commit 037c803
Showing 15 changed files with 364 additions and 186 deletions.
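The pybind mapping named in the commit title is among the 15 changed files but is not visible in this excerpt. A minimal sketch of what exposing the renamed enum with pybind11 could look like (the module name py_generate_pipeline and the binding layout are assumptions, not the commit's actual code):

    // Hypothetical binding sketch; the real binding file is not part of this excerpt.
    #include <pybind11/pybind11.h>
    #include "openvino/genai/generation_config.hpp"

    namespace py = pybind11;

    PYBIND11_MODULE(py_generate_pipeline, m) {
        py::enum_<ov::genai::StopCriteria>(m, "StopCriteria")
            .value("EARLY", ov::genai::StopCriteria::EARLY)
            .value("HEURISTIC", ov::genai::StopCriteria::HEURISTIC)
            .value("NEVER", ov::genai::StopCriteria::NEVER)
            .export_values();
    }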
16 changes: 8 additions & 8 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -16,11 +16,11 @@ namespace genai {

/**
* @brief controls the stopping condition for grouped beam search. The following values are possible:
* "early" stops as soon as there are `num_beams` complete candidates.
"heuristic" stops when is it unlikely to find better candidates.
"never" stops when there cannot be better candidates.
* "EARLY" stops as soon as there are `num_beams` complete candidates.
"HEURISTIC" stops when is it unlikely to find better candidates.
"NEVER" stops when there cannot be better candidates.
*/
enum class StopCriteria { early, heuristic, never };
enum class StopCriteria { EARLY, HEURISTIC, NEVER };

/**
* @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
@@ -50,9 +50,9 @@ enum class StopCriteria { early, heuristic, never };
* @param num_return_sequences the number of sequences to return for grouped beam search decoding.
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
* @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
* "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
* heuristic is applied and the generation stops when is it very unlikely to find better candidates;
* "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
* "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where an
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
*
* Random sampling parameters:
* @param temperature the value used to modulate token probabilities for random sampling.
@@ -78,7 +78,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
float length_penalty = 1.0f;
size_t num_return_sequences = 1;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
StopCriteria stop_criteria = StopCriteria::heuristic;
StopCriteria stop_criteria = StopCriteria::HEURISTIC;

// Multinomial
float temperature = 1.0f;
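For context, a minimal usage sketch of the renamed values through the public GenerationConfig shown above (pipeline construction, the model directory, and the prompt are illustrative, not taken from this commit):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        ov::genai::LLMPipeline pipe("model_dir", "CPU");       // hypothetical model directory
        ov::genai::GenerationConfig config = pipe.get_generation_config();
        config.max_new_tokens = 64;
        config.num_beams = 4;                                   // grouped beam search
        config.stop_criteria = ov::genai::StopCriteria::EARLY;  // stop once num_beams candidates are complete
        ov::genai::DecodedResults results = pipe.generate("The Sun is yellow because", config);
        for (const std::string& text : results.texts)          // decoded candidate sequences
            std::cout << text << std::endl;
        return 0;
    }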
6 changes: 3 additions & 3 deletions src/cpp/src/generation_config.cpp
@@ -44,11 +44,11 @@ GenerationConfig::GenerationConfig(std::string json_path) {
if (data.contains("early_stopping")) {
auto field_type = data["early_stopping"].type();
if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") {
stop_criteria = StopCriteria::never;
stop_criteria = StopCriteria::NEVER;
} else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) {
stop_criteria = StopCriteria::early;
stop_criteria = StopCriteria::EARLY;
} else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) {
stop_criteria = StopCriteria::heuristic;
stop_criteria = StopCriteria::HEURISTIC;
}
}

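The block above mirrors Hugging Face's early_stopping field. A short sketch of the three cases, using the JSON-path constructor shown in this hunk (the file path is illustrative):

    // "early_stopping": true    -> StopCriteria::EARLY
    // "early_stopping": false   -> StopCriteria::HEURISTIC
    // "early_stopping": "never" -> StopCriteria::NEVER
    ov::genai::GenerationConfig config("model_dir/generation_config.json");  // hypothetical path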
3 changes: 1 addition & 2 deletions src/cpp/src/greedy_decoding.cpp
@@ -63,8 +63,6 @@ EncodedResults greedy_decoding(
auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>();
std::iota(beam_data, beam_data + running_batch_size, 0);

size_t max_tokens = generation_config.get_max_new_tokens(prompt_len);

m_model_runner.infer();
auto logits = m_model_runner.get_tensor("logits");
ov::Shape logits_shape = logits.get_shape();
@@ -88,6 +86,7 @@
if (!generation_config.ignore_eos && all_are_eos)
return results;

size_t max_tokens = generation_config.get_max_new_tokens(prompt_len);
for (size_t i = 0; i < max_tokens - 1; ++i) {
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
8 changes: 4 additions & 4 deletions src/cpp/src/group_beam_searcher.cpp
@@ -91,7 +91,7 @@ struct Parameters {
size_t group_size = 5;
float diversity_penalty = 1.0;
size_t max_new_tokens = 20;
ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic;
ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC;
float length_penalty = 1.0;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();

@@ -128,15 +128,15 @@ struct Group {
float best_sum_logprobs = ongoing.front().score;
float worst_score = min_heap.front().score;
switch (parameters.stop_criteria) {
case ov::genai::StopCriteria::early:
case ov::genai::StopCriteria::EARLY:
done = true;
return;
case ov::genai::StopCriteria::heuristic: {
case ov::genai::StopCriteria::HEURISTIC: {
float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
done = worst_score >= highest_attainable_score;
return;
}
case ov::genai::StopCriteria::never: {
case ov::genai::StopCriteria::NEVER: {
size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
done = worst_score >= highest_attainable_score;
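To make the two bounds above concrete, a small standalone numeric sketch (values are made up for illustration; this is not code from the commit):

    #include <cmath>
    #include <cstdio>

    int main() {
        float best_sum_logprobs = -6.0f;  // score of the best ongoing beam (log-probabilities are negative)
        float length_penalty = 1.0f;
        size_t cur_len = 12;              // current sequence length
        size_t max_new_tokens = 20;

        // HEURISTIC: optimistic bound assuming the best beam stops at the current length
        float heuristic_bound = best_sum_logprobs / std::pow(float(cur_len), length_penalty);    // -0.5
        // NEVER (with length_penalty > 0): bound assumes the best beam keeps growing to max_new_tokens
        float never_bound = best_sum_logprobs / std::pow(float(max_new_tokens), length_penalty); // -0.3
        std::printf("heuristic bound %.2f, never bound %.2f\n", heuristic_bound, never_bound);
        // The NEVER bound is higher, so worst_score >= bound is harder to satisfy
        // and the group keeps searching longer before it is marked done.
        return 0;
    }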
61 changes: 4 additions & 57 deletions src/cpp/src/llm_pipeline.cpp
@@ -15,33 +15,6 @@
#include "utils.hpp"
#include "text_callback_streamer.hpp"

#ifdef _WIN32
# include <windows.h>
# define MAX_ABS_PATH _MAX_PATH
# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH)
#else
# include <dlfcn.h>
# include <limits.h>
# define MAX_ABS_PATH PATH_MAX
# define get_absolute_path(result, path) realpath(path.c_str(), result)
namespace {
std::string get_absolute_file_path(const std::string& path) {
std::string absolutePath;
absolutePath.resize(MAX_ABS_PATH);
std::ignore = get_absolute_path(&absolutePath[0], path);
if (!absolutePath.empty()) {
// on Linux if file does not exist or no access, function will return NULL, but
// `absolutePath` will contain resolved path
absolutePath.resize(absolutePath.find('\0'));
return std::string(absolutePath);
}
std::stringstream ss;
ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno);
throw std::runtime_error(ss.str());
}
}
#endif

namespace {

const std::string STREAMER_ARG_NAME = "streamer";
@@ -86,30 +59,6 @@ std::string from_tokenizer_json_if_exists(const std::string& path) {
return res;
}



std::string get_ov_genai_library_path() {
#ifdef _WIN32
CHAR genai_library_path[MAX_PATH];
HMODULE hm = NULL;
if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
reinterpret_cast<LPSTR>(get_ov_genai_library_path),
&hm)) {
std::stringstream ss;
ss << "GetModuleHandle returned " << GetLastError();
throw std::runtime_error(ss.str());
}
GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path));
return std::string(genai_library_path);
#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__)
Dl_info info;
dladdr(reinterpret_cast<void*>(get_ov_genai_library_path), &info);
return get_absolute_file_path(info.dli_fname).c_str();
#else
# error "Unsupported OS"
#endif // _WIN32
}

ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) {
ov::genai::StreamerVariant streamer = std::monostate();

@@ -194,6 +143,8 @@ class LLMPipeline::LLMPipelineImpl {
) {
GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;

auto old_pad_token_id = m_tokenizer.get_pad_token_id();
m_tokenizer.set_pad_token_id(config.pad_token_id);

EncodedInputs encoded_input;
if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
@@ -214,7 +165,6 @@ class LLMPipeline::LLMPipelineImpl {
auto input_ids = res.input_ids;
auto attention_mask = res.attention_mask;


// todo: W/A If a sentence begins with a special token (<bos>, <s>, etc.) openvino_tokenizer inserts 2 extra special tokens <bos> and "▁",
// but HF does not do that. Moreover openvino_tokenizer always inserts <bos> but in chat scenario HF does not do that because skip_special_tokens=True.
// Need to remove both of that tokens manually to get exact token by token alignment with HF
@@ -243,6 +193,7 @@ class LLMPipeline::LLMPipelineImpl {

encoded_input = TokenizedInputs{input_ids, attention_mask};
}
m_tokenizer.set_pad_token_id(old_pad_token_id);
auto encoded_results = generate(encoded_input, config, streamer);
return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
}
@@ -285,8 +236,6 @@ class LLMPipeline::LLMPipelineImpl {
OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding");
}

// auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids);

if (config.is_greedy_decoding()) {
result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr, is_chat_conversation);
} else if (config.is_beam_search()) {
@@ -431,14 +380,12 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(
const ov::AnyMap& config
):
m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()},
m_tokenizer(path, device),
m_generation_config{from_config_json_if_exists(path)},
m_chat_template{from_tokenizer_json_if_exists(path)}
{
ov::genai::utils::GenAIEnvManager env_manager(get_ov_genai_library_path());
m_tokenizer = Tokenizer(path, device);
}


ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
return m_pimpl->m_generation_config;
}
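Note on the pad token handling added in this file: the old id is saved and restored manually around tokenization. A sketch of an exception-safe alternative using an RAII guard (the guard type is hypothetical; the Tokenizer getter/setter are the ones used in the hunk above):

    // Hypothetical helper: restores the previous pad token id even if tokenization throws.
    class PadTokenGuard {
    public:
        PadTokenGuard(ov::genai::Tokenizer& tokenizer, int64_t pad_token_id)
            : m_tokenizer(tokenizer), m_old_id(tokenizer.get_pad_token_id()) {
            m_tokenizer.set_pad_token_id(pad_token_id);
        }
        ~PadTokenGuard() { m_tokenizer.set_pad_token_id(m_old_id); }
    private:
        ov::genai::Tokenizer& m_tokenizer;
        int64_t m_old_id;
    };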
88 changes: 88 additions & 0 deletions src/cpp/src/streamer_base.cpp
@@ -0,0 +1,88 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

// class LambdaStreamer::LambdaStreamerImpl {
// public:
// LambdaStreamerImpl(Tokenizer tokenizer, std::function<bool(std::string)> func): m_tokenizer(tokenizer), m_func(func) {}
// LambdaStreamerImpl(std::function<bool(std::string)> func): m_func(func) {}

// Tokenizer m_tokenizer;
// std::function<bool(std::string)> m_func;
// bool m_print_eos_token = false;
// std::vector<int64_t> m_tokens_cache;
// size_t print_len = 0;

// bool put(int64_t token) {
// std::stringstream res;
// // do nothing if <eos> token is met and if print_eos_token=false
// if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id())
// return m_func(res.str());

// m_tokens_cache.push_back(token);
// std::string text = m_tokenizer.decode(m_tokens_cache);
// if (!text.empty() && '\n' == text.back()) {
// // Flush the cache after the new line symbol
// res << std::string_view{text.data() + print_len, text.size() - print_len};
// m_tokens_cache.clear();
// print_len = 0;
// return m_func(res.str());
// }
// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
// // Don't print incomplete text
// return m_func(res.str());
// }
// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
// print_len = text.size();
// return m_func(res.str());
// }

// bool end() {
// std::stringstream res;
// std::string text = m_tokenizer.decode(m_tokens_cache);
// res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
// m_tokens_cache.clear();
// print_len = 0;
// return m_func(res.str());
// }

// };

// LambdaStreamer::LambdaStreamer(Tokenizer tokenizer, std::function<bool(std::string)> func) {}

// LambdaStreamer::LambdaStreamer(std::function<bool(std::string)> func) {
// m_pimpl = std::make_shared<LambdaStreamer::LambdaStreamerImpl>(func);
// }

// void LambdaStreamer::put(int64_t token) { m_pimpl -> put(token);}

// void LambdaStreamer::end() { m_pimpl->end();}

} // namespace genai
} // namespace ov



// class LambdaStreamer: public StreamerBase {
// public:
// // LambdaStreamer(Tokenizer tokenizer, std::function<bool(std::string)> func);
// LambdaStreamer(std::function<bool(std::string)> func);

// void put(int64_t token) override;
// void end() override;

// bool operator==(const LambdaStreamer& other) const {
// // For simplicity, we assume lambdas are not comparable.
// // If you need to compare actual logic, you may need to use type erasure or another method.
// return false; // This can be changed based on your specific needs.
// }
// private:

// class LambdaStreamerImpl;
// std::shared_ptr<LambdaStreamerImpl> m_pimpl;
// };
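The LambdaStreamer above is committed as commented-out scaffolding only. For reference, a minimal sketch of a callback streamer implementing the put/end interface used in this file (assuming StreamerBase only requires those two overrides, as the commented code suggests; decoding is simplified to one token at a time, so multi-token characters may print incompletely, unlike LambdaStreamerImpl's token cache):

    #include <functional>
    #include <string>
    #include <vector>
    #include "openvino/genai/streamer_base.hpp"
    #include "openvino/genai/tokenizer.hpp"

    class CallbackStreamer : public ov::genai::StreamerBase {
    public:
        CallbackStreamer(ov::genai::Tokenizer tokenizer, std::function<bool(std::string)> func)
            : m_tokenizer(tokenizer), m_func(func) {}

        // Forward the decoded text of each generated token to the user callback.
        void put(int64_t token) override {
            m_func(m_tokenizer.decode(std::vector<int64_t>{token}));
        }

        // Signal the end of generation.
        void end() override {
            m_func("\n");
        }

    private:
        ov::genai::Tokenizer m_tokenizer;
        std::function<bool(std::string)> m_func;
    };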