Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ void regmodule_properties(py::module m) {
wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
wrap_property_RW(m_hint, ov::hint::kv_cache_preallocation_size, "kv_cache_preallocation_size");
wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
wrap_property_RW(m_hint, ov::hint::compiled_blob, "compiled_blob");

Expand Down
6 changes: 6 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,12 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};

/**
 * @brief Hint for the device to set the KV-cache preallocation size, expressed in tokens.
 *        Larger values reduce reallocation/copy frequency as the cache grows, at the cost
 *        of higher peak memory usage. Default is 128.
 * @ingroup ov_runtime_cpp_prop_api
 */
static constexpr Property<uint32_t, PropertyMutability::RW> kv_cache_preallocation_size{"KV_CACHE_PREALLOCATION_SIZE"};

/**
* @brief This property scales down activations to prevent overflows when inference precision is f16.
* @ingroup ov_runtime_cpp_prop_api
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ OV_CONFIG_RELEASE_OPTION(ov, cache_encryption_callbacks, ov::EncryptionCallbacks
OV_CONFIG_RELEASE_OPTION(ov::hint, dynamic_quantization_group_size, 0, "Dynamic quantization group size")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, dynamic_quantization_group_size_max, UINT64_MAX, "Maximum dynamic quantization group size. When group_size is set as a higher value than this number, dynamic quantization will be turned off")
OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_precision, ov::element::dynamic, "")
OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_preallocation_size, 128, "Preallocation token size for kv cache, default:128")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, enable_kernels_reuse, false, "")
OV_CONFIG_RELEASE_OPTION(ov, weights_path, "", "Path to the model weights file used for weightless caching")
OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floating point value that is used for runtime activation tensor scaling with fp16 inference precision")
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
// iteration.
// - Therefore, to avoid this situation where the allocation and copying occurs simultaneously for all the kv_cache_insts,
// we assigned different prealloc-size for each kv cache so that we could prevent a memory peak
return 128 + kv_cache_id % 64;
auto prealloc_size = this->get_config().get_kv_cache_preallocation_size();
return prealloc_size + kv_cache_id % 64;
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
Expand Down