diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
index e36c044c518809..8ccd07dae2b95b 100644
--- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
+++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -101,6 +101,7 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
     wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
     wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
+    wrap_property_RW(m_hint, ov::hint::kv_cache_preallocation_size, "kv_cache_preallocation_size");
     wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
     wrap_property_RW(m_hint, ov::hint::compiled_blob, "compiled_blob");
 
diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp
index 3abb527cfb8f6c..08b52a2dc1d2da 100644
--- a/src/inference/include/openvino/runtime/properties.hpp
+++ b/src/inference/include/openvino/runtime/properties.hpp
@@ -603,6 +603,12 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
 static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
 
 /**
+ * @brief Hint for device to set kv cache preallocation token size. Default is 128.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> kv_cache_preallocation_size{"KV_CACHE_PREALLOCATION_SIZE"};
+
+/**
  * @brief This property scales down activations to prevent overflows when inference precision is f16.
 * @ingroup ov_runtime_cpp_prop_api
 */
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
index f3c17cb0d283b5..b672d80b5153c2 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
@@ -30,6 +30,7 @@ OV_CONFIG_RELEASE_OPTION(ov, cache_encryption_callbacks, ov::EncryptionCallbacks
 OV_CONFIG_RELEASE_OPTION(ov::hint, dynamic_quantization_group_size, 0, "Dynamic quantization group size")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, dynamic_quantization_group_size_max, UINT64_MAX, "Maximum dynamic quantization group size. When group_size is set as a higher value than this number, dynamic quantization will be turned off")
 OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_precision, ov::element::dynamic, "")
+OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_preallocation_size, 128, "Preallocation token size for kv cache, default:128")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, enable_kernels_reuse, false, "")
 OV_CONFIG_RELEASE_OPTION(ov, weights_path, "", "Path to the model weights file used for weightless caching")
 OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floating point value that is used for runtime activation tensor scaling with fp16 inference precision")
diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp
index 01ba3d92c28dcc..b85b3e986f8e33 100644
--- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp
@@ -103,7 +103,8 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
     // iteration.
     // - Therfore, to avoid this situation where the allocation and copying occurs simutaneously for all the kv_cache_insts,
     // we assigned different prealloc-size for each kv cache so that we could prevent a memory peak
-    return 128 + kv_cache_id % 64;
+    auto prealloc_size = this->get_config().get_kv_cache_preallocation_size();
+    return prealloc_size + kv_cache_id % 64;
 }
 
 void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {