Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ void regmodule_properties(py::module m) {
wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
wrap_property_RW(m_hint, ov::hint::kv_cache_preallocation_size, "kv_cache_preallocation_size");
wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
wrap_property_RW(m_hint, ov::hint::compiled_blob, "compiled_blob");

Expand Down
6 changes: 6 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,12 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};

/**
 * @brief Hint for the device to set the KV-cache preallocation size, expressed in tokens.
 *        Larger values reduce reallocation/copy frequency as the cache grows, at the cost
 *        of higher peak memory usage. Default is 128.
 * @ingroup ov_runtime_cpp_prop_api
 */
static constexpr Property<uint32_t, PropertyMutability::RW> kv_cache_preallocation_size{"KV_CACHE_PREALLOCATION_SIZE"};

/**
* @brief This property scales down activations to prevent overflows when inference precision is f16.
* @ingroup ov_runtime_cpp_prop_api
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ OV_CONFIG_RELEASE_OPTION(ov, cache_encryption_callbacks, ov::EncryptionCallbacks
OV_CONFIG_RELEASE_OPTION(ov::hint, dynamic_quantization_group_size, 0, "Dynamic quantization group size")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, dynamic_quantization_group_size_max, UINT64_MAX, "Maximum dynamic quantization group size. When group_size is set as a higher value than this number, dynamic quantization will be turned off")
OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_precision, ov::element::dynamic, "")
OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_preallocation_size, 128, "Preallocation token size for kv cache, default:128")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, enable_kernels_reuse, false, "")
OV_CONFIG_RELEASE_OPTION(ov, weights_path, "", "Path to the model weights file used for weightless caching")
OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floating point value that is used for runtime activation tensor scaling with fp16 inference precision")
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
// iteration.
// - Therefore, to avoid this situation where the allocation and copying occurs simultaneously for all the kv_cache_insts,
// we assigned different prealloc-size for each kv cache so that we could prevent a memory peak
return 128 + kv_cache_id % 64;
auto prealloc_size = this->get_config().get_kv_cache_preallocation_size();
return prealloc_size + kv_cache_id % 64;
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
Expand Down