Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1359,6 +1359,8 @@ struct SEPARATE_WEIGHTS_VERSION final : OptionBase<SEPARATE_WEIGHTS_VERSION, ov:
}

// Default "weights separation" implementation version advertised by this option.
static ov::intel_npu::WSVersion defaultValue() {
// Note: if the compiler-in-plugin is used (intel_npu::compiler_type = intel_npu::CompilerType::PLUGIN), then
// the default is actually WSVersion::ONE_SHOT
return ov::intel_npu::WSVersion::ITERATIVE;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ static constexpr ov::Property<BatchMode> batch_mode{"NPU_BATCH_MODE"};

/**
* @brief [Experimental, only for NPU Plugin]
* Type: enum. Default is "ITERATIVE".
* Type: enum. Default is "ITERATIVE". If the compiler-in-plugin is used (intel_npu::compiler_type =
* intel_npu::CompilerType::PLUGIN), then the default becomes "ONE_SHOT".
*
* The value stored in this entry indicates which implementation of the "weights separation" feature will be used.
* Note: NPU_COMPILER_TYPE = DRIVER & NPU_SEPARATE_WEIGHTS_VERSION = ONE_SHOT are not compatible.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ class WeightsPointerAttribute : public ov::RuntimeAttribute {
return true;
}

// This attribute carries a raw memory address ("memory_pointer" below), which
// presumably differs from one process/run to the next, so it must be excluded
// from deterministic operations such as model hashing/caching. TODO(review):
// confirm the caching interaction against the consumers of this attribute.
bool is_deterministic() const override {
return false;
}

size_t memory_pointer;
size_t byte_size;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

namespace intel_npu {

using SerializedIR = std::pair<size_t, std::shared_ptr<uint8_t>>;
// Result of serializing an ov::Model for the compiler adapter.
// NOTE: members are aggregate-initialized positionally ({buffer, size}) by the
// serializers — do not reorder the fields.
struct SerializedIR {
std::shared_ptr<uint8_t> buffer;  // owning pointer to the serialized IR bytes
size_t size;  // number of valid bytes in "buffer"
std::optional<uint64_t> hash = std::nullopt;  // optional model hash; presumably set only when hash computation was requested — confirm with serializeIR callers
};

/**
* @brief Contain all required transformation on OpenVINO model in case for external compiler usage and
Expand All @@ -35,7 +39,9 @@ namespace driver_compiler_utils {
SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpsetVersion,
const bool useBaseModelSerializer = true);
const bool useBaseModelSerializer = true,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false);

/**
* @brief Serialize input / output information to string format.
Expand Down
119 changes: 57 additions & 62 deletions src/plugins/intel_npu/src/compiler_adapter/src/vcl_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,25 +162,6 @@ void storeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
}
}

/**
* @brief Removes the attributes stored by "storeWeightsPointerAttribute" in order to restore the model to its original
* state.
* @see storeWeightsPointerAttribute for details.
*/
void removeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
for (auto&& node : model->get_ops()) {
if (!ov::is_type<ov::op::v0::Constant>(node)) {
continue;
}

ov::RTMap& runtimeInfoMap = node->get_rt_info();
const auto& resultIt = runtimeInfoMap.find(intel_npu::WeightsPointerAttribute::get_type_info_static());
if (resultIt != runtimeInfoMap.end()) {
runtimeInfoMap.erase(resultIt);
}
}
}

} // namespace

namespace intel_npu::driver_compiler_utils {
Expand All @@ -194,19 +175,15 @@ class VCLSerializerBase {
public:
VCLSerializerBase(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: _logger("VCLSerializerBase", Logger::global().level()),
_compilerVersion(compilerVersion),
_supportedOpset(supportedOpset) {
// There is no const variant of run_passes so use const_cast here
// as model serialization does not mutate the model
_supportedOpset(supportedOpset),
_computeModelHash(computeModelHash),
_storeWeightlessCacheAttribute(storeWeightlessCacheAttribute) {
_model = std::const_pointer_cast<ov::Model>(origModel);

if (supportedOpset < 11) {
// Need to clone to modify the model and remain thread safe
_model = _model->clone();
_logger.info("Clone model for offset smaller than 11");
}
}

virtual SerializedIR serialize() = 0;
Expand All @@ -223,46 +200,42 @@ class VCLSerializerBase {
void serialize_model_to_stream(const std::function<void(ov::pass::Manager&)>& register_serialization_pass) {
_logger.debug("serialize_model_to_stream");
const auto passConfig = std::make_shared<ov::pass::PassConfig>();

// Step 1: run compatibility passes
ov::pass::Manager manager(std::move(passConfig), "NPU:serialize_model_to_stream");

if (_supportedOpset < 11) {
// Downgrade to opset10
manager.register_pass<ov::pass::ConvertInterpolate11ToInterpolate4>();
_logger.info("Downgrade op for opset smaller than 11");
}

// Step 2: store the WeightlessCacheAttribute if requested
// Step 3: serialize
// Step 4: compute the hash if requested
register_serialization_pass(manager);

// We modify the original model object here therefore a mutex is required
static std::mutex rtInfoMutex;

{
std::lock_guard<std::mutex> lock(rtInfoMutex);
// Depending on the driver version, the compiler attached to it may request this information as an indicator
// of the precision/layout preprocessing requirement. We are setting this value to "true" since the API
// version is no longer a cause for altering the metadata. This is due to the preprocessing performed in the
// OpenVINO framework's implementaion, the "ov::Model" object is preprocessed before reaching the NPU
// plugin.
_model->set_rt_info(true, "is_new_api");
// Flag used for indicating an NPU plugin version which switched the I/O identification convention from
// names to indices. The flag is required in order to inform the driver-compiler adapter to expect indices
// when attempting to deserialize the I/O metadata.
_model->set_rt_info(true, "use_indices_for_io_metadata");

// Depending on the driver version, the compiler attached to it may request this information as an indicator
// of the precision/layout preprocessing requirement. We are setting this value to "true" since the API
// version is no longer a cause for altering the metadata. This is due to the preprocessing performed in the
// OpenVINO framework's implementation, the "ov::Model" object is preprocessed before reaching the NPU
// plugin.
_model->set_rt_info(true, "is_new_api");
// Flag used for indicating an NPU plugin version which switched the I/O identification convention from
// names to indices. The flag is required in order to inform the driver-compiler adapter to expect indices
// when attempting to deserialize the I/O metadata.
_model->set_rt_info(true, "use_indices_for_io_metadata");
manager.run_passes(_model);

manager.run_passes(_model);

auto& rtInfo = _model->get_rt_info();
rtInfo.erase("is_new_api");
rtInfo.erase("use_indices_for_io_metadata");
}
_logger.debug("serialize_model_to_stream end");
}

Logger _logger;
std::shared_ptr<ov::Model> _model = nullptr;
ze_graph_compiler_version_info_t _compilerVersion;
uint32_t _supportedOpset = 11;
bool _computeModelHash;
bool _storeWeightlessCacheAttribute;
};

/**
Expand All @@ -272,8 +245,14 @@ class VCLSerializerWithWeightsCopy : public VCLSerializerBase {
public:
VCLSerializerWithWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
: VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: VCLSerializerBase(origModel,
compilerVersion,
supportedOpset,
computeModelHash,
storeWeightlessCacheAttribute) {
_logger.setName("VCLSerializerWithWeightsCopy");
};

Expand Down Expand Up @@ -331,7 +310,7 @@ class VCLSerializerWithWeightsCopy : public VCLSerializerBase {

OPENVINO_ASSERT(offset == sizeOfSerializedIR);

return std::make_pair(sizeOfSerializedIR, buffer);
return {buffer, sizeOfSerializedIR};
}

private:
Expand Down Expand Up @@ -392,8 +371,14 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
public:
VCLSerializerWithoutWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
: VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: VCLSerializerBase(origModel,
compilerVersion,
supportedOpset,
computeModelHash,
storeWeightlessCacheAttribute) {
_logger.setName("VCLSerializerWithoutWeightsCopy");
};

Expand All @@ -411,7 +396,7 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
std::shared_ptr<uint8_t> buffer(new uint8_t[_serializedModelSize], std::default_delete<uint8_t[]>());
serialize_model_to_buffer(buffer.get());

return SerializedIR(_serializedModelSize, buffer);
return {buffer, _serializedModelSize};
}

private:
Expand Down Expand Up @@ -455,19 +440,29 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpsetVersion,
const bool useBaseModelSerializer) {
const bool useBaseModelSerializer,
const bool computeModelHash,
const bool storeWeightlessCacheAttribute) {
if (!useBaseModelSerializer) {
// Non-constness required for adding & removing weights pointer attributes. The current instance is already a
// clone (or should be one), we are not modifying the original model.
const std::shared_ptr<ov::Model> nonConstantModel = std::const_pointer_cast<ov::Model>(model);
storeWeightsPointerAttribute(nonConstantModel);

SerializedIR serializedIR =
VCLSerializerWithoutWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
removeWeightsPointerAttribute(nonConstantModel);
SerializedIR serializedIR = VCLSerializerWithoutWeightsCopy(model,
compilerVersion,
supportedOpsetVersion,
computeModelHash,
storeWeightlessCacheAttribute)
.serialize();
return serializedIR;
}
return VCLSerializerWithWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
return VCLSerializerWithWeightsCopy(model,
compilerVersion,
supportedOpsetVersion,
computeModelHash,
storeWeightlessCacheAttribute)
.serialize();
}

std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,8 @@ std::unordered_set<std::string> ZeGraphExtWrappers::queryGraph(SerializedIR seri
ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NGRAPH_LITE,
serializedIR.first,
serializedIR.second.get(),
serializedIR.size,
serializedIR.buffer.get(),
buildFlags.c_str(),
ZE_GRAPH_FLAG_NONE};

Expand Down Expand Up @@ -335,23 +335,30 @@ bool ZeGraphExtWrappers::canCpuVaBeImported(const void* data, size_t size) const
return true;
}

// ze_graph_input_hash_t graphInputHash = {};
// graphInputHash.stype = ZE_STRUCTURE_TYPE_GRAPH_INPUT_HASH;
// graphInputHash.hash = hash;

GraphDescriptor ZeGraphExtWrappers::getGraphDescriptor(SerializedIR serializedIR,
const std::string& buildFlags,
const bool bypassUmdCache) const {
ze_graph_handle_t graphHandle = nullptr;

const uint64_t hash = 14;
ze_graph_input_hash_t modelHash = {ZE_STRUCTURE_TYPE_GRAPH_INPUT_HASH, nullptr, hash};

uint32_t flags = ZE_GRAPH_FLAG_NONE;
if (bypassUmdCache) {
_logger.debug("getGraphDescriptor - set ZE_GRAPH_FLAG_DISABLE_CACHING");
flags |= ZE_GRAPH_FLAG_DISABLE_CACHING;
}

ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
&modelHash,
ZE_GRAPH_FORMAT_NGRAPH_LITE,
serializedIR.first,
serializedIR.second.get(),
buildFlags.c_str(),
serializedIR.size,
serializedIR.buffer.get(),
"",
flags};

_logger.debug("getGraphDescriptor - perform pfnCreate2");
Expand All @@ -360,6 +367,7 @@ GraphDescriptor ZeGraphExtWrappers::getGraphDescriptor(SerializedIR serializedIR
&desc,
&graphHandle);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate2", result, _zeroInitStruct->getGraphDdiTable());
OPENVINO_THROW("");

return GraphDescriptor{graphHandle};
}
Expand Down