Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1359,6 +1359,8 @@ struct SEPARATE_WEIGHTS_VERSION final : OptionBase<SEPARATE_WEIGHTS_VERSION, ov:
}

// Default "weights separation" implementation version advertised by this option.
static ov::intel_npu::WSVersion defaultValue() {
// Note: if the compiler-in-plugin is used (intel_npu::compiler_type = intel_npu::CompilerType::PLUGIN), then
// the default is actually WSVersion::ONE_SHOT
return ov::intel_npu::WSVersion::ITERATIVE;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ static constexpr ov::Property<BatchMode> batch_mode{"NPU_BATCH_MODE"};

/**
* @brief [Experimental, only for NPU Plugin]
* Type: enum. Default is "ITERATIVE".
* Type: enum. Default is "ITERATIVE". If the compiler-in-plugin is used (intel_npu::compiler_type =
* intel_npu::CompilerType::PLUGIN), then the default becomes "ONE_SHOT".
*
* The value stored in this entry indicates which implementation of the "weights separation" feature will be used.
* Note: NPU_COMPILER_TYPE = DRIVER & NPU_SEPARATE_WEIGHTS_VERSION = ONE_SHOT are not compatible.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ class WeightsPointerAttribute : public ov::RuntimeAttribute {
return true;
}

// This attribute carries a raw memory address ("memory_pointer" below), which
// presumably differs from one process/run to the next, so it must be excluded
// from deterministic operations such as model hashing/caching. TODO(review):
// confirm the caching interaction against the consumers of this attribute.
bool is_deterministic() const override {
return false;
}

size_t memory_pointer;
size_t byte_size;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

namespace intel_npu {

using SerializedIR = std::pair<size_t, std::shared_ptr<uint8_t>>;
// Result of serializing an ov::Model for the compiler adapter.
// NOTE: members are aggregate-initialized positionally ({buffer, size}) by the
// serializers — do not reorder the fields.
struct SerializedIR {
std::shared_ptr<uint8_t> buffer;  // owning pointer to the serialized IR bytes
size_t size;  // number of valid bytes in "buffer"
std::optional<uint64_t> hash = std::nullopt;  // optional model hash; presumably set only when hash computation was requested — confirm with serializeIR callers
};

/**
* @brief Contain all required transformation on OpenVINO model in case for external compiler usage and
Expand All @@ -35,7 +39,9 @@ namespace driver_compiler_utils {
SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpsetVersion,
const bool useBaseModelSerializer = true);
const bool useBaseModelSerializer = true,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false);

/**
* @brief Serialize input / output information to string format.
Expand Down
119 changes: 57 additions & 62 deletions src/plugins/intel_npu/src/compiler_adapter/src/vcl_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,25 +162,6 @@ void storeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
}
}

/**
* @brief Removes the attributes stored by "storeWeightsPointerAttribute" in order to restore the model to its original
* state.
* @see storeWeightsPointerAttribute for details.
*/
void removeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
for (auto&& node : model->get_ops()) {
if (!ov::is_type<ov::op::v0::Constant>(node)) {
continue;
}

ov::RTMap& runtimeInfoMap = node->get_rt_info();
const auto& resultIt = runtimeInfoMap.find(intel_npu::WeightsPointerAttribute::get_type_info_static());
if (resultIt != runtimeInfoMap.end()) {
runtimeInfoMap.erase(resultIt);
}
}
}

} // namespace

namespace intel_npu::driver_compiler_utils {
Expand All @@ -194,19 +175,15 @@ class VCLSerializerBase {
public:
VCLSerializerBase(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: _logger("VCLSerializerBase", Logger::global().level()),
_compilerVersion(compilerVersion),
_supportedOpset(supportedOpset) {
// There is no const variant of run_passes so use const_cast here
// as model serialization does not mutate the model
_supportedOpset(supportedOpset),
_computeModelHash(computeModelHash),
_storeWeightlessCacheAttribute(storeWeightlessCacheAttribute) {
_model = std::const_pointer_cast<ov::Model>(origModel);

if (supportedOpset < 11) {
// Need to clone to modify the model and remain thread safe
_model = _model->clone();
_logger.info("Clone model for offset smaller than 11");
}
}

virtual SerializedIR serialize() = 0;
Expand All @@ -223,46 +200,42 @@ class VCLSerializerBase {
void serialize_model_to_stream(const std::function<void(ov::pass::Manager&)>& register_serialization_pass) {
_logger.debug("serialize_model_to_stream");
const auto passConfig = std::make_shared<ov::pass::PassConfig>();

// Step 1: run compatibility passes
ov::pass::Manager manager(std::move(passConfig), "NPU:serialize_model_to_stream");

if (_supportedOpset < 11) {
// Downgrade to opset10
manager.register_pass<ov::pass::ConvertInterpolate11ToInterpolate4>();
_logger.info("Downgrade op for opset smaller than 11");
}

// Step 2: store the WeightlessCacheAttribute if requested
// Step 3: serialize
// Step 4: compute the hash if requested
register_serialization_pass(manager);

// We modify the original model object here therefore a mutex is required
static std::mutex rtInfoMutex;

{
std::lock_guard<std::mutex> lock(rtInfoMutex);
// Depending on the driver version, the compiler attached to it may request this information as an indicator
// of the precision/layout preprocessing requirement. We are setting this value to "true" since the API
// version is no longer a cause for altering the metadata. This is due to the preprocessing performed in the
// OpenVINO framework's implementaion, the "ov::Model" object is preprocessed before reaching the NPU
// plugin.
_model->set_rt_info(true, "is_new_api");
// Flag used for indicating an NPU plugin version which switched the I/O identification convention from
// names to indices. The flag is required in order to inform the driver-compiler adapter to expect indices
// when attempting to deserialize the I/O metadata.
_model->set_rt_info(true, "use_indices_for_io_metadata");

// Depending on the driver version, the compiler attached to it may request this information as an indicator
// of the precision/layout preprocessing requirement. We are setting this value to "true" since the API
// version is no longer a cause for altering the metadata. This is due to the preprocessing performed in the
// OpenVINO framework's implementation, the "ov::Model" object is preprocessed before reaching the NPU
// plugin.
_model->set_rt_info(true, "is_new_api");
// Flag used for indicating an NPU plugin version which switched the I/O identification convention from
// names to indices. The flag is required in order to inform the driver-compiler adapter to expect indices
// when attempting to deserialize the I/O metadata.
_model->set_rt_info(true, "use_indices_for_io_metadata");
manager.run_passes(_model);

manager.run_passes(_model);

auto& rtInfo = _model->get_rt_info();
rtInfo.erase("is_new_api");
rtInfo.erase("use_indices_for_io_metadata");
}
_logger.debug("serialize_model_to_stream end");
}

Logger _logger;
std::shared_ptr<ov::Model> _model = nullptr;
ze_graph_compiler_version_info_t _compilerVersion;
uint32_t _supportedOpset = 11;
bool _computeModelHash;
bool _storeWeightlessCacheAttribute;
};

/**
Expand All @@ -272,8 +245,14 @@ class VCLSerializerWithWeightsCopy : public VCLSerializerBase {
public:
VCLSerializerWithWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
: VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: VCLSerializerBase(origModel,
compilerVersion,
supportedOpset,
computeModelHash,
storeWeightlessCacheAttribute) {
_logger.setName("VCLSerializerWithWeightsCopy");
};

Expand Down Expand Up @@ -331,7 +310,7 @@ class VCLSerializerWithWeightsCopy : public VCLSerializerBase {

OPENVINO_ASSERT(offset == sizeOfSerializedIR);

return std::make_pair(sizeOfSerializedIR, buffer);
return {buffer, sizeOfSerializedIR};
}

private:
Expand Down Expand Up @@ -392,8 +371,14 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
public:
VCLSerializerWithoutWeightsCopy(const std::shared_ptr<const ov::Model>& origModel,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpset = 11)
: VCLSerializerBase(origModel, compilerVersion, supportedOpset) {
const uint32_t supportedOpset = 11,
const bool computeModelHash = false,
const bool storeWeightlessCacheAttribute = false)
: VCLSerializerBase(origModel,
compilerVersion,
supportedOpset,
computeModelHash,
storeWeightlessCacheAttribute) {
_logger.setName("VCLSerializerWithoutWeightsCopy");
};

Expand All @@ -411,7 +396,7 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
std::shared_ptr<uint8_t> buffer(new uint8_t[_serializedModelSize], std::default_delete<uint8_t[]>());
serialize_model_to_buffer(buffer.get());

return SerializedIR(_serializedModelSize, buffer);
return {buffer, _serializedModelSize};
}

private:
Expand Down Expand Up @@ -455,19 +440,29 @@ class VCLSerializerWithoutWeightsCopy : public VCLSerializerBase {
SerializedIR serializeIR(const std::shared_ptr<const ov::Model>& model,
const ze_graph_compiler_version_info_t compilerVersion,
const uint32_t supportedOpsetVersion,
const bool useBaseModelSerializer) {
const bool useBaseModelSerializer,
const bool computeModelHash,
const bool storeWeightlessCacheAttribute) {
if (!useBaseModelSerializer) {
// Non-constness required for adding & removing weights pointer attributes. The current instance is already a
// clone (or should be one), we are not modifying the original model.
const std::shared_ptr<ov::Model> nonConstantModel = std::const_pointer_cast<ov::Model>(model);
storeWeightsPointerAttribute(nonConstantModel);

SerializedIR serializedIR =
VCLSerializerWithoutWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
removeWeightsPointerAttribute(nonConstantModel);
SerializedIR serializedIR = VCLSerializerWithoutWeightsCopy(model,
compilerVersion,
supportedOpsetVersion,
computeModelHash,
storeWeightlessCacheAttribute)
.serialize();
return serializedIR;
}
return VCLSerializerWithWeightsCopy(model, compilerVersion, supportedOpsetVersion).serialize();
return VCLSerializerWithWeightsCopy(model,
compilerVersion,
supportedOpsetVersion,
computeModelHash,
storeWeightlessCacheAttribute)
.serialize();
}

std::string serializeIOInfo(const std::shared_ptr<const ov::Model>& model, const bool useIndices) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,8 @@ std::unordered_set<std::string> ZeGraphExtWrappers::queryGraph(SerializedIR seri
ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NGRAPH_LITE,
serializedIR.first,
serializedIR.second.get(),
serializedIR.size,
serializedIR.buffer.get(),
buildFlags.c_str(),
ZE_GRAPH_FLAG_NONE};

Expand Down Expand Up @@ -335,23 +335,30 @@ bool ZeGraphExtWrappers::canCpuVaBeImported(const void* data, size_t size) const
return true;
}

// ze_graph_input_hash_t graphInputHash = {};
// graphInputHash.stype = ZE_STRUCTURE_TYPE_GRAPH_INPUT_HASH;
// graphInputHash.hash = hash;

GraphDescriptor ZeGraphExtWrappers::getGraphDescriptor(SerializedIR serializedIR,
const std::string& buildFlags,
const bool bypassUmdCache) const {
ze_graph_handle_t graphHandle = nullptr;

const uint64_t hash = 14;
ze_graph_input_hash_t modelHash = {ZE_STRUCTURE_TYPE_GRAPH_INPUT_HASH, nullptr, hash};

uint32_t flags = ZE_GRAPH_FLAG_NONE;
if (bypassUmdCache) {
_logger.debug("getGraphDescriptor - set ZE_GRAPH_FLAG_DISABLE_CACHING");
flags |= ZE_GRAPH_FLAG_DISABLE_CACHING;
}

ze_graph_desc_2_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
&modelHash,
ZE_GRAPH_FORMAT_NGRAPH_LITE,
serializedIR.first,
serializedIR.second.get(),
buildFlags.c_str(),
serializedIR.size,
serializedIR.buffer.get(),
"",
flags};

_logger.debug("getGraphDescriptor - perform pfnCreate2");
Expand All @@ -360,6 +367,7 @@ GraphDescriptor ZeGraphExtWrappers::getGraphDescriptor(SerializedIR serializedIR
&desc,
&graphHandle);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate2", result, _zeroInitStruct->getGraphDdiTable());
OPENVINO_THROW("");

return GraphDescriptor{graphHandle};
}
Expand Down