
Commit 81ae163

Initial commit
Signed-off-by: Bogdan Pereanu <bogdan.pereanu@intel.com>
1 parent 6c18118 commit 81ae163

18 files changed: +221 -76 lines changed


src/inference/include/openvino/runtime/intel_npu/properties.hpp

Lines changed: 12 additions & 0 deletions
@@ -110,6 +110,7 @@ static constexpr ov::Property<bool> turbo{"NPU_TURBO"};
  * @brief [Only for NPU Compiler]
  * Type: integer, default is -1
  * Sets the number of npu tiles to compile the model for.
+ * @ingroup ov_runtime_npu_prop_cpp_api
  */
 static constexpr ov::Property<int64_t> tiles{"NPU_TILES"};

@@ -118,6 +119,7 @@ static constexpr ov::Property<int64_t> tiles{"NPU_TILES"};
  * Type: integer, default is -1
  * Maximum number of tiles supported by the device we compile for. Can be set for offline compilation. If not set, it
  * will be populated by driver.
+ * @ingroup ov_runtime_npu_prop_cpp_api
  */
 static constexpr ov::Property<int64_t> max_tiles{"NPU_MAX_TILES"};

@@ -133,6 +135,7 @@ static constexpr ov::Property<bool> bypass_umd_caching{"NPU_BYPASS_UMD_CACHING"}
  * @brief [Only for NPU Plugin]
  * Type: boolean, default is false
  * This option allows to delay loading the weights until inference is created
+ * @ingroup ov_runtime_npu_prop_cpp_api
  */
 static constexpr ov::Property<bool> defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"};

@@ -145,5 +148,14 @@ static constexpr ov::Property<bool> defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}
  */
 static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};

+/**
+ * @brief [Only for NPU plugin]
+ * Type: boolean, default is false
+ * Enable ROI Tensor feature. The compiler shall be aware that ROI tensors will be used; they must be enabled at
+ * compilation time if needed later at runtime.
+ * @ingroup ov_runtime_npu_prop_cpp_api
+ */
+static constexpr ov::Property<bool> enable_roi_tensor{"NPU_ENABLE_ROI_TENSOR"};
+
 } // namespace intel_npu
 } // namespace ov
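
A minimal usage sketch of the new property (the model path, device string, and shapes below are placeholders, not taken from this commit): the flag has to be passed at compile time, after which a region-of-interest view of a larger host tensor can be bound at inference time.

// Usage sketch; hypothetical model and shapes.
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>

int main() {
    ov::Core core;

    // The compiler must know ahead of time that ROI (strided) tensors will be bound,
    // so the property is passed when the model is compiled.
    auto compiled = core.compile_model("model.xml", "NPU", ov::intel_npu::enable_roi_tensor(true));
    auto request = compiled.create_infer_request();

    // Bind a region of interest of a larger host tensor; the ROI view is strided, not contiguous.
    ov::Tensor full{ov::element::f32, {1, 3, 1080, 1920}};
    ov::Tensor roi{full, {0, 0, 0, 0}, {1, 3, 720, 1280}};
    request.set_tensor(compiled.input(), roi);
    request.infer();
    return 0;
}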

src/plugins/intel_npu/README.md

Lines changed: 1 addition & 0 deletions
@@ -205,6 +205,7 @@ The following properties are supported (may differ based on current system confi
 | `ov::intel_npu::bypass_umd_caching`/</br>`NPU_BYPASS_UMD_CACHING` | RW | Bypass the caching of compiled models in UMD. | `YES`/ `NO`| `NO` |
 | `ov::intel_npu::defer_weights_load`/</br>`NPU_DEFER_WEIGHTS_LOAD` | RW | Delay loading the weights until inference is created. | `YES`/ `NO`| `NO` |
 | `ov::intel_npu::run_inferences_sequentially`/</br>`NPU_RUN_INFERENCES_SEQUENTIALLY` | RW | Run inferences in async mode sequentially in the order in which they are started to optimize host scheduling. | `YES`/ `NO`| `NO` |
+| `ov::intel_npu::enable_roi_tensor`/</br>`NPU_ENABLE_ROI_TENSOR` | RW | Enable ROI Tensor feature. The compiler shall be aware that ROI tensors will be used; they must be enabled at compilation time if needed later at runtime. | `YES`/ `NO`| `NO` |
 <br>

 ### Compiled_model properties VS Plugin properties

src/plugins/intel_npu/src/al/include/intel_npu/config/options.hpp

Lines changed: 18 additions & 0 deletions
@@ -1426,4 +1426,22 @@ struct USE_BASE_MODEL_SERIALIZER final : OptionBase<USE_BASE_MODEL_SERIALIZER, b
     }
 };

+struct ENABLE_ROI_TENSOR final : OptionBase<ENABLE_ROI_TENSOR, bool> {
+    static std::string_view key() {
+        return ov::intel_npu::enable_roi_tensor.name();
+    }
+
+    static bool defaultValue() {
+        return false;
+    }
+
+    static bool isPublic() {
+        return true;
+    }
+
+    static OptionMode mode() {
+        return OptionMode::CompileTime;
+    }
+};
+
 } // namespace intel_npu
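
Following the `config.get<LOG_LEVEL>()` pattern visible elsewhere in this commit, compile-time code could read the new option as sketched below; the function and its call site are hypothetical, since the consumers of the option are not part of this diff.

// Hypothetical consumer of the new option (real call sites are not shown in this commit).
void apply_roi_tensor_flag(const Config& config) {
    if (config.get<ENABLE_ROI_TENSOR>()) {
        // Forward the flag to the compiler so strided (ROI) I/O is prepared at compile time.
    }
}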

src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp

Lines changed: 4 additions & 2 deletions
@@ -29,12 +29,14 @@ struct Pipeline final {
     void pull();
     void reset() const;

-    void update_graph_arguments(uint32_t arg_index, const void* arg_data, size_t byte_size);
-    void update_graph_arguments_batching(uint32_t arg_index, const void* arg_data, size_t batch_index);
+    void update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor);
+    void update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor, size_t command_list_index);

     std::vector<ov::ProfilingInfo> get_profiling_info() const;

 protected:
+    ov::Strides get_strides(const std::shared_ptr<ZeroTensor>& tensor);
+
     std::shared_ptr<ZeroInitStructsHolder> _init_structs;
     std::shared_ptr<IGraph> _graph;
     const Config _config;
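
For orientation (the call sites appear in zero_infer_request.cpp below), the two overloads split responsibilities: the two-argument form refreshes the argument in every command list, slicing the tensor evenly across them, while the three-argument form targets a single command list, as used by the batched set_tensors() path. Hypothetical call shapes, with made-up variable names:

// Mirrors the call sites later in this commit; variable names are illustrative only.
pipeline->update_graph_arguments(desc.idx, zero_tensor);          // all command lists, sliced evenly
pipeline->update_graph_arguments(desc.idx, zero_tensor, batch_i); // only command list batch_i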

src/plugins/intel_npu/src/backend/include/zero_tensor.hpp

Lines changed: 3 additions & 4 deletions
@@ -73,24 +73,23 @@ class ZeroTensor final : public ov::ITensor {

 private:
     void update_strides() const;
-    size_t get_capacity() const;
     size_t get_bytes_capacity() const;

     std::shared_ptr<ZeroInitStructsHolder> _init_structs;
     Logger _logger;

+    ov::SoPtr<ov::ITensor> _user_tensor;
+
     ov::element::Type _element_type;
     ov::Shape _shape;
-    ov::Shape _capacity;
+    size_t _bytes_capacity;
     mutable ov::Strides _strides;
     mutable std::once_flag _strides_once;
     void* _ptr = nullptr;
     bool _reset_tensor_memory = false;
     bool _is_input = false;
     bool _can_be_reused = false;

-    ov::SoPtr<ov::ITensor> _user_tensor;
-
     std::shared_ptr<ZeroMem> _mem_ref;
 };

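
One reason for the member reorder that may not be obvious from the diff alone: C++ initializes non-static data members in declaration order, and the second ZeroTensor constructor (see zero_tensor.cpp below) now initializes _element_type, _shape, and _strides from _user_tensor, so _user_tensor must be declared before them. A generic illustration of the rule, with made-up names:

#include <memory>

struct Example {
    std::shared_ptr<int> source;  // declared first, so it is initialized first
    int derived;                  // safe: source is already constructed when this initializer runs

    explicit Example(std::shared_ptr<int> s) : source(std::move(s)), derived(*source) {}
};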

src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp

Lines changed: 9 additions & 16 deletions
@@ -424,8 +424,7 @@ void ZeroInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const
             _pipeline->update_graph_arguments(foundPort.is_input()
                                                   ? _graph->get_input_descriptors().at(foundPort.idx).idx
                                                   : _graph->get_output_descriptors().at(foundPort.idx).idx,
-                                              levelZeroTensor->data(),
-                                              levelZeroTensor->get_byte_size());
+                                              levelZeroTensor);
         }
     }
     // If command list updates are not supported, fallback to copying tensors every time.
@@ -501,9 +500,9 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
             OPENVINO_ASSERT(get_level_zero_input(foundPort.idx, i)->data(), "Empty buffer");
             OV_ITT_TASK_NEXT(ZERO_SET_TENSORS, "updateCommandList");

-            _pipeline->update_graph_arguments_batching(_graph->get_input_descriptors().at(foundPort.idx).idx,
-                                                       get_level_zero_input(foundPort.idx, i)->data(),
-                                                       i);
+            _pipeline->update_graph_arguments(_graph->get_input_descriptors().at(foundPort.idx).idx,
+                                              get_level_zero_input(foundPort.idx, i),
+                                              i);
         }
     }
 }
@@ -625,8 +624,7 @@ void ZeroInferRequest::update_pipeline_if_memory_changed() {
             OPENVINO_ASSERT(levelZeroTensor.at(SINGLE_TENSOR)->data(), "Empty buffer");

             _pipeline->update_graph_arguments(_graph->get_input_descriptors().at(ioIndex).idx,
-                                              levelZeroTensor.at(SINGLE_TENSOR)->data(),
-                                              levelZeroTensor.at(SINGLE_TENSOR)->get_byte_size());
+                                              levelZeroTensor.at(SINGLE_TENSOR));

             if (!inputDescriptor.isStateInput) {
                 levelZeroTensor.at(SINGLE_TENSOR)->reset_memory_flag();
@@ -656,9 +654,7 @@ void ZeroInferRequest::update_pipeline_if_memory_changed() {
             _logger.debug("Update output graph descriptor with the new tensor");
             OPENVINO_ASSERT(levelZeroTensor->data(), "Empty buffer");

-            _pipeline->update_graph_arguments(_graph->get_output_descriptors().at(ioIndex).idx,
-                                              levelZeroTensor->data(),
-                                              levelZeroTensor->get_byte_size());
+            _pipeline->update_graph_arguments(_graph->get_output_descriptors().at(ioIndex).idx, levelZeroTensor);

             levelZeroTensor->reset_memory_flag();
         }
@@ -688,13 +684,10 @@ void ZeroInferRequest::update_states_if_memory_changed() {
                 zeroState->clear_zero_state_update_pending();

                 _pipeline->update_graph_arguments(_graphInputDescriptors.at(zeroState->get_tensor_index()).idx,
-                                                  get_level_zero_input(zeroState->get_tensor_index())->data(),
-                                                  get_level_zero_input(zeroState->get_tensor_index())->get_byte_size());
+                                                  get_level_zero_input(zeroState->get_tensor_index()));

-                _pipeline->update_graph_arguments(
-                    _graphOutputDescriptors.at(zeroState->get_related_tensor_index()).idx,
-                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->data(),
-                    _levelZeroOutputTensors.at(zeroState->get_related_tensor_index())->get_byte_size());
+                _pipeline->update_graph_arguments(_graphOutputDescriptors.at(zeroState->get_related_tensor_index()).idx,
+                                                  _levelZeroOutputTensors.at(zeroState->get_related_tensor_index()));
             }
         }
     }

src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp

Lines changed: 28 additions & 10 deletions
@@ -96,26 +96,29 @@ Pipeline::Pipeline(const Config& config,
             if (input_tensors.at(io_index).size() > 1) {
                 _logger.debug("Pipeline - set args for input index: %zu", io_index);

-                graph->set_argument_value(desc.idx, input_tensors.at(io_index).at(i)->data());
+                const auto& tensor = input_tensors.at(io_index).at(i);
+                graph->set_argument_value(desc.idx, tensor->data(), get_strides(tensor));

                 ++io_index;
                 continue;
             }

+            const auto& tensor = input_tensors.at(io_index).at(0);
             graph->set_argument_value(
                 desc.idx,
-                static_cast<unsigned char*>(input_tensors.at(io_index).at(0)->data()) +
-                    (i * input_tensors.at(io_index).at(0)->get_byte_size()) / _number_of_command_lists);
+                static_cast<unsigned char*>(tensor->data()) + (i * tensor->get_byte_size()) / _number_of_command_lists,
+                get_strides(tensor));

             ++io_index;
         }

         io_index = 0;
         for (const auto& desc : graph->get_output_descriptors()) {
+            const auto& tensor = output_tensors.at(io_index);
             graph->set_argument_value(
                 desc.idx,
-                static_cast<unsigned char*>(output_tensors.at(io_index)->data()) +
-                    (i * output_tensors.at(io_index)->get_byte_size()) / _number_of_command_lists);
+                static_cast<unsigned char*>(tensor->data()) + (i * tensor->get_byte_size()) / _number_of_command_lists,
+                get_strides(tensor));
             ++io_index;
         }

@@ -223,20 +226,23 @@ void Pipeline::reset() const {
     _logger.debug("Pipeline - rest() completed");
 };

-void Pipeline::update_graph_arguments(uint32_t arg_index, const void* arg_data, size_t byte_size) {
+void Pipeline::update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor) {
     OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList");
     _logger.debug("Pipeline - updateCommandList");

     const size_t number_of_command_lists = _command_lists.size();

     for (size_t i = 0; i < number_of_command_lists; i++) {
         _command_lists.at(i)->updateMutableCommandList(
-            arg_index,
-            static_cast<const unsigned char*>(arg_data) + (i * byte_size) / number_of_command_lists);
+            index,
+            static_cast<const unsigned char*>(tensor->data()) + (i * tensor->get_byte_size()) / number_of_command_lists,
+            get_strides(tensor));
     }
 };

-void Pipeline::update_graph_arguments_batching(uint32_t arg_index, const void* arg_data, size_t command_list_index) {
+void Pipeline::update_graph_arguments(uint32_t index,
+                                      const std::shared_ptr<ZeroTensor>& tensor,
+                                      size_t command_list_index) {
     OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandListIndex");
     _logger.debug("Pipeline - updateCommandListIndex");

@@ -246,7 +252,7 @@ void Pipeline::update_graph_arguments_batching(uint32_t arg_index, const void* a
                     "Command list index is higher than the number of Command lists ",
                     command_list_index);

-    _command_lists.at(command_list_index)->updateMutableCommandList(arg_index, arg_data);
+    _command_lists.at(command_list_index)->updateMutableCommandList(index, tensor->data(), get_strides(tensor));
 };

 std::vector<ov::ProfilingInfo> Pipeline::get_profiling_info() const {
@@ -272,4 +278,16 @@ std::vector<ov::ProfilingInfo> Pipeline::get_profiling_info() const {
     }
 }

+ov::Strides Pipeline::get_strides(const std::shared_ptr<ZeroTensor>& tensor) {
+    if (tensor->get_element_type().bitwidth() < 8 || tensor->is_continuous()) {
+        return ov::Strides{};
+    }
+
+    if (!_graph->is_strided_tensor_supported()) {
+        OPENVINO_THROW("Strides are not supported by the current driver version.");
+    }
+
+    return tensor->get_strides();
+};
+
 } // namespace intel_npu
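
A worked example of what get_strides() returns for a non-contiguous input (the shapes and f32 element type are assumed for illustration): a {1, 3, 720, 1280} ROI view into a {1, 3, 1080, 1920} parent tensor keeps the parent's byte strides, so is_continuous() is false and the strides are forwarded to the command list, provided the graph reports strided-tensor support.

// Assumed ROI view: shape {1, 3, 720, 1280} inside a parent of shape {1, 3, 1080, 1920}, f32 (4 bytes).
// Byte strides follow the parent layout, not the ROI shape:
//   strides = {3 * 1080 * 1920 * 4, 1080 * 1920 * 4, 1920 * 4, 4}
//           = {24883200, 8294400, 7680, 4}
// A contiguous tensor of the ROI shape would instead have strides {11059200, 3686400, 5120, 4},
// which is why such a view fails is_continuous() and takes the strided path.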

src/plugins/intel_npu/src/backend/src/zero_tensor.cpp

Lines changed: 32 additions & 22 deletions
@@ -36,14 +36,15 @@ ZeroTensor::ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_struct
       _logger("ZeroTensor", config.get<LOG_LEVEL>()),
       _element_type{element_type},
       _shape{shape},
-      _capacity{_shape},
       _strides{},
       _strides_once{},
       _is_input(is_input) {
     OPENVINO_ASSERT(_element_type.is_static());
     const auto byte_size = ov::util::get_memory_size_safe(element_type, _shape);
     OPENVINO_ASSERT(byte_size, "Cannot allocate memory for type: ", element_type, " and shape: ", _shape);

+    _bytes_capacity = get_bytes_capacity();
+
     _mem_ref = ZeroMemPool::get_instance().allocate_zero_memory(_init_structs,
                                                                 byte_size.value(),
                                                                 utils::STANDARD_PAGE_SIZE,
@@ -59,14 +60,15 @@ ZeroTensor::ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_struct
                        const ov::SoPtr<ov::ITensor>& user_tensor)
     : _init_structs(init_structs),
       _logger("ZeroTensor", config.get<LOG_LEVEL>()),
-      _element_type{user_tensor->get_element_type()},
-      _shape{user_tensor->get_shape()},
-      _capacity{_shape},
-      _strides{_element_type.bitwidth() >= 8 ? user_tensor->get_strides() : ov::Strides{}},
-      _strides_once{},
-      _user_tensor(user_tensor) {
+      _user_tensor(user_tensor),
+      _element_type{_user_tensor->get_element_type()},
+      _shape{_user_tensor->get_shape()},
+      _strides{_element_type.bitwidth() >= 8 ? _user_tensor->get_strides() : ov::Strides{}},
+      _strides_once{} {
     OPENVINO_ASSERT(_element_type.is_static());

+    _bytes_capacity = get_bytes_capacity();
+
     // Data pointer of the given user_tensor must be a valid address in the level zero context
     // Check first if the given tensor is a ZeroRemoteTensor (which has a different method to expose the internal
     // storage)
@@ -81,9 +83,7 @@ ZeroTensor::ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_struct
     // _mem_ref will keep a reference to that allocation. Otherwise the function will try to import it into the level
     // zero context.
     _logger.debug("ZeroTensor::ZeroTensor - get tensor from pool or import it");
-    _mem_ref = ZeroMemPool::get_instance().import_standard_allocation_memory(_init_structs,
-                                                                             _ptr,
-                                                                             _user_tensor->get_byte_size());
+    _mem_ref = ZeroMemPool::get_instance().import_standard_allocation_memory(_init_structs, _ptr, _bytes_capacity);
 }

 // Note: Override data() members to not used OpenVINO library code to improve performance
@@ -138,20 +138,28 @@ void ZeroTensor::update_strides() const {
     }
 }

+size_t ZeroTensor::get_bytes_capacity() const {
+    size_t original_shape_size = shape_size(_shape);
+
+    if (_user_tensor == nullptr) {
+        return ov::util::get_memory_size(_element_type, original_shape_size);
+    }
+
+    if (_element_type.bitwidth() < 8 || original_shape_size == 0 || _shape.empty() || _strides.empty()) {
+        return ov::util::get_memory_size(_element_type, original_shape_size);
+    }
+
+    return _strides[0] * _shape[0];
+}
+
 const ov::Strides& ZeroTensor::get_strides() const {
     OPENVINO_ASSERT(_element_type.bitwidth() >= 8,
                     "Could not get strides for types with bitwidths less than 8 bit. Tensor type: ",
                     _element_type);
-    std::call_once(_strides_once, &ZeroTensor::update_strides, this);
-    return _strides;
-}

-size_t ZeroTensor::get_capacity() const {
-    return shape_size(_capacity);
-}
+    std::call_once(_strides_once, &ZeroTensor::update_strides, this);

-size_t ZeroTensor::get_bytes_capacity() const {
-    return ov::util::get_memory_size(get_element_type(), get_capacity());
+    return _strides;
 }

 void ZeroTensor::set_shape(ov::Shape new_shape) {
@@ -161,7 +169,7 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {

     _shape = std::move(new_shape);

-    if (get_size() > get_capacity()) {
+    if (get_byte_size() > _bytes_capacity) {
         OPENVINO_ASSERT(_init_structs->getMutableCommandListExtVersion() >= ZE_MAKE_VERSION(1, 0),
                         "Re-shaping the tensor with a larger shape is not available using this driver version. "
                         "Please update the driver to the latest version.");
@@ -173,13 +181,15 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {
         _ptr = nullptr;

         // allocate buffer and initialize objects from scratch
-        _capacity = _shape;
+        const auto byte_size = ov::util::get_memory_size_safe(_element_type, _shape);
+        OPENVINO_ASSERT(byte_size, "Cannot allocate memory for type: ", _element_type, " and shape: ", _shape);
         _mem_ref = ZeroMemPool::get_instance().allocate_zero_memory(_init_structs,
-                                                                    get_bytes_capacity(),
+                                                                    byte_size.value(),
                                                                     utils::STANDARD_PAGE_SIZE,
                                                                     _is_input);
         _ptr = _mem_ref->data();
-        OPENVINO_ASSERT(get_bytes_capacity() == 0 || _ptr != nullptr, "Failed to allocate zero memory");
+        OPENVINO_ASSERT(byte_size.value() == 0 || _ptr != nullptr, "Failed to allocate zero memory");
+        _bytes_capacity = get_byte_size();

         _reset_tensor_memory = true;
     }
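
A worked example of the new `_strides[0] * _shape[0]` capacity for a strided user tensor (same assumed ROI view as in the zero_pipeline.cpp example above): it measures the extent of the parent buffer spanned per step of the outermost dimension, which is what has to be imported into the Level Zero context, rather than the packed size of the ROI shape alone.

// Assumed ROI view: shape {1, 3, 720, 1280}, parent {1, 3, 1080, 1920}, f32 (4 bytes).
//   _strides[0] * _shape[0] = 24883200 * 1 = 24883200 bytes
// versus the packed ROI size ov::util::get_memory_size(f32, 1 * 3 * 720 * 1280) = 11059200 bytes.
// Importing only the packed size would leave parts of the strided view outside the mapped range.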

src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp

Lines changed: 3 additions & 1 deletion
@@ -34,7 +34,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
     virtual std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
                                                                     const Config& config) const = 0;

-    virtual void set_argument_value(uint32_t argi, const void* argv) const = 0;
+    virtual void set_argument_value(uint32_t id, const void* data, const std::vector<size_t>& strides = {}) const = 0;

     virtual void initialize(const Config& config) = 0;

@@ -67,6 +67,8 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
     virtual void set_last_submitted_id(uint32_t id_index) = 0;
     virtual uint32_t get_last_submitted_id() const = 0;

+    virtual bool is_strided_tensor_supported() const = 0;
+
 protected:
     // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the
     // first inference starts running
