Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] Use Level Zero init structure / Follow same code design #27453

Merged
merged 3 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ class ZeroDevice : public IDevice {
private:
const std::shared_ptr<ZeroInitStructsHolder> _initStructs;

ze_graph_dditable_ext_curr_t& _graph_ddi_table_ext;

ze_device_properties_t device_properties = {};

ze_pci_ext_properties_t pci_properties = {};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ namespace intel_npu {

class ZeroHostTensor : public ov::ITensor {
public:
ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
std::shared_ptr<ZeroInitStructsHolder> init_structs,
ZeroHostTensor(const std::shared_ptr<ov::IRemoteContext>& context,
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const Config& config);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,17 @@ class ZeroInferRequest final : public SyncInferRequest {
* @param index The index corresponding to the position of the tensor inside the I/O structures.
* @param isInput Used for identifying the structures to which the tensor belongs.
*/
void set_tensor_data(const std::shared_ptr<ov::ITensor> tensor, const size_t index, const bool isInput);
void set_tensor_data(const std::shared_ptr<ov::ITensor>& tensor, const size_t index, const bool isInput);

/**
* @brief Check the received remote tensor and copy it to the Level Zero tensor
* @param tensor Reference to a tensor.
* @param index The index corresponding to the position of the tensor inside the I/O structures.
* @param isInput Used for identifying the structures to which the tensor belongs.
*/
void set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTensor> tensor, const size_t index, const bool isInput);
void set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTensor>& tensor,
const size_t index,
const bool isInput);

void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Pipeline {
const std::shared_ptr<IGraph>& graph,
zeroProfiling::ProfilingPool& profiling_pool,
zeroProfiling::ProfilingQuery& profiling_query,
std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
const std::vector<std::optional<TensorData>>& outputTensorsData,
size_t numberOfCommandLists,
Expand Down
37 changes: 18 additions & 19 deletions src/plugins/intel_npu/src/backend/include/zero_profiling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
#include <climits>
#include <map>

#include "intel_npu/common/igraph.hpp"
#include "intel_npu/config/compiler.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "intel_npu/utils/zero/zero_types.hpp"
#include "openvino/runtime/profiling_info.hpp"

Expand All @@ -23,31 +25,29 @@ using LayerStatistics = std::vector<ov::ProfilingInfo>;
constexpr uint32_t POOL_SIZE = 1;

struct ProfilingPool {
ProfilingPool(ze_graph_handle_t graph_handle,
uint32_t profiling_count,
ze_graph_profiling_dditable_ext_curr_t& graph_profiling_ddi_table_ext)
: _graph_handle(graph_handle),
_profiling_count(profiling_count),
_graph_profiling_ddi_table_ext(graph_profiling_ddi_table_ext) {}
ProfilingPool(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const std::shared_ptr<IGraph>& graph,
uint32_t profiling_count)
: _init_structs(init_structs),
_graph(graph),
_profiling_count(profiling_count) {}
ProfilingPool(const ProfilingPool&) = delete;
ProfilingPool& operator=(const ProfilingPool&) = delete;
bool create();

~ProfilingPool();

ze_graph_handle_t _graph_handle;
std::shared_ptr<ZeroInitStructsHolder> _init_structs;
std::shared_ptr<IGraph> _graph;
const uint32_t _profiling_count;

ze_graph_profiling_pool_handle_t _handle = nullptr;
ze_graph_profiling_dditable_ext_curr_t& _graph_profiling_ddi_table_ext;
};

struct ProfilingQuery {
ProfilingQuery(uint32_t index,
ze_device_handle_t device_handle,
ze_graph_profiling_dditable_ext_curr_t& graph_profiling_ddi_table_ext)
: _index(index),
_device_handle(device_handle),
_graph_profiling_ddi_table_ext(graph_profiling_ddi_table_ext) {}
ProfilingQuery(const std::shared_ptr<ZeroInitStructsHolder>& init_structs, uint32_t index)
: _init_structs(init_structs),
_index(index) {}
ProfilingQuery(const ProfilingQuery&) = delete;
ProfilingQuery& operator=(const ProfilingQuery&) = delete;
void create(const ze_graph_profiling_pool_handle_t& profiling_pool);
Expand All @@ -64,18 +64,18 @@ struct ProfilingQuery {
void getProfilingProperties(ze_device_profiling_data_properties_t* properties) const;
void verifyProfilingProperties() const;

std::shared_ptr<ZeroInitStructsHolder> _init_structs;
const uint32_t _index;
ze_device_handle_t _device_handle;

ze_graph_profiling_query_handle_t _handle = nullptr;
ze_graph_profiling_dditable_ext_curr_t& _graph_profiling_ddi_table_ext;
};

extern template std::vector<uint8_t> ProfilingQuery::getData<uint8_t>() const;

using NpuInferStatistics = std::vector<ov::ProfilingInfo>;

struct NpuInferProfiling final {
explicit NpuInferProfiling(ze_context_handle_t context, ze_device_handle_t device_handle, ov::log::Level loglevel);
explicit NpuInferProfiling(const std::shared_ptr<ZeroInitStructsHolder>& init_structs, ov::log::Level loglevel);
NpuInferProfiling(const NpuInferProfiling&) = delete;
NpuInferProfiling& operator=(const NpuInferProfiling&) = delete;
NpuInferProfiling(NpuInferProfiling&&) = delete;
Expand All @@ -91,8 +91,7 @@ struct NpuInferProfiling final {
void* npu_ts_infer_end = 0;

private:
ze_context_handle_t _context = nullptr;
ze_device_handle_t _device_handle;
std::shared_ptr<ZeroInitStructsHolder> _init_structs;
ov::log::Level _loglevel;
Logger _logger;
ze_device_properties_t _dev_properties = {};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ namespace intel_npu {

class ZeroRemoteTensor : public RemoteTensor {
public:
ZeroRemoteTensor(std::shared_ptr<ov::IRemoteContext> context,
std::shared_ptr<ZeroInitStructsHolder> init_structs,
ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& context,
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type& element_type,
const ov::Shape& shape,
const Config& config,
Expand Down
15 changes: 8 additions & 7 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ using namespace intel_npu;

ZeroDevice::ZeroDevice(const std::shared_ptr<ZeroInitStructsHolder>& initStructs)
: _initStructs(initStructs),
_graph_ddi_table_ext(_initStructs->getGraphDdiTable()),
log("ZeroDevice", Logger::global().level()) {
log.debug("ZeroDevice::ZeroDevice init");
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
Expand Down Expand Up @@ -121,9 +120,10 @@ uint32_t ZeroDevice::getMaxNumSlices() const {

uint64_t ZeroDevice::getAllocMemSize() const {
ze_graph_memory_query_t query{};
ze_result_t result =
_graph_ddi_table_ext.pfnQueryContextMemory(_initStructs->getContext(), ZE_GRAPH_QUERY_MEMORY_DDR, &query);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _graph_ddi_table_ext);
ze_result_t result = _initStructs->getGraphDdiTable().pfnQueryContextMemory(_initStructs->getContext(),
ZE_GRAPH_QUERY_MEMORY_DDR,
&query);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _initStructs->getGraphDdiTable());

return query.allocated;
}
Expand All @@ -132,9 +132,10 @@ uint64_t ZeroDevice::getTotalMemSize() const {
#define LEGACY_MAX_MEM_ALLOC_SIZE_BYTES (2147483648) // 2GB in base-2

ze_graph_memory_query_t query{};
ze_result_t result =
_graph_ddi_table_ext.pfnQueryContextMemory(_initStructs->getContext(), ZE_GRAPH_QUERY_MEMORY_DDR, &query);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _graph_ddi_table_ext);
ze_result_t result = _initStructs->getGraphDdiTable().pfnQueryContextMemory(_initStructs->getContext(),
ZE_GRAPH_QUERY_MEMORY_DDR,
&query);
THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _initStructs->getGraphDdiTable());

// For drivers with graph_extension < 1.9 we report fixed 2GB max allocation size (old drivers don't support more)
// For drivers with graph_extension > 1.9 we report the value they return
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

namespace intel_npu {

ZeroHostTensor::ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
std::shared_ptr<ZeroInitStructsHolder> init_structs,
ZeroHostTensor::ZeroHostTensor(const std::shared_ptr<ov::IRemoteContext>& context,
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const Config& config)
Expand Down
14 changes: 5 additions & 9 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,20 +167,16 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
_levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
_inputTensorsData(_metadata.inputs.size(), std::vector<std::optional<TensorData>>(1, std::nullopt)),
_outputTensorsData(_metadata.outputs.size(), std::nullopt),
_profilingPool(static_cast<ze_graph_handle_t>(_graph->get_handle()),
zeroProfiling::POOL_SIZE,
_initStructs->getProfilingDdiTable()),
_profilingQuery(0, _initStructs->getDevice(), _initStructs->getProfilingDdiTable()) {
_profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
_profilingQuery(_initStructs, 0) {
_logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
const std::vector<ArgumentDescriptor>& executorInputDescriptors = _graph->get_input_descriptors();
const std::vector<ArgumentDescriptor>& executorOutputDescriptors = _graph->get_output_descriptors();

auto proftype = config.get<PROFILING_TYPE>();
if (proftype == ov::intel_npu::ProfilingType::INFER) {
_logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER");
_npuProfiling = std::make_shared<zeroProfiling::NpuInferProfiling>(_initStructs->getContext(),
_initStructs->getDevice(),
_config.get<LOG_LEVEL>());
_npuProfiling = std::make_shared<zeroProfiling::NpuInferProfiling>(_initStructs, _config.get<LOG_LEVEL>());
}

_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
Expand Down Expand Up @@ -296,7 +292,7 @@ void ZeroInferRequest::create_pipeline() {
_logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
}

void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor> tensor,
void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tensor,
const size_t index,
const bool isInput) {
OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
Expand Down Expand Up @@ -347,7 +343,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor> tensor
}
}

void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTensor> tensor,
void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTensor>& tensor,
const size_t index,
const bool isInput) {
OV_ITT_TASK_CHAIN(ZERO_SET_REMOTE_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_remote_tensor_data");
Expand Down
8 changes: 3 additions & 5 deletions src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Pipeline::Pipeline(const Config& config,
const std::shared_ptr<IGraph>& graph,
zeroProfiling::ProfilingPool& profiling_pool,
zeroProfiling::ProfilingQuery& profiling_query,
std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
const std::vector<std::optional<TensorData>>& outputTensorsData,
size_t numberOfCommandLists,
Expand All @@ -30,7 +30,7 @@ Pipeline::Pipeline(const Config& config,
_event_pool{initStructs->getDevice(),
initStructs->getContext(),
numberOfCommandLists ? static_cast<uint32_t>(numberOfCommandLists) : 1},
_npu_profiling(std::move(npu_profiling)),
_npu_profiling(npu_profiling),
_logger("Pipeline", _config.get<LOG_LEVEL>()) {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline");
_logger.debug("Pipeline - initialize started");
Expand All @@ -45,9 +45,7 @@ Pipeline::Pipeline(const Config& config,
_logger.debug("Pipeline - emplace_back _event_pool and _command_queue");
for (size_t i = 0; i < numberOfCommandLists; i++) {
_command_lists.emplace_back(
std::make_unique<CommandList>(initStructs->getDevice(),
initStructs->getContext(),
initStructs->getGraphDdiTable(),
std::make_unique<CommandList>(initStructs,
group_ordinal,
initStructs->getMutableCommandListVersion() ? true : false));
_events.emplace_back(std::make_unique<Event>(_event_pool.handle(), static_cast<uint32_t>(i)));
Expand Down
31 changes: 16 additions & 15 deletions src/plugins/intel_npu/src/backend/src/zero_profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,21 @@ struct ZeProfilingTypeId<uint8_t> {
};

bool ProfilingPool::create() {
auto ret = _graph_profiling_ddi_table_ext.pfnProfilingPoolCreate(_graph_handle, _profiling_count, &_handle);
auto ret =
_init_structs->getProfilingDdiTable().pfnProfilingPoolCreate(_graph->get_handle(), _profiling_count, &_handle);
return ((ZE_RESULT_SUCCESS == ret) && (_handle != nullptr));
}

ProfilingPool::~ProfilingPool() {
if (_handle) {
_graph_profiling_ddi_table_ext.pfnProfilingPoolDestroy(_handle);
_init_structs->getProfilingDdiTable().pfnProfilingPoolDestroy(_handle);
}
}

void ProfilingQuery::create(const ze_graph_profiling_pool_handle_t& profiling_pool) {
THROW_ON_FAIL_FOR_LEVELZERO(
"pfnProfilingQueryCreate",
_graph_profiling_ddi_table_ext.pfnProfilingQueryCreate(profiling_pool, _index, &_handle));
_init_structs->getProfilingDdiTable().pfnProfilingQueryCreate(profiling_pool, _index, &_handle));
}

LayerStatistics ProfilingQuery::getLayerStatistics() const {
Expand All @@ -59,7 +60,7 @@ LayerStatistics ProfilingQuery::getLayerStatistics() const {

ProfilingQuery::~ProfilingQuery() {
if (_handle) {
_graph_profiling_ddi_table_ext.pfnProfilingQueryDestroy(_handle);
_init_structs->getProfilingDdiTable().pfnProfilingQueryDestroy(_handle);
}
}

Expand All @@ -69,7 +70,7 @@ void ProfilingQuery::queryGetData(const ze_graph_profiling_type_t profilingType,
if (_handle && pSize) {
THROW_ON_FAIL_FOR_LEVELZERO(
"pfnProfilingQueryGetData",
_graph_profiling_ddi_table_ext.pfnProfilingQueryGetData(_handle, profilingType, pSize, pData));
_init_structs->getProfilingDdiTable().pfnProfilingQueryGetData(_handle, profilingType, pSize, pData));
}
}

Expand All @@ -95,7 +96,8 @@ void ProfilingQuery::getProfilingProperties(ze_device_profiling_data_properties_
if (_handle && properties) {
THROW_ON_FAIL_FOR_LEVELZERO(
"getProfilingProperties",
_graph_profiling_ddi_table_ext.pfnDeviceGetProfilingDataProperties(_device_handle, properties));
_init_structs->getProfilingDdiTable().pfnDeviceGetProfilingDataProperties(_init_structs->getDevice(),
properties));
}
}

Expand Down Expand Up @@ -179,30 +181,29 @@ NpuInferStatistics NpuInferProfiling::getNpuInferStatistics() const {
return npuPerfCounts;
}

NpuInferProfiling::NpuInferProfiling(ze_context_handle_t context,
ze_device_handle_t device_handle,
NpuInferProfiling::NpuInferProfiling(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
ov::log::Level loglevel)
: _context(context),
_device_handle(device_handle),
: _init_structs(init_structs),
_loglevel(loglevel),
_logger("InferProfiling", loglevel) {
/// Fetch and store the device timer resolution
_dev_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2;
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", zeDeviceGetProperties(_device_handle, &_dev_properties));
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
zeDeviceGetProperties(_init_structs->getDevice(), &_dev_properties));
/// Request mem allocations
ze_host_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
nullptr,
ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED};
THROW_ON_FAIL_FOR_LEVELZERO(
"zeMemAllocHost",
zeMemAllocHost(_context,
zeMemAllocHost(_init_structs->getContext(),
&desc,
sizeof(uint64_t),
64,
&npu_ts_infer_start)); // align to 64 bytes to match npu l2 cache line size
THROW_ON_FAIL_FOR_LEVELZERO(
"zeMemAllocHost",
zeMemAllocHost(_context,
zeMemAllocHost(_init_structs->getContext(),
&desc,
sizeof(uint64_t),
64,
Expand Down Expand Up @@ -235,13 +236,13 @@ int64_t NpuInferProfiling::convertCCtoUS(int64_t val_cc) const {
NpuInferProfiling::~NpuInferProfiling() {
/// deallocate npu_ts_infer_start and npu_ts_infer_end, allocated externally by ze driver
if (npu_ts_infer_start != nullptr) {
auto ze_ret = zeMemFree(_context, npu_ts_infer_start);
auto ze_ret = zeMemFree(_init_structs->getContext(), npu_ts_infer_start);
if (ZE_RESULT_SUCCESS != ze_ret) {
_logger.error("zeMemFree on npu_ts_infer_start failed %#X", uint64_t(ze_ret));
}
}
if (npu_ts_infer_end != nullptr) {
auto ze_ret = zeMemFree(_context, npu_ts_infer_end);
auto ze_ret = zeMemFree(_init_structs->getContext(), npu_ts_infer_end);
if (ZE_RESULT_SUCCESS != ze_ret) {
_logger.error("zeMemFree on npu_ts_infer_end failed %#X", uint64_t(ze_ret));
}
Expand Down
Loading
Loading