diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt
index a19f768a95..c0a6300f75 100644
--- a/source/adapters/level_zero/CMakeLists.txt
+++ b/source/adapters/level_zero/CMakeLists.txt
@@ -117,6 +117,7 @@ add_ur_adapter(${TARGET_NAME}
         ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_factory.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/v2/latency_tracker.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index dc70a2470c..f6317e891f 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -43,6 +43,9 @@ struct ur_context_handle_t_ : _ur_object {
 
   ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {}
 
+  // Make sure this is virtual so that v2::context is appropriately destroyed
+  virtual ~ur_context_handle_t_() {}
+
   // A L0 context handle is primarily used during creation and management of
   // resources that may be used by multiple devices.
   // This field is only set at ur_context_handle_t creation time, and cannot
diff --git a/source/adapters/level_zero/v2/command_list_cache.cpp b/source/adapters/level_zero/v2/command_list_cache.cpp
index c240cc8ee7..ff056f5413 100644
--- a/source/adapters/level_zero/v2/command_list_cache.cpp
+++ b/source/adapters/level_zero/v2/command_list_cache.cpp
@@ -41,7 +41,10 @@ inline size_t command_list_descriptor_hash_t::operator()(
 }
 
 command_list_cache_t::command_list_cache_t(ze_context_handle_t ZeContext)
-    : ZeContext{ZeContext} {}
+    : ZeContext{ZeContext},
+      immediateGetLatencyTracker(
+          "command_list_cache_t::getImmediateCommandList"),
+      regularGetLatencyTracker("command_list_cache_t::getRegularCommandList") {}
 
 raii::ze_command_list_t
 command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) {
@@ -81,6 +84,8 @@ command_list_cache_t::getImmediateCommandList(
     ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal,
     ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority,
     std::optional<uint32_t> Index) {
+  rolling_latency_tracker tracker(immediateGetLatencyTracker);
+
   immediate_command_list_descriptor_t Desc;
   Desc.ZeDevice = ZeDevice;
   Desc.Ordinal = Ordinal;
@@ -100,6 +105,8 @@ command_list_cache_t::getImmediateCommandList(
 raii::cache_borrowed_command_list_t
 command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice,
                                             bool IsInOrder, uint32_t Ordinal) {
+  rolling_latency_tracker tracker(regularGetLatencyTracker);
+
   regular_command_list_descriptor_t Desc;
   Desc.ZeDevice = ZeDevice;
   Desc.IsInOrder = IsInOrder;
diff --git a/source/adapters/level_zero/v2/command_list_cache.hpp b/source/adapters/level_zero/v2/command_list_cache.hpp
index 7cacf40604..07459a64ed 100644
--- a/source/adapters/level_zero/v2/command_list_cache.hpp
+++ b/source/adapters/level_zero/v2/command_list_cache.hpp
@@ -19,4 +19,5 @@
 #include "../common.hpp"
+#include "latency_tracker.hpp"
 
 namespace v2 {
 namespace raii {
@@ -82,4 +83,7 @@ struct command_list_cache_t {
   raii::ze_command_list_t
   createCommandList(const command_list_descriptor_t &desc);
+
+  rolling_stats immediateGetLatencyTracker;
+  rolling_stats regularGetLatencyTracker;
 };
 } // namespace v2
diff --git a/source/adapters/level_zero/v2/latency_tracker.hpp b/source/adapters/level_zero/v2/latency_tracker.hpp
new file mode 100644
--- /dev/null
+++ b/source/adapters/level_zero/v2/latency_tracker.hpp
@@ -0,0 +1,109 @@
+//===--------- latency_tracker.hpp - common ------------------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+
+#include "logger/ur_logger.hpp"
+
+namespace v2 {
+
+// True when UR_ENABLE_LATENCY_TRACKING is present in the environment (any
+// value); evaluated once, when this variable is initialized.
+static inline bool trackLatency = []() {
+  return std::getenv("UR_ENABLE_LATENCY_TRACKING") != nullptr;
+}();
+
+// Keeps a running arithmetic mean of the values passed to trackValue() and,
+// when tracking is enabled, logs the mean and sample count on destruction.
+// trackValue() updates avg and cnt without synchronization.
+class rolling_stats {
+public:
+  explicit rolling_stats(const char *name) : name(name) {}
+
+  ~rolling_stats() {
+    if (trackLatency) {
+      logger::info("[{}] average latency: {}ns", name, estimate());
+      logger::info("[{}] number of samples: {}", name, count());
+    }
+  }
+
+  // track latency by taking the value of duration directly.
+  void trackValue(double value) {
+    auto ratio = static_cast<double>(cnt) / (cnt + 1);
+    avg *= ratio;
+    ++cnt;
+    avg += value / cnt;
+  }
+
+  // Return the rolling average.
+  uint64_t estimate() const { return static_cast<uint64_t>(avg); }
+
+  // Number of samples tracked.
+  uint64_t count() const { return cnt; }
+
+private:
+  const char *name;
+  double avg{0};
+  uint64_t cnt{0};
+};
+
+// Scoped timer: when tracking is enabled, reports the steady_clock time
+// elapsed between construction and destruction, in nanoseconds, to the given
+// rolling_stats. Move-only.
+class rolling_latency_tracker {
+public:
+  explicit rolling_latency_tracker(rolling_stats &stats)
+      : stats_(trackLatency ? &stats : nullptr), begin_() {
+    if (trackLatency) {
+      begin_ = std::chrono::steady_clock::now();
+    }
+  }
+  rolling_latency_tracker() = default;
+  ~rolling_latency_tracker() { finish(); }
+
+  rolling_latency_tracker(const rolling_latency_tracker &) = delete;
+  rolling_latency_tracker &operator=(const rolling_latency_tracker &) = delete;
+
+  rolling_latency_tracker(rolling_latency_tracker &&rhs) noexcept
+      : stats_(std::exchange(rhs.stats_, nullptr)), begin_(rhs.begin_) {}
+
+  // Reports the measurement currently in flight (if any), then takes over
+  // rhs's measurement and leaves rhs inactive.
+  rolling_latency_tracker &operator=(rolling_latency_tracker &&rhs) noexcept {
+    if (this != &rhs) {
+      finish();
+      stats_ = std::exchange(rhs.stats_, nullptr);
+      begin_ = rhs.begin_;
+    }
+    return *this;
+  }
+
+private:
+  // Records the time elapsed since begin_ if this tracker is active.
+  void finish() noexcept {
+    if (stats_) {
+      auto tp = std::chrono::steady_clock::now();
+      auto diffNanos =
+          std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
+              .count();
+      stats_->trackValue(static_cast<double>(diffNanos));
+    }
+  }
+
+  rolling_stats *stats_{nullptr};
+  std::chrono::time_point<std::chrono::steady_clock> begin_;
+};
+
+} // namespace v2