Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
compiler: [{c: gcc, cxx: g++}]
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
include:
- os: 'ubuntu-22.04'
build_type: Release
Expand All @@ -36,6 +37,10 @@ jobs:
- os: 'ubuntu-20.04'
build_type: Release
compiler: {c: gcc-7, cxx: g++-7}
- os: 'ubuntu-22.04'
build_type: Release
compiler: {c: clang, cxx: clang++}
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}

steps:
Expand Down Expand Up @@ -92,6 +97,7 @@ jobs:
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
${{matrix.libbacktrace}}
${{matrix.pool_tracking}}
${{matrix.latency_tracking}}

- name: Configure CMake
if: matrix.os == 'ubuntu-20.04'
Expand All @@ -106,6 +112,7 @@ jobs:
-DUR_FORMAT_CPP_STYLE=ON
${{matrix.libbacktrace}}
${{matrix.pool_tracking}}
${{matrix.latency_tracking}}

- name: Generate source from spec, check for uncommitted diff
if: matrix.os == 'ubuntu-22.04'
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
option(UR_STATIC_LOADER "Build loader as a static library" OFF)
option(UR_FORCE_LIBSTDCXX "Force use of libstdc++ in a build using libc++ on Linux" OFF)
option(UR_ENABLE_LATENCY_HISTOGRAM "Enable latncy histogram" OFF)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/v2/command_list_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ command_list_cache_t::getImmediateCommandList(
ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal,
ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority,
std::optional<uint32_t> Index) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList");

immediate_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.Ordinal = Ordinal;
Expand All @@ -100,12 +102,15 @@ command_list_cache_t::getImmediateCommandList(
raii::cache_borrowed_command_list_t
command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice,
bool IsInOrder, uint32_t Ordinal) {
TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList");

regular_command_list_descriptor_t Desc;
Desc.ZeDevice = ZeDevice;
Desc.IsInOrder = IsInOrder;
Desc.Ordinal = Ordinal;

auto CommandList = getCommandList(Desc).release();

return raii::cache_borrowed_command_list_t(
CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) {
Cache->addCommandList(
Expand Down
1 change: 1 addition & 0 deletions source/adapters/level_zero/v2/command_list_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <functional>
#include <stack>

#include "latency_tracker.hpp"
#include <ur/ur.hpp>
#include <ur_api.h>
#include <ze_api.h>
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/level_zero/v2/event_provider_normal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "../common.hpp"
#include "../context.hpp"
#include "event_provider.hpp"
#include "latency_tracker.hpp"
#include "ur_api.h"
#include "ze_api.h"
#include <memory>
Expand Down Expand Up @@ -85,7 +86,10 @@ std::unique_ptr<provider_pool> provider_normal::createProviderPool() {
}

event_allocation provider_normal::allocate() {
TRACK_SCOPE_LATENCY("provider_normal::allocate");

if (pools.empty()) {
TRACK_SCOPE_LATENCY("provider_normal::allocate#createProviderPool");
pools.emplace_back(createProviderPool());
}

Expand All @@ -97,6 +101,7 @@ event_allocation provider_normal::allocate() {
}
}

TRACK_SCOPE_LATENCY("provider_normal::allocate#slowPath");
std::sort(pools.begin(), pools.end(), [](auto &a, auto &b) {
return a->nfree() < b->nfree(); // asceding
});
Expand Down
19 changes: 19 additions & 0 deletions source/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
add_ur_library(ur_common STATIC
ur_util.cpp
ur_util.hpp
latency_tracker.hpp
$<$<PLATFORM_ID:Windows>:windows/ur_lib_loader.cpp>
$<$<PLATFORM_ID:Linux,Darwin>:linux/ur_lib_loader.cpp>
)
Expand Down Expand Up @@ -45,6 +46,24 @@ set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
FetchContent_MakeAvailable(unified-memory-framework)
FetchContent_GetProperties(unified-memory-framework)

if(UR_ENABLE_LATENCY_HISTOGRAM)
set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "")
set(HDR_HISTOGRAM_BUILD_SHARED CACHE INTERNAL OFF "")

include(FetchContent)
FetchContent_Declare(hdr_histogram
GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git
GIT_TAG 0.11.8
)

FetchContent_MakeAvailable(hdr_histogram)
FetchContent_GetProperties(hdr_histogram)

target_link_libraries(ur_common PUBLIC hdr_histogram_static)
target_include_directories(ur_common PUBLIC $<BUILD_INTERFACE:${hdr_histogram_SOURCE_DIR}/include>)
target_compile_options(ur_common PUBLIC -DUR_ENABLE_LATENCY_HISTOGRAM=1)
endif()

target_link_libraries(ur_common PUBLIC
${CMAKE_DL_LIBS}
${PROJECT_NAME}::headers
Expand Down
226 changes: 226 additions & 0 deletions source/common/latency_tracker.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
//===--------- latency_tracker.cpp - common ------------------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <chrono>
#include <cstdint>
#include <limits>
#include <memory>

#include "logger/ur_logger.hpp"

#if defined(UR_ENABLE_LATENCY_HISTOGRAM)

#include <hdr/hdr_histogram.h>

static inline bool trackLatency = []() {
try {
auto map = getenv_to_map("UR_LOG_LATENCY");

if (!map.has_value()) {
return false;
}

auto it = map->find("level");
return it != map->end() &&
logger::str_to_level(it->second.front()) != logger::Level::QUIET;
} catch (...) {
return false;
}
}();

static constexpr size_t numPercentiles = 7;
static constexpr double percentiles[numPercentiles] = {
50.0, 90.0, 99.0, 99.9, 99.99, 99.999, 99.9999};

struct latencyValues {
int64_t min;
int64_t max;
int64_t mean;
int64_t stddev;
int64_t percentileValues[numPercentiles];
};

using histogram_ptr =
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>;

static inline latencyValues getValues(const struct hdr_histogram *histogram) {
latencyValues values;
values.max = hdr_max(histogram);
values.min = hdr_min(histogram);
values.mean = static_cast<int64_t>(hdr_mean(histogram));
values.stddev = static_cast<int64_t>(hdr_stddev(histogram));

auto ret = hdr_value_at_percentiles(
histogram, percentiles, values.percentileValues, numPercentiles);
if (ret != 0) {
logger::error("Failed to get percentiles from latency histogram");
}

return values;
}

class latency_printer {
public:
latency_printer() : logger(logger::create_logger("latency", true, false)) {}

inline void publishLatency(const std::string &name,
histogram_ptr histogram) {
auto [it, inserted] = values.try_emplace(name, std::move(histogram));
if (!inserted) {
// combine histograms
hdr_add(it->second.get(), histogram.get());
}
}

inline ~latency_printer() {
if (trackLatency) {
print();
}
}

inline void print() {
printHeader();

for (auto &[name, histogram] : values) {
auto value = getValues(histogram.get());
logger.log(logger::Level::INFO,
"{},{},{},{},{},{},{},{},{},{},{},{},ns", name,
value.min, value.max, value.mean, value.stddev,
value.percentileValues[0], value.percentileValues[1],
value.percentileValues[2], value.percentileValues[3],
value.percentileValues[4], value.percentileValues[5],
value.percentileValues[6]);
}
}

private:
inline void printHeader() {
logger.log(logger::Level::INFO, "Latency histogram:");
logger.log(logger::Level::INFO,
"name,min,max,mean,stdev,p{},p{},p{},p{},p{},p{},p{},unit",
percentiles[0], percentiles[1], percentiles[2],
percentiles[3], percentiles[4], percentiles[5],
percentiles[6]);
}

std::map<std::string, histogram_ptr> values;
logger::Logger logger;
};

inline latency_printer &globalLatencyPrinter() {
static latency_printer printer;
return printer;
}

class latency_histogram {
public:
inline latency_histogram(const char *name,
latency_printer &printer = globalLatencyPrinter(),
int64_t lowestDiscernibleValue = 1,
int64_t highestTrackableValue = 100'000'000'000,
int significantFigures = 3)
: name(name), histogram(nullptr, nullptr), printer(printer) {
if (trackLatency) {
struct hdr_histogram *cHistogram;
auto ret = hdr_init(lowestDiscernibleValue, highestTrackableValue,
significantFigures, &cHistogram);
if (ret != 0) {
logger::error("Failed to initialize latency histogram");
}
histogram =
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>(
cHistogram, &hdr_close);
}
}

latency_histogram(const latency_histogram &) = delete;
latency_histogram(latency_histogram &&) = delete;

inline ~latency_histogram() {
if (!trackLatency || !histogram) {
return;
}

if (hdr_min(histogram.get()) == std::numeric_limits<int64_t>::max()) {
logger::info("[{}] latency: no data", name);
return;
}

printer.publishLatency(name, std::move(histogram));
}

inline void trackValue(int64_t value) {
hdr_record_value(histogram.get(), value);
}

private:
const char *name;
histogram_ptr histogram;
latency_printer &printer;
};

class latency_tracker {
public:
inline explicit latency_tracker(latency_histogram &stats)
: stats(trackLatency ? &stats : nullptr), begin() {
if (trackLatency) {
begin = std::chrono::steady_clock::now();
}
}
inline latency_tracker() {}
inline ~latency_tracker() {
if (stats) {
auto tp = std::chrono::steady_clock::now();
auto diffNanos =
std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin)
.count();
stats->trackValue(static_cast<int64_t>(diffNanos));
}
}

latency_tracker(const latency_tracker &) = delete;
latency_tracker &operator=(const latency_tracker &) = delete;

inline latency_tracker(latency_tracker &&rhs) noexcept
: stats(rhs.stats), begin(rhs.begin) {
rhs.stats = nullptr;
}

inline latency_tracker &operator=(latency_tracker &&rhs) noexcept {
if (this != &rhs) {
this->~latency_tracker();
new (this) latency_tracker(std::move(rhs));
}
return *this;
}

private:
latency_histogram *stats{nullptr};
std::chrono::time_point<std::chrono::steady_clock> begin;
};

// To resolve __COUNTER__
#define CONCAT(a, b) a##b

// Each tracker has it's own thread-local histogram.
// At program exit, all histograms for the same scope are
// aggregated.
#define TRACK_SCOPE_LATENCY_CNT(name, cnt) \
static thread_local latency_histogram CONCAT(histogram, cnt)(name); \
latency_tracker CONCAT(tracker, cnt)(CONCAT(histogram, cnt));
#define TRACK_SCOPE_LATENCY(name) TRACK_SCOPE_LATENCY_CNT(name, __COUNTER__)

#else // UR_ENABLE_LATENCY_HISTOGRAM

#define TRACK_SCOPE_LATENCY(name)

#endif // UR_ENABLE_LATENCY_HISTOGRAM