From 467ed1ee56718ab5021de43e8231b1b81257dd10 Mon Sep 17 00:00:00 2001 From: Vasanth Tovinkere Date: Tue, 2 Sep 2025 15:29:26 -0700 Subject: [PATCH 01/13] [SYCL][XPTI] Performance improvements by limiting metadata + Improves the performance of collectors and downstream collectors by limiting the amount of metada that is attached to SYCL events + To get additional metadata, toolchains will have to also subscribe to the stream "sycl.debug" and the additional metadata will be sent in the orginal stream + Replaced all xptiMakeEvent() calls with the new xptiCreateTracepoint() function Signed-off-by: Vasanth Tovinkere --- sycl/source/detail/global_handler.cpp | 2 +- sycl/source/detail/queue_impl.cpp | 132 +++++++++-------- sycl/source/detail/scheduler/commands.cpp | 134 +++++++++--------- sycl/source/detail/xpti_registry.cpp | 1 + sycl/source/detail/xpti_registry.hpp | 10 ++ sycl/test-e2e/XPTI/Inputs/test_collector.cpp | 2 +- .../XPTI/basic_event_collection_linux.cpp | 1 + .../source/adapters/cuda/tracing.cpp | 37 ++--- .../layers/tracing/ur_tracing_layer.cpp | 36 +++-- xpti/include/xpti/xpti_trace_framework.h | 4 +- 10 files changed, 194 insertions(+), 165 deletions(-) diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 94a7e9f7b70dc..6f9dac6bd42da 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -95,7 +95,7 @@ void GlobalHandler::TraceEventXPTI(const char *Message) { xpti::framework::tracepoint_scope_t TP( CodeLocation.fileName(), CodeLocation.functionName(), CodeLocation.lineNumber(), CodeLocation.columnNumber(), nullptr); - + // Notify the subscriber with a diagnostic message when an exception occurs TP.stream(detail::GSYCLStreamID) .traceType(xpti::trace_point_type_t::diagnostics) .parentEvent(GSYCLCallEvent) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index a107ee491dfe3..8757be73972dd 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -171,6 +171,8 @@ event queue_impl::memset(void *Ptr, int Value, size_t Count, .traceType(xpti::trace_point_type_t::node_create) .parentEvent(detail::GSYCLGraphEvent); + // This information is necessary for memset, so we will not guard it by debug + // stream check TP.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", reinterpret_cast(MDevice.getHandleRef())); @@ -223,6 +225,7 @@ event queue_impl::memcpy(void *Dest, const void *Src, size_t Count, .traceType(xpti::trace_point_type_t::node_create) .parentEvent(GSYCLGraphEvent); const char *UserData = "memory_transfer_node::memcpy"; + // We will include this metadata information as it is required for memcpy TP.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", reinterpret_cast(MDevice.getHandleRef())); @@ -524,33 +527,32 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType)) return TraceEvent; - xpti::payload_t Payload; - bool HasSourceInfo = false; + xpti_tracepoint_t *Event; // We try to create a unique string for the wait() call by combining it with // the queue address xpti::utils::StringHelper NG; Name = NG.nameWithAddress("queue.wait", this); - if (CodeLoc.fileName()) { - // We have source code location information - Payload = - xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(), - CodeLoc.columnNumber(), (void *)this); - HasSourceInfo = true; - } else { - // We have no location information, so we'll use the address of the queue - Payload = xpti::payload_t(Name.c_str(), (void *)this); - } + bool HasSourceInfo = CodeLoc.fileName() != nullptr; // wait() calls could be at different user-code locations; We create a new // event based on the code location info and if this has been seen before, a // previously created event will be returned. - uint64_t QWaitInstanceNo = 0; - xpti::trace_event_data_t *WaitEvent = - xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event, - xpti_at::active, &QWaitInstanceNo); - IId = QWaitInstanceNo; - if (WaitEvent) { - xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); + if (HasSourceInfo) { + Event = xptiCreateTracepoint(CodeLoc.functionName(), CodeLoc.fileName(), + CodeLoc.lineNumber(), CodeLoc.columnNumber(), + (void *)this); + } else { + Event = xptiCreateTracepoint(Name.c_str(), nullptr, 0, 0, (void *)this); + } + + IId = xptiGetUniqueId(); + auto WaitEvent = Event->event_ref(); + // We will allow the device type to be set + xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); + // We limit the amount of metadata that is added to streams if the + // "sycl.debug" stream is not subscribed to. This improves the performance + // when this data is not required by the tool or the collector. + if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { if (HasSourceInfo) { xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); @@ -560,11 +562,11 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, WaitEvent, "sym_column_no", static_cast((CodeLoc.columnNumber()))); } - xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent, - QWaitInstanceNo, - static_cast(Name.c_str())); - TraceEvent = (void *)WaitEvent; } + xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent, + IId, static_cast(Name.c_str())); + TraceEvent = (void *)WaitEvent; + return TraceEvent; } @@ -587,13 +589,11 @@ void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name, void queue_impl::wait(const detail::code_location &CodeLoc) { (void)CodeLoc; #ifdef XPTI_ENABLE_INSTRUMENTATION - const bool xptiEnabled = xptiCheckTraceEnabled(GSYCLStreamID); + // xptiCheckTraceEnabled() is being performed in instrumentationProlog() void *TelemetryEvent = nullptr; uint64_t IId; std::string Name; - if (xptiEnabled) { - TelemetryEvent = instrumentationProlog(CodeLoc, Name, GSYCLStreamID, IId); - } + TelemetryEvent = instrumentationProlog(CodeLoc, Name, GSYCLStreamID, IId); #endif if (!MGraph.expired()) { @@ -673,51 +673,49 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { } #ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiEnabled) { - instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId); - } + // There is an early return in instrumentationEpilog() if no subscribers are + // subscribing to queue.wait() + instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId); #endif } void queue_impl::constructorNotification() { #if XPTI_ENABLE_INSTRUMENTATION - if (xptiTraceEnabled()) { - constexpr uint16_t NotificationTraceType = - static_cast(xpti::trace_point_type_t::queue_create); - if (xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) { - xpti::utils::StringHelper SH; - std::string AddrStr = SH.addressAsString(MQueueID); - std::string QueueName = SH.nameWithAddressString("queue", AddrStr); - // Create a payload for the queue create event as we do not get code - // location for the queue create event - xpti::payload_t QPayload(QueueName.c_str()); - MInstanceID = xptiGetUniqueId(); - uint64_t RetInstanceNo; - xpti_td *TEvent = - xptiMakeEvent("queue_create", &QPayload, - (uint16_t)xpti::trace_event_type_t::algorithm, - xpti_at::active, &RetInstanceNo); - // Cache the trace event, stream id and instance IDs for the destructor - MTraceEvent = (void *)TEvent; - - xpti::addMetadata(TEvent, "sycl_context", - reinterpret_cast(MContext->getHandleRef())); - xpti::addMetadata(TEvent, "sycl_device_name", - MDevice.get_info()); - xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast(MDevice.getHandleRef())); - xpti::addMetadata(TEvent, "is_inorder", MIsInorder); - xpti::addMetadata(TEvent, "queue_id", MQueueID); - xpti::addMetadata(TEvent, "queue_handle", - reinterpret_cast(getHandleRef())); - // Also publish to TLS before notification - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); - xptiNotifySubscribers(detail::GSYCLStreamID, - (uint16_t)xpti::trace_point_type_t::queue_create, - nullptr, TEvent, MInstanceID, - static_cast("queue_create")); - } - } + // If there are no subscribers to queue_create, return immediately + constexpr uint16_t NotificationTraceType = + static_cast(xpti::trace_point_type_t::queue_create); + if (!xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) + return; + // We do not have CodeLoc for the queue constructor, so we will have to create + // a queue name with the queue ID to create an event; this step can be avoided + // by using CodeLoc + xpti::utils::StringHelper SH; + std::string AddrStr = SH.addressAsString(MQueueID); + std::string QueueName = SH.nameWithAddressString("queue", AddrStr); + + auto Event = + xptiCreateTracepoint(QueueName.c_str(), nullptr, 0, 0, (void *)this); + MInstanceID = xptiGetUniqueId(); + xpti_td *TEvent = Event->event_ref(); + // Cache the trace event, stream id and instance IDs for the destructor + MTraceEvent = (void *)TEvent; + // We will allow the queue metadata to be set as this is performed + // infrequently + xpti::addMetadata(TEvent, "sycl_context", + reinterpret_cast(MContext->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device_name", + MDevice.get_info()); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice.getHandleRef())); + xpti::addMetadata(TEvent, "is_inorder", MIsInorder); + xpti::addMetadata(TEvent, "queue_id", MQueueID); + xpti::addMetadata(TEvent, "queue_handle", + reinterpret_cast(getHandleRef())); + // Also publish to TLS before notification + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); + xptiNotifySubscribers( + detail::GSYCLStreamID, (uint16_t)xpti::trace_point_type_t::queue_create, + nullptr, TEvent, MInstanceID, static_cast("queue_create")); #endif } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3d1f1b6dbce20..d5b9f43383193 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -614,16 +614,16 @@ void Command::emitEdgeEventForCommandDependence( std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr); // Create an edge with the dependent buffer address for which a command // object has been created as one of the properties of the edge - xpti::payload_t Payload(TypeString.c_str(), MAddress); - uint64_t EdgeInstanceNo; - xpti_td *EdgeEvent = - xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event, - xpti_at::active, &EdgeInstanceNo); - if (EdgeEvent) { + uint64_t EdgeInstanceNo = xptiGetUniqueId(); + auto Event = + xptiCreateTracepoint(TypeString.c_str(), nullptr, 0, 0, MAddress); + if (Event) { + xpti_td *EdgeEvent = Event->event_ref(); xpti_td *SrcEvent = static_cast(Cmd->MTraceEvent); xpti_td *TgtEvent = static_cast(MTraceEvent); EdgeEvent->source_id = SrcEvent->unique_id; EdgeEvent->target_id = TgtEvent->unique_id; + // We allow this metadata to be set as it describes the edge if (IsCommand) { xpti::addMetadata(EdgeEvent, "access_mode", static_cast(AccMode.value())); @@ -670,29 +670,33 @@ void Command::emitEdgeEventForEventDependence(Command *Cmd, std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr); // Node name is "virtual_node[]" - xpti::payload_t VNPayload(NodeName.c_str(), MAddress); - uint64_t VNodeInstanceNo; - xpti_td *NodeEvent = - xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event, - xpti_at::active, &VNodeInstanceNo); - // Emit the virtual node first - xpti::addMetadata(NodeEvent, "kernel_name", NodeName); - xptiNotifySubscribers(MStreamID, xpti::trace_node_create, - detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo, - nullptr); + auto NEvent = + xptiCreateTracepoint(NodeName.c_str(), nullptr, 0, 0, MAddress); + uint64_t VNodeInstanceNo = xptiGetUniqueId(); + xpti_td *NodeEvent = NEvent ? NEvent->event_ref() : nullptr; + if (NodeEvent) { + // We allow this metadata to be set as the node is a virtual node without + // an actual name + xpti::addMetadata(NodeEvent, "kernel_name", NodeName); + + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, + detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo, + nullptr); + } // Create a new event for the edge std::string EdgeName = SH.nameWithAddressString("Event", AddressStr); - xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress); - uint64_t EdgeInstanceNo; - xpti_td *EdgeEvent = - xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event, - xpti_at::active, &EdgeInstanceNo); + auto EEvent = + xptiCreateTracepoint(EdgeName.c_str(), nullptr, 0, 0, MAddress); + uint64_t EdgeInstanceNo = xptiGetUniqueId(); + xpti_td *EdgeEvent = EEvent ? EEvent->event_ref() : nullptr; if (EdgeEvent && NodeEvent) { // Source node represents the event and this event needs to be completed // before target node can execute xpti_td *TgtEvent = static_cast(MTraceEvent); EdgeEvent->source_id = NodeEvent->unique_id; EdgeEvent->target_id = TgtEvent->unique_id; + // We allow this metadata to be set as an edge without the event address + // will be less useful xpti::addMetadata(EdgeEvent, "event", reinterpret_cast(UrEventAddr)); xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, @@ -719,11 +723,10 @@ uint64_t Command::makeTraceEventProlog(void *MAddress) { std::string CommandString = SH.nameWithAddressString(MCommandName, MAddressString); - xpti::payload_t p(CommandString.c_str(), MAddress); - xpti_td *CmdTraceEvent = - xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event, - xpti_at::active, &CommandInstanceNo); - MInstanceID = CommandInstanceNo; + auto Event = + xptiCreateTracepoint(CommandString.c_str(), nullptr, 0, 0, MAddress); + xpti_td *CmdTraceEvent = Event ? Event->event_ref() : nullptr; + MInstanceID = xptiGetUniqueId(); if (CmdTraceEvent) { MTraceEvent = (void *)CmdTraceEvent; // If we are seeing this event again, then the instance ID will be greater @@ -734,7 +737,7 @@ uint64_t Command::makeTraceEventProlog(void *MAddress) { // maintaining data integrity. } #endif - return CommandInstanceNo; + return MInstanceID; } void Command::makeTraceEventEpilog() { @@ -973,14 +976,15 @@ void Command::resolveReleaseDependencies(std::set &DepList) { // Create an edge with the dependent buffer address being one of the // properties of the edge xpti::payload_t p(TypeString.c_str(), MAddress); - uint64_t EdgeInstanceNo; - xpti_td *EdgeEvent = - xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event, - xpti_at::active, &EdgeInstanceNo); + uint64_t EdgeInstanceNo = xptiGetUniqueId(); + auto Event = + xptiCreateTracepoint(TypeString.c_str(), nullptr, 0, 0, MAddress); + xpti_td *EdgeEvent = Event ? Event->event_ref() : nullptr; if (EdgeEvent) { xpti_td *SrcTraceEvent = static_cast(Item->MTraceEvent); EdgeEvent->target_id = TgtTraceEvent->unique_id; EdgeEvent->source_id = SrcTraceEvent->unique_id; + // We will ensure this is always added xpti::addMetadata(EdgeEvent, "memory_object", reinterpret_cast(MAddress)); xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, @@ -1049,6 +1053,7 @@ void AllocaCommandBase::emitInstrumentationData() { if (MTraceEvent) { xpti_td *TE = static_cast(MTraceEvent); addDeviceMetadata(TE, MQueue); + // Memory-object is used frequently, so it is always added xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant @@ -2043,32 +2048,20 @@ void instrumentationFillCommonData(const std::string &KernelName, xpti_td *&OutTraceEvent) { // Get source file, line number information from the CommandGroup object // and create payload using name, address, and source info - // - // On Windows, since the support for builtin functions is not available in - // MSVC, the MFileName, MLine will be set to nullptr and "0" respectively. - // Handle this condition explicitly here. - bool HasSourceInfo = false; - xpti::payload_t Payload; - if (!FileName.empty()) { - // File name has a valid string - Payload = - xpti::payload_t(FuncName.empty() ? KernelName.data() : FuncName.data(), - FileName.data(), Line, Column, Address); - HasSourceInfo = true; - } else if (Address) { - // We have a valid function name and an address - Payload = xpti::payload_t(KernelName.data(), Address); + bool HasSourceInfo = !FileName.empty(); + xpti_tracepoint_t *Event; + void *AddressToUse = const_cast(Address); + if (HasSourceInfo) { + Event = xptiCreateTracepoint(FuncName.c_str(), FileName.c_str(), Line, + Column, AddressToUse); } else { - // In any case, we will have a valid function name and we'll use that to - // create the hash - Payload = xpti::payload_t(KernelName.data()); + Event = + xptiCreateTracepoint(KernelName.data(), nullptr, 0, 0, AddressToUse); } - uint64_t CGKernelInstanceNo; + uint64_t CGKernelInstanceNo = xptiGetUniqueId(); // Create event using the payload - xpti_td *CmdTraceEvent = - xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event, - xpti::trace_activity_type_t::active, &CGKernelInstanceNo); + xpti_td *CmdTraceEvent = Event ? Event->event_ref() : nullptr; if (CmdTraceEvent) { OutInstanceID = CGKernelInstanceNo; OutTraceEvent = CmdTraceEvent; @@ -2077,15 +2070,19 @@ void instrumentationFillCommonData(const std::string &KernelName, if (!KernelName.empty()) { xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName); } - if (FromSource.has_value()) { - xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value()); - } - if (HasSourceInfo) { - xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName); - xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName); - xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast(Line)); - xpti::addMetadata(CmdTraceEvent, "sym_column_no", - static_cast(Column)); + // We limit the metadata to only include the kernel name and device + // information by default + if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { + if (FromSource.has_value()) { + xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value()); + } + if (HasSourceInfo) { + xpti::addMetadata(CmdTraceEvent, "sym_function_name", KernelName); + xpti::addMetadata(CmdTraceEvent, "sym_source_file_name", FileName); + xpti::addMetadata(CmdTraceEvent, "sym_line_no", static_cast(Line)); + xpti::addMetadata(CmdTraceEvent, "sym_column_no", + static_cast(Column)); + } } // We no longer set the 'queue_id' in the metadata structure as it is a // mutable value and multiple threads using the same queue created at the @@ -2137,11 +2134,14 @@ std::pair emitKernelInstrumentationData( if (Queue) xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue)); - instrumentationAddExtraKernelMetadata( - CmdTraceEvent, NDRDesc, KernelBundleImplPtr, - std::string(SyclKernelName), DeviceKernelInfo, SyclKernel, Queue, - CGArgs); - + // Add the additional metadata only if the debug information is subscribed + // to; in this case, it is the kernel and its parameters + if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { + instrumentationAddExtraKernelMetadata( + CmdTraceEvent, NDRDesc, KernelBundleImplPtr, + std::string(SyclKernelName), DeviceKernelInfo, SyclKernel, Queue, + CGArgs); + } xptiNotifySubscribers( StreamID, NotificationTraceType, detail::GSYCLGraphEvent, CmdTraceEvent, InstanceID, diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index 5f35997639da8..8abd2ea6e4b20 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -25,6 +25,7 @@ uint8_t GMemAllocStreamID = xpti::invalid_id; uint8_t GCudaCallStreamID = xpti::invalid_id; uint8_t GCudaDebugStreamID = xpti::invalid_id; uint8_t GSYCLStreamID = xpti::invalid_id; +uint8_t GSYCLDebugStreamID = xpti::invalid_id; uint8_t GUrCallStreamID = xpti::invalid_id; uint8_t GUrApiStreamID = xpti::invalid_id; diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 16749ee259b0c..1a1184d73d222 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -40,6 +40,11 @@ constexpr const char *GVerStr = SYCL_VERSION_STR; /// We define all the streams used the instrumentation framework here inline constexpr const char *SYCL_STREAM_NAME = "sycl"; +// We will use "*.debug" stream names as indicators of needing debugging +// information; in this case, the tool will have to subscribe to the *.debug +// stream to get additional debug metadata, but the metadata will still be sent +// through the regular stream. +inline constexpr const char *SYCL_DEBUG_STREAM_NAME = "sycl.debug"; inline constexpr auto SYCL_MEM_ALLOC_STREAM_NAME = "sycl.experimental.mem_alloc"; // Stream name being used to notify about buffer objects. @@ -53,6 +58,7 @@ extern uint8_t GBufferStreamID; extern uint8_t GImageStreamID; extern uint8_t GMemAllocStreamID; extern uint8_t GSYCLStreamID; +extern uint8_t GSYCLDebugStreamID; extern uint8_t GUrApiStreamID; extern xpti::trace_event_data_t *GMemAllocEvent; @@ -79,6 +85,10 @@ class XPTIRegistry { // SYCL events detail::GSYCLStreamID = this->initializeStream(SYCL_STREAM_NAME, GMajVer, GMinVer, GVerStr); + // Register the SYCL Debug event stream; tools subscribing to this stream + // will receive additional metadata in the regular "sycl" stream. + detail::GSYCLDebugStreamID = this->initializeStream( + SYCL_DEBUG_STREAM_NAME, GMajVer, GMinVer, GVerStr); // SYCL buffer events detail::GBufferStreamID = this->initializeStream( SYCL_BUFFER_STREAM_NAME, GMajVer, GMinVer, GVerStr); diff --git a/sycl/test-e2e/XPTI/Inputs/test_collector.cpp b/sycl/test-e2e/XPTI/Inputs/test_collector.cpp index 86e126714fc07..c147521a2efc2 100644 --- a/sycl/test-e2e/XPTI/Inputs/test_collector.cpp +++ b/sycl/test-e2e/XPTI/Inputs/test_collector.cpp @@ -26,7 +26,7 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int MajorVersion, for (type t : std::initializer_list{type::function_with_args_begin}) xptiRegisterCallback(StreamID, static_cast(t), syclUrCallback); } - if (NameView == "sycl") { + if (NameView == "sycl" || NameView == "sycl.debug") { uint8_t StreamID = xptiRegisterStream(StreamName); for (type t : std::initializer_list{ diff --git a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp index 2fc4f365a813e..8faa2f2678356 100644 --- a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp +++ b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp @@ -6,6 +6,7 @@ #include "basic_event_collection.inc" // // CHECK-DAG: xptiTraceInit: Stream Name = sycl +// CHECK-DAG: xptiTraceInit: Stream Name = sycl.debug // CHECK-DAG: xptiTraceInit: Stream Name = sycl.experimental.buffer // CHECK-DAG: xptiTraceInit: Stream Name = sycl.experimental.image // CHECK-DAG: xptiTraceInit: Stream Name = sycl.experimental.mem_alloc diff --git a/unified-runtime/source/adapters/cuda/tracing.cpp b/unified-runtime/source/adapters/cuda/tracing.cpp index fb8e7c83d32f1..52e075e44ad2d 100644 --- a/unified-runtime/source/adapters/cuda/tracing.cpp +++ b/unified-runtime/source/adapters/cuda/tracing.cpp @@ -107,14 +107,20 @@ static void cuptiCallback(void *UserData, CUpti_CallbackDomain, uint8_t CallStreamID = xptiRegisterStream(CUDA_CALL_STREAM_NAME); uint8_t DebugStreamID = xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); - xptiNotifySubscribers(CallStreamID, TraceType, Ctx->CallEvent, nullptr, - CallCorrelationID, FuncName); - - xpti::function_with_args_t Payload{ - FuncID, FuncName, const_cast(CBInfo->functionParams), - CBInfo->functionReturnValue, CBInfo->context}; - xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, Ctx->DebugEvent, - nullptr, DebugCorrelationID, &Payload); + // Only notify if there are subscribers + if (xptiCheckTraceEnabled(CallStreamID, TraceType)) { + xptiNotifySubscribers(CallStreamID, TraceType, Ctx->CallEvent, nullptr, + CallCorrelationID, FuncName); + } + + // Prepare the payload and notify subscribers if there are subscribers + if (xptiCheckTraceEnabled(DebugStreamID, TraceTypeArgs)) { + xpti::function_with_args_t Payload{ + FuncID, FuncName, const_cast(CBInfo->functionParams), + CBInfo->functionReturnValue, CBInfo->context}; + xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, Ctx->DebugEvent, + nullptr, DebugCorrelationID, &Payload); + } } } #endif @@ -193,16 +199,13 @@ void enableCUDATracing(cuda_tracing_context_t_ *Ctx) { xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); xptiInitialize(CUDA_DEBUG_STREAM_NAME, GMajVer, GMinVer, GVerStr); - uint64_t Dummy; - xpti::payload_t CUDAPayload("CUDA Plugin Layer"); - Ctx->CallEvent = - xptiMakeEvent("CUDA Plugin Layer", &CUDAPayload, - xpti::trace_algorithm_event, xpti_at::active, &Dummy); - - xpti::payload_t CUDADebugPayload("CUDA Plugin Debug Layer"); + auto CudaCallEvent = + xptiCreateTracepoint("CUDA Plugin Layer", nullptr, 0, 0, nullptr); + auto CudaCallDebugEvent = + xptiCreateTracepoint("CUDA Plugin Debug Layer", nullptr, 0, 0, nullptr); + Ctx->CallEvent = CudaCallEvent ? CudaCallEvent->event_ref() : nullptr; Ctx->DebugEvent = - xptiMakeEvent("CUDA Plugin Debug Layer", &CUDADebugPayload, - xpti::trace_algorithm_event, xpti_at::active, &Dummy); + CudaCallDebugEvent ? CudaCallDebugEvent->event_ref() : nullptr; Ctx->Cupti.Subscribe(&Ctx->Subscriber, cuptiCallback, Ctx); Ctx->Cupti.EnableDomain(1, Ctx->Subscriber, CUPTI_CB_DOMAIN_DRIVER_API); diff --git a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp index b8ffa1edba5be..cb0f54b4a49ca 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -40,6 +40,10 @@ static std::shared_ptr xptiContextManagerGet() { static auto contextManager = std::make_shared(); return contextManager; } + +// The Unified Runtime API calls are meant to be performant and creating an +// event for each API Call will add significant overheads +static xpti_td *GURCallEvent = nullptr; static thread_local xpti_td *activeEvent; /////////////////////////////////////////////////////////////////////////////// @@ -51,11 +55,18 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { streamv << STREAM_VER_MAJOR << "." << STREAM_VER_MINOR; xptiInitialize(CALL_STREAM_NAME, STREAM_VER_MAJOR, STREAM_VER_MINOR, streamv.str().data()); + // Create global event for all UR API calls + auto Event = + xptiCreateTracepoint("Unified Runtime call", nullptr, 0, 0, (void *)this); + // For function_begin/function_end class of notification, the parent and the + // event object can be NULL based on the specification + GURCallEvent = Event ? Event->event_ref() : nullptr; } void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { xpti::function_with_args_t payload{id, name, args, resultp, nullptr}; + // Use global event for all UR API calls xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, instance, &payload); } @@ -67,16 +78,23 @@ uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { return UINT64_MAX; } - if (auto loc = codelocData.get_codeloc()) { - xpti::payload_t payload = - xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, - loc->columnNumber, nullptr); - uint64_t InstanceNumber{}; - activeEvent = - xptiMakeEvent("Unified Runtime call", &payload, xpti::trace_graph_event, - xpti_at::active, &InstanceNumber); - } + // Previous implementation created a new event for each UR API call. This + // adds significant overhead to the tracing toolchain. Replacing the + // previous code with a single global event for all UR API calls: + // + // PREVIOUS CODE: + // if (auto loc = codelocData.get_codeloc()) { + // xpti::payload_t payload = + // xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, + // loc->columnNumber, nullptr); + // uint64_t InstanceNumber{}; + // activeEvent = + // xptiMakeEvent("Unified Runtime call", &payload, + // xpti::trace_graph_event, + // xpti_at::active, &InstanceNumber); + // } + activeEvent = GURCallEvent; uint64_t instance = xptiGetUniqueId(); notify((uint16_t)xpti::trace_point_type_t::function_with_args_begin, id, name, args, nullptr, instance); diff --git a/xpti/include/xpti/xpti_trace_framework.h b/xpti/include/xpti/xpti_trace_framework.h index 2ac5494d79960..99ee1a5fb2f16 100644 --- a/xpti/include/xpti/xpti_trace_framework.h +++ b/xpti/include/xpti/xpti_trace_framework.h @@ -781,9 +781,7 @@ xptiUnregisterCallback(xpti::stream_id_t stream_id, uint16_t trace_type, /// @brief Notifies all registered subscribers that an event has occurred /// @details Subscribers receive notifications to the trace point types they -/// register a callback with. This function allows subscribers to unregister -/// any previously registered callback functions with this function so they can -/// stop receiving notifications. +/// register a callback with. /// /// @param stream_id The stream for which the registration must be disabled /// @param trace_type The trace point type for which the notification is being From 77657de43a824139f447c90c82fc6f865089be38 Mon Sep 17 00:00:00 2001 From: Vasanth Tovinkere Date: Tue, 2 Sep 2025 15:48:39 -0700 Subject: [PATCH 02/13] [SYCL][XPTI] Updated documentation and tests Signed-off-by: Vasanth Tovinkere --- sycl/doc/design/SYCLInstrumentationUsingXPTI.md | 11 +++++++++++ sycl/tools/sycl-prof/collector.cpp | 2 +- sycl/tools/sycl-trace/collector.cpp | 13 ++++++++----- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md index a019ba515fc4d..aac22261011f9 100644 --- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md +++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md @@ -221,9 +221,20 @@ trace point that includes an event, a trace point type and a notification. can attached a per-instance user data during this notification call that *must* be guaranteed to be valid for the duration of the notification call. +- To support performance and debug streams, subscribing to the stream **"sycl.debug"** + allows the default streams to contain additional metadata when keeping overheads + to a minimum is not important + This document will outline the protocol for the streams of data being generated by the SYCL runtime. +## SYCL Stream `"sycl.debug"` Notification Signatures + +The "sycl.debug" stream is a dummy stream, when subscribed to, indicates to the SYCL +runtime that additional metadata can be propagated for each SYCL event. Many toolchains +like to keep the overheads low when subscribing to the data and this provides a mechanism +to get more data when keeping overheads low is not important. + ## SYCL Stream `"ur.call"` Notification Signatures | Trace Point Type | Parameter Description | Metadata | diff --git a/sycl/tools/sycl-prof/collector.cpp b/sycl/tools/sycl-prof/collector.cpp index 7afca930b9194..475df602c00ca 100644 --- a/sycl/tools/sycl-prof/collector.cpp +++ b/sycl/tools/sycl-prof/collector.cpp @@ -83,7 +83,7 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, urBeginEndCallback); xptiRegisterCallback(StreamID, xpti::trace_function_with_args_end, urBeginEndCallback); - } else if (NameView == "sycl") { + } else if (NameView == "sycl" || NameView == "sycl.debug") { uint8_t StreamID = xptiRegisterStream(StreamName); xptiRegisterCallback(StreamID, xpti::trace_task_begin, taskBeginEndCallback); diff --git a/sycl/tools/sycl-trace/collector.cpp b/sycl/tools/sycl-trace/collector.cpp index 79df84994923a..4f3d0b6360e8e 100644 --- a/sycl/tools/sycl-trace/collector.cpp +++ b/sycl/tools/sycl-trace/collector.cpp @@ -215,7 +215,8 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, } #endif } - if (std::string_view(StreamName) == "sycl" && + if ((std::string_view(StreamName) == "sycl" || + std::string_view(StreamName) == "sycl.debug") && std::getenv("SYCL_TRACE_API_ENABLE")) { syclPrintersInit(); uint16_t StreamID = xptiRegisterStream(StreamName); @@ -254,10 +255,12 @@ XPTI_CALLBACK_API void xptiTraceFinish(const char *StreamName) { cudaCollectorLibrary.clear(); } #endif - if (std::string_view(StreamName) == "sycl" && - std::getenv("SYCL_TRACE_API_ENABLE")) + if ((std::string_view(StreamName) == "sycl" || + std::string_view(StreamName) == "sycl.debug") && + std::getenv("SYCL_TRACE_API_ENABLE")) { syclPrintersFinish(); - if (std::getenv("SYCL_TRACE_VERIFICATION_ENABLE")) { - vPrintersFinish(); + if (std::getenv("SYCL_TRACE_VERIFICATION_ENABLE")) { + vPrintersFinish(); + } } } From 15a3e7b9c8848c3111c36f72220e02a444b74626 Mon Sep 17 00:00:00 2001 From: Vasanth Tovinkere Date: Tue, 2 Sep 2025 21:29:39 -0700 Subject: [PATCH 03/13] [SYCL][XPTI] Ensure all tests subscribe to debug stream Signed-off-by: Vasanth Tovinkere --- .../XPTI/Inputs/memory_info_collector.cpp | 18 ++++++++---------- .../xptitest_subscriber/XPTISubscriber.cpp | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sycl/test-e2e/XPTI/Inputs/memory_info_collector.cpp b/sycl/test-e2e/XPTI/Inputs/memory_info_collector.cpp index a2dd139d112a6..bd7e0d457c160 100644 --- a/sycl/test-e2e/XPTI/Inputs/memory_info_collector.cpp +++ b/sycl/test-e2e/XPTI/Inputs/memory_info_collector.cpp @@ -56,7 +56,7 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int MajorVersion, xptiRegisterCallback(StreamID, static_cast(t), syclImageCallback); } - if (NameView == "sycl") { + if (NameView == "sycl" || NameView == "sycl.debug") { uint8_t StreamID = xptiRegisterStream(StreamName); for (type t : std::initializer_list{ type::graph_create, type::node_create, type::edge_create, @@ -105,10 +105,9 @@ XPTI_CALLBACK_API void syclBufferCallback(uint16_t TraceType, << BufConstr->user_object_handle << "|0x" << BufConstr->host_object_handle << "|" << std::dec << BufConstr->element_type << "|" << BufConstr->element_size - << "|" << BufConstr->dim << "|" - << "{" << BufConstr->range[0] << "," << BufConstr->range[1] << "," - << BufConstr->range[2] << "}|" - << Event->reserved.payload->source_file << ":" + << "|" << BufConstr->dim << "|" << "{" << BufConstr->range[0] + << "," << BufConstr->range[1] << "," << BufConstr->range[2] + << "}|" << Event->reserved.payload->source_file << ":" << Event->reserved.payload->line_no << ":" << Event->reserved.payload->column_no << "\n"; @@ -167,9 +166,9 @@ XPTI_CALLBACK_API void syclImageCallback(uint16_t TraceType, std::cout << "un"; std::cout << "sampled image|0x" << std::hex << ImgConstr->user_object_handle << "|0x" << ImgConstr->host_object_handle << "|" << std::dec - << ImgConstr->dim << "|" - << "{" << ImgConstr->range[0] << "," << ImgConstr->range[1] << "," - << ImgConstr->range[2] << "}|" << ImgConstr->format << "|"; + << ImgConstr->dim << "|" << "{" << ImgConstr->range[0] << "," + << ImgConstr->range[1] << "," << ImgConstr->range[2] << "}|" + << ImgConstr->format << "|"; if (IsSampledImage) std::cout << *ImgConstr->addressing << "|" << *ImgConstr->coordinate_normalization << "|" @@ -282,8 +281,7 @@ void parseMetadata(xpti::trace_event_data_t *Event) { Metadata, Name.c_str()); std::cout << " " << Name << " : {" << arg.type << ", " << std::hex << "0x" << (uintptr_t)arg.pointer << std::dec << ", " - << arg.size << ", " << arg.index << "} " - << "\n"; + << arg.size << ", " << arg.index << "} " << "\n"; } } else { std::cout << "\n"; diff --git a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp index 2c79f76269c11..a21ad3d228f95 100644 --- a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp +++ b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp @@ -153,6 +153,10 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, xptiRegisterCallback(StreamID, xpti::trace_queue_destroy, testCallback); xptiRegisterCallback(StreamID, xpti::trace_task_begin, testCallback); xptiRegisterCallback(StreamID, xpti::trace_task_end, testCallback); + // Register at least one callback to the debug stream to enable additional + // metadata that may be used for tests + uint8_t DebugStreamID = xptiRegisterStream("sycl.debug"); + xptiRegisterCallback(StreamID, xpti::trace_node_create, testCallback); } XPTI_CALLBACK_API void xptiTraceFinish(const char * /*StreamName*/) {} From 604459f5899397549b0467934d388b7158b67def Mon Sep 17 00:00:00 2001 From: Vasanth Tovinkere Date: Wed, 3 Sep 2025 10:58:55 -0700 Subject: [PATCH 04/13] [SYCL][XPTI] Addressed reviewer-1 comments Signed-off-by: Vasanth Tovinkere --- sycl/source/detail/global_handler.cpp | 2 +- sycl/source/detail/queue_impl.cpp | 20 ++++++++-------- sycl/source/detail/scheduler/commands.cpp | 24 +++++++++---------- sycl/source/detail/xpti_registry.hpp | 6 ++--- .../xptitest_subscriber/XPTISubscriber.cpp | 2 +- .../source/adapters/cuda/tracing.cpp | 4 ++-- .../layers/tracing/ur_tracing_layer.cpp | 8 +++---- 7 files changed, 33 insertions(+), 33 deletions(-) diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 6f9dac6bd42da..97f37a98c30d3 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -95,7 +95,7 @@ void GlobalHandler::TraceEventXPTI(const char *Message) { xpti::framework::tracepoint_scope_t TP( CodeLocation.fileName(), CodeLocation.functionName(), CodeLocation.lineNumber(), CodeLocation.columnNumber(), nullptr); - // Notify the subscriber with a diagnostic message when an exception occurs + // Notify the subscriber with a diagnostic message when an exception occurs. TP.stream(detail::GSYCLStreamID) .traceType(xpti::trace_point_type_t::diagnostics) .parentEvent(GSYCLCallEvent) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8757be73972dd..705cbafe0286e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -172,7 +172,7 @@ event queue_impl::memset(void *Ptr, int Value, size_t Count, .parentEvent(detail::GSYCLGraphEvent); // This information is necessary for memset, so we will not guard it by debug - // stream check + // stream check. TP.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", reinterpret_cast(MDevice.getHandleRef())); @@ -225,7 +225,7 @@ event queue_impl::memcpy(void *Dest, const void *Src, size_t Count, .traceType(xpti::trace_point_type_t::node_create) .parentEvent(GSYCLGraphEvent); const char *UserData = "memory_transfer_node::memcpy"; - // We will include this metadata information as it is required for memcpy + // We will include this metadata information as it is required for memcpy. TP.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", reinterpret_cast(MDevice.getHandleRef())); @@ -589,7 +589,7 @@ void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name, void queue_impl::wait(const detail::code_location &CodeLoc) { (void)CodeLoc; #ifdef XPTI_ENABLE_INSTRUMENTATION - // xptiCheckTraceEnabled() is being performed in instrumentationProlog() + // xptiCheckTraceEnabled() is being performed in instrumentationProlog(). void *TelemetryEvent = nullptr; uint64_t IId; std::string Name; @@ -674,33 +674,33 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { #ifdef XPTI_ENABLE_INSTRUMENTATION // There is an early return in instrumentationEpilog() if no subscribers are - // subscribing to queue.wait() + // subscribing to queue.wait(). instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId); #endif } void queue_impl::constructorNotification() { #if XPTI_ENABLE_INSTRUMENTATION - // If there are no subscribers to queue_create, return immediately + // If there are no subscribers to queue_create, return immediately. constexpr uint16_t NotificationTraceType = static_cast(xpti::trace_point_type_t::queue_create); if (!xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) return; // We do not have CodeLoc for the queue constructor, so we will have to create // a queue name with the queue ID to create an event; this step can be avoided - // by using CodeLoc + // by using CodeLoc. xpti::utils::StringHelper SH; std::string AddrStr = SH.addressAsString(MQueueID); std::string QueueName = SH.nameWithAddressString("queue", AddrStr); - auto Event = + xpti_tracepoint_t *Event = xptiCreateTracepoint(QueueName.c_str(), nullptr, 0, 0, (void *)this); MInstanceID = xptiGetUniqueId(); xpti_td *TEvent = Event->event_ref(); - // Cache the trace event, stream id and instance IDs for the destructor + // Cache the trace event, stream id and instance IDs for the destructor. MTraceEvent = (void *)TEvent; // We will allow the queue metadata to be set as this is performed - // infrequently + // infrequently. xpti::addMetadata(TEvent, "sycl_context", reinterpret_cast(MContext->getHandleRef())); xpti::addMetadata(TEvent, "sycl_device_name", @@ -711,7 +711,7 @@ void queue_impl::constructorNotification() { xpti::addMetadata(TEvent, "queue_id", MQueueID); xpti::addMetadata(TEvent, "queue_handle", reinterpret_cast(getHandleRef())); - // Also publish to TLS before notification + // Also publish to TLS before notification. xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); xptiNotifySubscribers( detail::GSYCLStreamID, (uint16_t)xpti::trace_point_type_t::queue_create, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d5b9f43383193..2fc7e817066b7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -615,7 +615,7 @@ void Command::emitEdgeEventForCommandDependence( // Create an edge with the dependent buffer address for which a command // object has been created as one of the properties of the edge uint64_t EdgeInstanceNo = xptiGetUniqueId(); - auto Event = + xpti_tracepoint_t *Event = xptiCreateTracepoint(TypeString.c_str(), nullptr, 0, 0, MAddress); if (Event) { xpti_td *EdgeEvent = Event->event_ref(); @@ -623,7 +623,7 @@ void Command::emitEdgeEventForCommandDependence( xpti_td *TgtEvent = static_cast(MTraceEvent); EdgeEvent->source_id = SrcEvent->unique_id; EdgeEvent->target_id = TgtEvent->unique_id; - // We allow this metadata to be set as it describes the edge + // We allow this metadata to be set as it describes the edge. if (IsCommand) { xpti::addMetadata(EdgeEvent, "access_mode", static_cast(AccMode.value())); @@ -670,13 +670,13 @@ void Command::emitEdgeEventForEventDependence(Command *Cmd, std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr); // Node name is "virtual_node[]" - auto NEvent = + xpti_tracepoint_t *NEvent = xptiCreateTracepoint(NodeName.c_str(), nullptr, 0, 0, MAddress); uint64_t VNodeInstanceNo = xptiGetUniqueId(); xpti_td *NodeEvent = NEvent ? NEvent->event_ref() : nullptr; if (NodeEvent) { // We allow this metadata to be set as the node is a virtual node without - // an actual name + // an actual name. xpti::addMetadata(NodeEvent, "kernel_name", NodeName); xptiNotifySubscribers(MStreamID, xpti::trace_node_create, @@ -685,7 +685,7 @@ void Command::emitEdgeEventForEventDependence(Command *Cmd, } // Create a new event for the edge std::string EdgeName = SH.nameWithAddressString("Event", AddressStr); - auto EEvent = + xpti_tracepoint_t *EEvent = xptiCreateTracepoint(EdgeName.c_str(), nullptr, 0, 0, MAddress); uint64_t EdgeInstanceNo = xptiGetUniqueId(); xpti_td *EdgeEvent = EEvent ? EEvent->event_ref() : nullptr; @@ -696,7 +696,7 @@ void Command::emitEdgeEventForEventDependence(Command *Cmd, EdgeEvent->source_id = NodeEvent->unique_id; EdgeEvent->target_id = TgtEvent->unique_id; // We allow this metadata to be set as an edge without the event address - // will be less useful + // will be less useful. xpti::addMetadata(EdgeEvent, "event", reinterpret_cast(UrEventAddr)); xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, @@ -723,7 +723,7 @@ uint64_t Command::makeTraceEventProlog(void *MAddress) { std::string CommandString = SH.nameWithAddressString(MCommandName, MAddressString); - auto Event = + xpti_tracepoint_t *Event = xptiCreateTracepoint(CommandString.c_str(), nullptr, 0, 0, MAddress); xpti_td *CmdTraceEvent = Event ? Event->event_ref() : nullptr; MInstanceID = xptiGetUniqueId(); @@ -977,14 +977,14 @@ void Command::resolveReleaseDependencies(std::set &DepList) { // properties of the edge xpti::payload_t p(TypeString.c_str(), MAddress); uint64_t EdgeInstanceNo = xptiGetUniqueId(); - auto Event = + xpti_tracepoint_t *Event = xptiCreateTracepoint(TypeString.c_str(), nullptr, 0, 0, MAddress); xpti_td *EdgeEvent = Event ? Event->event_ref() : nullptr; if (EdgeEvent) { xpti_td *SrcTraceEvent = static_cast(Item->MTraceEvent); EdgeEvent->target_id = TgtTraceEvent->unique_id; EdgeEvent->source_id = SrcTraceEvent->unique_id; - // We will ensure this is always added + // We will ensure this is always added. xpti::addMetadata(EdgeEvent, "memory_object", reinterpret_cast(MAddress)); xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, @@ -1053,7 +1053,7 @@ void AllocaCommandBase::emitInstrumentationData() { if (MTraceEvent) { xpti_td *TE = static_cast(MTraceEvent); addDeviceMetadata(TE, MQueue); - // Memory-object is used frequently, so it is always added + // Memory-object is used frequently, so it is always added. xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant @@ -2071,7 +2071,7 @@ void instrumentationFillCommonData(const std::string &KernelName, xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName); } // We limit the metadata to only include the kernel name and device - // information by default + // information by default. if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { if (FromSource.has_value()) { xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value()); @@ -2135,7 +2135,7 @@ std::pair emitKernelInstrumentationData( xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue)); // Add the additional metadata only if the debug information is subscribed - // to; in this case, it is the kernel and its parameters + // to; in this case, it is the kernel and its parameters. if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { instrumentationAddExtraKernelMetadata( CmdTraceEvent, NDRDesc, KernelBundleImplPtr, diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 1a1184d73d222..3cac639d63532 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -40,10 +40,10 @@ constexpr const char *GVerStr = SYCL_VERSION_STR; /// We define all the streams used the instrumentation framework here inline constexpr const char *SYCL_STREAM_NAME = "sycl"; -// We will use "*.debug" stream names as indicators of needing debugging -// information; in this case, the tool will have to subscribe to the *.debug +// We will use "sycl.debug" stream name as an indicator of needing debugging +// information; in this case, the tool will have to subscribe to the sycl.debug // stream to get additional debug metadata, but the metadata will still be sent -// through the regular stream. +// through the regular streams. inline constexpr const char *SYCL_DEBUG_STREAM_NAME = "sycl.debug"; inline constexpr auto SYCL_MEM_ALLOC_STREAM_NAME = "sycl.experimental.mem_alloc"; diff --git a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp index a21ad3d228f95..dfbf206e64187 100644 --- a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp +++ b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp @@ -154,7 +154,7 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, xptiRegisterCallback(StreamID, xpti::trace_task_begin, testCallback); xptiRegisterCallback(StreamID, xpti::trace_task_end, testCallback); // Register at least one callback to the debug stream to enable additional - // metadata that may be used for tests + // metadata that may be used for tests. uint8_t DebugStreamID = xptiRegisterStream("sycl.debug"); xptiRegisterCallback(StreamID, xpti::trace_node_create, testCallback); } diff --git a/unified-runtime/source/adapters/cuda/tracing.cpp b/unified-runtime/source/adapters/cuda/tracing.cpp index 52e075e44ad2d..c1f3b0ecb36de 100644 --- a/unified-runtime/source/adapters/cuda/tracing.cpp +++ b/unified-runtime/source/adapters/cuda/tracing.cpp @@ -107,13 +107,13 @@ static void cuptiCallback(void *UserData, CUpti_CallbackDomain, uint8_t CallStreamID = xptiRegisterStream(CUDA_CALL_STREAM_NAME); uint8_t DebugStreamID = xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); - // Only notify if there are subscribers + // Only notify if there are subscribers. if (xptiCheckTraceEnabled(CallStreamID, TraceType)) { xptiNotifySubscribers(CallStreamID, TraceType, Ctx->CallEvent, nullptr, CallCorrelationID, FuncName); } - // Prepare the payload and notify subscribers if there are subscribers + // Prepare the payload and notify subscribers if there are subscribers. if (xptiCheckTraceEnabled(DebugStreamID, TraceTypeArgs)) { xpti::function_with_args_t Payload{ FuncID, FuncName, const_cast(CBInfo->functionParams), diff --git a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp index cb0f54b4a49ca..8830860a76080 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -42,7 +42,7 @@ static std::shared_ptr xptiContextManagerGet() { } // The Unified Runtime API calls are meant to be performant and creating an -// event for each API Call will add significant overheads +// event for each API Call will add significant overheads. static xpti_td *GURCallEvent = nullptr; static thread_local xpti_td *activeEvent; @@ -55,11 +55,11 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { streamv << STREAM_VER_MAJOR << "." << STREAM_VER_MINOR; xptiInitialize(CALL_STREAM_NAME, STREAM_VER_MAJOR, STREAM_VER_MINOR, streamv.str().data()); - // Create global event for all UR API calls - auto Event = + // Create global event for all UR API calls. + xpti_tracepoint_t *Event = xptiCreateTracepoint("Unified Runtime call", nullptr, 0, 0, (void *)this); // For function_begin/function_end class of notification, the parent and the - // event object can be NULL based on the specification + // event object can be NULL based on the specification. GURCallEvent = Event ? Event->event_ref() : nullptr; } From ff02fceb196f7efc65b9ea76b2306b6d4fd167d6 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Fri, 5 Sep 2025 10:35:42 -0700 Subject: [PATCH 05/13] Fix unused variable error --- .../unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp index dfbf206e64187..7932fc4f59c61 100644 --- a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp +++ b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp @@ -156,7 +156,7 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, // Register at least one callback to the debug stream to enable additional // metadata that may be used for tests. uint8_t DebugStreamID = xptiRegisterStream("sycl.debug"); - xptiRegisterCallback(StreamID, xpti::trace_node_create, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_node_create, testCallback); } XPTI_CALLBACK_API void xptiTraceFinish(const char * /*StreamName*/) {} From e6e88cb4a33bf9a1a5b9248a45de47e6a333fec6 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Fri, 5 Sep 2025 12:38:40 -0700 Subject: [PATCH 06/13] Use kernel name is function name is not available --- sycl/source/detail/scheduler/commands.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2fc7e817066b7..2e8fd11a15adf 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2052,8 +2052,13 @@ void instrumentationFillCommonData(const std::string &KernelName, xpti_tracepoint_t *Event; void *AddressToUse = const_cast(Address); if (HasSourceInfo) { - Event = xptiCreateTracepoint(FuncName.c_str(), FileName.c_str(), Line, - Column, AddressToUse); + if (!FuncName.empty()) { + Event = xptiCreateTracepoint(FuncName.c_str(), FileName.c_str(), Line, + Column, AddressToUse); + } else { + Event = xptiCreateTracepoint(KernelName.c_str(), FileName.c_str(), Line, + Column, AddressToUse); + } } else { Event = xptiCreateTracepoint(KernelName.data(), nullptr, 0, 0, AddressToUse); From a1b2718ae365503555a15008b44acb93f723c24b Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Fri, 5 Sep 2025 12:45:40 -0700 Subject: [PATCH 07/13] Avoid duplicate code --- sycl/source/detail/scheduler/commands.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2e8fd11a15adf..e50c309ded096 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2052,13 +2052,9 @@ void instrumentationFillCommonData(const std::string &KernelName, xpti_tracepoint_t *Event; void *AddressToUse = const_cast(Address); if (HasSourceInfo) { - if (!FuncName.empty()) { - Event = xptiCreateTracepoint(FuncName.c_str(), FileName.c_str(), Line, - Column, AddressToUse); - } else { - Event = xptiCreateTracepoint(KernelName.c_str(), FileName.c_str(), Line, - Column, AddressToUse); - } + const auto &Name = FuncName.empty() ? KernelName : FuncName; + Event = xptiCreateTracepoint(Name.c_str(), FileName.c_str(), Line, Column, + AddressToUse); } else { Event = xptiCreateTracepoint(KernelName.data(), nullptr, 0, 0, AddressToUse); From c0bb2c077895633d6531bf6fbbcdb5240ccadace Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Mon, 8 Sep 2025 15:59:45 -0700 Subject: [PATCH 08/13] Introduce ur.call.debug stream for tracepoints with source info --- sycl/tools/sycl-trace/collector.cpp | 6 ++- .../layers/tracing/ur_tracing_layer.cpp | 53 ++++++++++--------- .../layers/tracing/ur_tracing_layer.hpp | 1 + 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/sycl/tools/sycl-trace/collector.cpp b/sycl/tools/sycl-trace/collector.cpp index 4f3d0b6360e8e..1b69e46e09410 100644 --- a/sycl/tools/sycl-trace/collector.cpp +++ b/sycl/tools/sycl-trace/collector.cpp @@ -180,7 +180,8 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, unsigned int /*minor_version*/, const char * /*version_str*/, const char *StreamName) { - if (std::string_view(StreamName) == "ur.call" && + if ((std::string_view(StreamName) == "ur.call" || + std::string_view(StreamName) == "ur.call.debug") && std::getenv("SYCL_TRACE_UR_ENABLE")) { urPrintersInit(); uint16_t StreamID = xptiRegisterStream(StreamName); @@ -237,7 +238,8 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, } XPTI_CALLBACK_API void xptiTraceFinish(const char *StreamName) { - if (std::string_view(StreamName) == "ur.call" && + if ((std::string_view(StreamName) == "ur.call" || + std::string_view(StreamName) == "ur.call.debug") && std::getenv("SYCL_TRACE_UR_ENABLE")) urPrintersFinish(); #ifdef SYCL_HAS_LEVEL_ZERO diff --git a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp index 8830860a76080..25591caf799b6 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -24,6 +24,7 @@ namespace ur_tracing_layer { context_t *getContext() { return context_t::get_direct(); } constexpr auto CALL_STREAM_NAME = "ur.call"; +constexpr auto DEBUG_CALL_STREAM_NAME = "ur.call.debug"; constexpr auto STREAM_VER_MAJOR = UR_MAJOR_VERSION(UR_API_VERSION_CURRENT); constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); @@ -51,10 +52,13 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { this->xptiContextManager = xptiContextManagerGet(); call_stream_id = xptiRegisterStream(CALL_STREAM_NAME); + debug_call_stream_id = xptiRegisterStream(DEBUG_CALL_STREAM_NAME); std::ostringstream streamv; streamv << STREAM_VER_MAJOR << "." << STREAM_VER_MINOR; xptiInitialize(CALL_STREAM_NAME, STREAM_VER_MAJOR, STREAM_VER_MINOR, streamv.str().data()); + xptiInitialize(DEBUG_CALL_STREAM_NAME, STREAM_VER_MAJOR, STREAM_VER_MINOR, + streamv.str().data()); // Create global event for all UR API calls. xpti_tracepoint_t *Event = xptiCreateTracepoint("Unified Runtime call", nullptr, 0, 0, (void *)this); @@ -66,35 +70,36 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { xpti::function_with_args_t payload{id, name, args, resultp, nullptr}; - // Use global event for all UR API calls - xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, - instance, &payload); + if (xptiCheckTraceEnabled(debug_call_stream_id)) { + xptiNotifySubscribers(debug_call_stream_id, trace_type, nullptr, + activeEvent, instance, &payload); + } else { + // Use global event for all UR API calls + if (xptiCheckTraceEnabled(call_stream_id)) + xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, + instance, &payload); + } } uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { - // we use UINT64_MAX as a special value that means "tracing disabled", - // so that we don't have to repeat this check in notify_end. - if (!xptiCheckTraceEnabled(call_stream_id)) { + if (xptiCheckTraceEnabled(debug_call_stream_id)) { + // Create a new tracepoint with code location info for each UR API call. + // This adds significant overhead to the tracing toolchain, so do this only + // if there are debug stream subscribers. + if (auto loc = codelocData.get_codeloc()) { + xpti_tracepoint_t *Event = xptiCreateTracepoint( + loc->functionName, loc->sourceFile, loc->lineNumber, + loc->columnNumber, (void *)this); + activeEvent = Event ? Event->event_ref() : nullptr; + } + } else if (xptiCheckTraceEnabled(call_stream_id)) { + // Otherwise use global event for all UR API calls. + activeEvent = GURCallEvent; + } else { + // We use UINT64_MAX as a special value that means "tracing disabled", + // so that we don't have to repeat this check in notify_end. return UINT64_MAX; } - - // Previous implementation created a new event for each UR API call. This - // adds significant overhead to the tracing toolchain. Replacing the - // previous code with a single global event for all UR API calls: - // - // PREVIOUS CODE: - // if (auto loc = codelocData.get_codeloc()) { - // xpti::payload_t payload = - // xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, - // loc->columnNumber, nullptr); - // uint64_t InstanceNumber{}; - // activeEvent = - // xptiMakeEvent("Unified Runtime call", &payload, - // xpti::trace_graph_event, - // xpti_at::active, &InstanceNumber); - // } - - activeEvent = GURCallEvent; uint64_t instance = xptiGetUniqueId(); notify((uint16_t)xpti::trace_point_type_t::function_with_args_begin, id, name, args, nullptr, instance); diff --git a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.hpp b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.hpp index f1fcd01cab3ae..35268b03b64fd 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.hpp +++ b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.hpp @@ -48,6 +48,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, void notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance); uint8_t call_stream_id; + uint8_t debug_call_stream_id; inline static const std::string name = "UR_LAYER_TRACING"; From afcc054d7cd7a64ebc4f814b7b32bbcbdae8f730 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Mon, 8 Sep 2025 16:23:27 -0700 Subject: [PATCH 09/13] Minor fix --- .../source/loader/layers/tracing/ur_tracing_layer.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp index 25591caf799b6..3125c3fcfb0d1 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -75,9 +75,8 @@ void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, activeEvent, instance, &payload); } else { // Use global event for all UR API calls - if (xptiCheckTraceEnabled(call_stream_id)) - xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, - instance, &payload); + xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, + instance, &payload); } } From f7644c28f03e1123fee297ef8ac1e2b1b6ed9597 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Tue, 9 Sep 2025 08:47:09 -0700 Subject: [PATCH 10/13] Update documentation accordingly * UR only generates function_with_args_begin/function_with_args_end, so remove function_begin/function_end from documentation. Also function_with_args_begin/function_with_args_end are currently sent to ur.call stream (not ur.call.debug), it seems that implementation has changed at some point but documentation has never been updated. * Add description of newly introduced ur.call.debug. --- sycl/doc/design/SYCLInstrumentationUsingXPTI.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md index aac22261011f9..42126acc10bfe 100644 --- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md +++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md @@ -237,17 +237,16 @@ to get more data when keeping overheads low is not important. ## SYCL Stream `"ur.call"` Notification Signatures -| Trace Point Type | Parameter Description | Metadata | -| :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------- | -| `function_begin` |
  • **trace_type**: `xpti::trace_point_type_t::function_begin` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call` layer.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: Unique ID to allow the correlation of the `function_begin` event with the `function_end` event.
  • **user_data**: Name of the function being called sent in as `const char *`
  • | None | -| `function_end` |
  • **trace_type**: `xpti::trace_point_type_t::function_end` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call` layer.
  • **event**: `nullptr` - since the stream of data just captures functions being called.
  • **instance**: Unique ID to allow the correlation of the `function_begin` event with the `function_end` event. This value is guaranteed to be the same value received by the trace event for the corresponding `function_begin`
  • **user_data**: Name of the function being called sent in as `const char *`
  • | None | +| Trace Point Type | Parameter Description | Metadata | +| :------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------- | +| `function_with_args_begin` |
  • **trace_type**: `xpti::trace_point_type_t::function_with_args_begin` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call` layer.
  • **event**: `nullptr` if code location is not available or event ID with code location data.
  • **instance**: Unique ID to allow the correlation of the `function_with_args_begin` event with the `function_with_args_end` event.
  • **user_data**: A pointer to `function_with_args_t` object, that includes function ID, name, and arguments.
  • | None | +| `function_with_args_end` |
  • **trace_type**: `xpti::trace_point_type_t::function_with_args_end` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call` layer.
  • **event**: `nullptr` if code location is not available or event ID with code location data.
  • **instance**: Unique ID to allow the correlation of the `function_with_args_begin` event with the `function_with_args_end` event. This value is guaranteed to be the same value received by the trace event for the corresponding `function_with_args_begin`
  • **user_data**: A pointer to `function_with_args_t` object, that includes function ID, name, arguments, and return value.
  • | None | ## SYCL Stream `"ur.call.debug"` Notification Signatures -| Trace Point Type | Parameter Description | Metadata | -| :------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------- | -| `function_with_args_begin` |
  • **trace_type**: `xpti::trace_point_type_t::function_with_args_begin` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call.debug` layer.
  • **event**: `nullptr` if code location is not available or event ID with code location data.
  • **instance**: Unique ID to allow the correlation of the `function_with_args_begin` event with the `function_with_args_end` event.
  • **user_data**: A pointer to `function_with_args_t` object, that includes function ID, name, and arguments.
  • | None | -| `function_with_args_end` |
  • **trace_type**: `xpti::trace_point_type_t::function_with_args_end` that marks the beginning of a function
  • **parent**: Event ID created for all functions in the `ur.call.debug` layer.
  • **event**: `nullptr` if code location is not available or event ID with code location data.
  • **instance**: Unique ID to allow the correlation of the `function_with_args_begin` event with the `function_with_args_end` event. This value is guaranteed to be the same value received by the trace event for the corresponding `function_with_args_begin`
  • **user_data**: A pointer to `function_with_args_t` object, that includes function ID, name, arguments, and return value.
  • | None | +The `"ur.call.debug"` stream emits the same notifications as the `"ur.call"` stream, but with additional metadata describing the source code location of each traced function call. This enables tools to correlate traced events with their origin in the application's source code for enhanced debugging and analysis. + +If a tool subscribes to both `"ur.call"` and `"ur.call.debug"`, only notifications from `"ur.call.debug"` will be delivered to avoid duplication. ## SYCL Stream `"sycl"` Notification Signatures From 9cf847639d83ff20ef724081dccaaac34d381c01 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 15 Sep 2025 21:38:34 -0700 Subject: [PATCH 11/13] Make sycl.debug a normal stream --- sycl/source/detail/event_impl.cpp | 5 ++-- sycl/source/detail/global_handler.cpp | 2 +- sycl/source/detail/graph/graph_impl.cpp | 17 ++++++------ sycl/source/detail/queue_impl.cpp | 32 ++++++++++++----------- sycl/source/detail/scheduler/commands.cpp | 30 ++++++++++----------- sycl/source/detail/xpti_registry.hpp | 21 +++++++++++++-- sycl/source/handler.cpp | 17 ++++++------ 7 files changed, 70 insertions(+), 54 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 0694664c88fda..b0c838cdd890c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -288,7 +288,8 @@ void event_impl::wait(bool *Success) { void *TelemetryEvent = nullptr; uint64_t IId = 0; std::string Name; - TelemetryEvent = instrumentationProlog(Name, GSYCLStreamID, IId); + auto StreamID = detail::getActiveXPTIStreamID(); + TelemetryEvent = instrumentationProlog(Name, StreamID, IId); #endif auto EventHandle = getHandle(); @@ -300,7 +301,7 @@ void event_impl::wait(bool *Success) { detail::Scheduler::getInstance().waitForEvent(*this, Success); #ifdef XPTI_ENABLE_INSTRUMENTATION - instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId); + instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif } diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 97f37a98c30d3..eb7d11d3b29d4 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -96,7 +96,7 @@ void GlobalHandler::TraceEventXPTI(const char *Message) { CodeLocation.fileName(), CodeLocation.functionName(), CodeLocation.lineNumber(), CodeLocation.columnNumber(), nullptr); // Notify the subscriber with a diagnostic message when an exception occurs. - TP.stream(detail::GSYCLStreamID) + TP.stream(detail::getActiveXPTIStreamID()) .traceType(xpti::trace_point_type_t::diagnostics) .parentEvent(GSYCLCallEvent) .notify(static_cast(Message)); diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 70b96eaa8660e..4c94f7d5c5e3e 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -731,6 +731,7 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect( const bool xptiEnabled = xptiTraceEnabled(); xpti_td *CmdTraceEvent = nullptr; uint64_t InstanceID = 0; + auto StreamID = detail::getActiveXPTIStreamID(); if (xptiEnabled) { sycl::detail::CGExecKernel *CGExec = static_cast(Node.MCommandGroup.get()); @@ -738,13 +739,12 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect( CGExec->MFunctionName.c_str(), CGExec->MLine, CGExec->MColumn); std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( - sycl::detail::GSYCLStreamID, CGExec->MSyclKernel, CodeLoc, - CGExec->MIsTopCodeLoc, CGExec->MDeviceKernelInfo, nullptr, - CGExec->MNDRDesc, CGExec->MKernelBundle.get(), CGExec->MArgs); + StreamID, CGExec->MSyclKernel, CodeLoc, CGExec->MIsTopCodeLoc, + CGExec->MDeviceKernelInfo, nullptr, CGExec->MNDRDesc, + CGExec->MKernelBundle.get(), CGExec->MArgs); if (CmdTraceEvent) - sycl::detail::emitInstrumentationGeneral(sycl::detail::GSYCLStreamID, - InstanceID, CmdTraceEvent, - xpti::trace_task_begin, nullptr); + sycl::detail::emitInstrumentationGeneral( + StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); } #endif @@ -764,9 +764,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect( #ifdef XPTI_ENABLE_INSTRUMENTATION if (xptiEnabled && CmdTraceEvent) - sycl::detail::emitInstrumentationGeneral(sycl::detail::GSYCLStreamID, - InstanceID, CmdTraceEvent, - xpti::trace_task_end, nullptr); + sycl::detail::emitInstrumentationGeneral( + StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_end, nullptr); #endif return NewSyncPoint; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 27ddf050d6b0f..52ee77d251eb2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -167,7 +167,7 @@ event queue_impl::memset(void *Ptr, int Value, size_t Count, xpti::framework::tracepoint_scope_t TP( CodeLocation.fileName(), FuncName, CodeLocation.lineNumber(), CodeLocation.columnNumber(), (void *)this); - TP.stream(detail::GSYCLStreamID) + TP.stream(detail::getActiveXPTIStreamID()) .traceType(xpti::trace_point_type_t::node_create) .parentEvent(detail::GSYCLGraphEvent); @@ -221,7 +221,7 @@ event queue_impl::memcpy(void *Dest, const void *Src, size_t Count, xpti::framework::tracepoint_scope_t TP( CodeLoc.fileName(), CodeLoc.functionName(), CodeLoc.lineNumber(), CodeLoc.columnNumber(), (void *)this); - TP.stream(detail::GSYCLStreamID) + TP.stream(detail::getActiveXPTIStreamID()) .traceType(xpti::trace_point_type_t::node_create) .parentEvent(GSYCLGraphEvent); const char *UserData = "memory_transfer_node::memcpy"; @@ -540,10 +540,10 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, auto WaitEvent = Event->event_ref(); // We will allow the device type to be set xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); - // We limit the amount of metadata that is added to streams if the - // "sycl.debug" stream is not subscribed to. This improves the performance - // when this data is not required by the tool or the collector. - if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { + // We limit the amount of metadata that is added to the regular stream. + // Only "sycl.debug" stream will have the full information. This improves the + // performance when this data is not required by the tool or the collector. + if (isDebugStream(StreamID)) { if (HasSourceInfo) { xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); @@ -580,11 +580,11 @@ void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name, void queue_impl::wait(const detail::code_location &CodeLoc) { (void)CodeLoc; #ifdef XPTI_ENABLE_INSTRUMENTATION - // xptiCheckTraceEnabled() is being performed in instrumentationProlog(). void *TelemetryEvent = nullptr; uint64_t IId; std::string Name; - TelemetryEvent = instrumentationProlog(CodeLoc, Name, GSYCLStreamID, IId); + auto StreamID = detail::getActiveXPTIStreamID(); + TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif if (!MGraph.expired()) { @@ -666,7 +666,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { #ifdef XPTI_ENABLE_INSTRUMENTATION // There is an early return in instrumentationEpilog() if no subscribers are // subscribing to queue.wait(). - instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId); + instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif } @@ -675,7 +675,7 @@ void queue_impl::constructorNotification() { // If there are no subscribers to queue_create, return immediately. constexpr uint16_t NotificationTraceType = static_cast(xpti::trace_point_type_t::queue_create); - if (!xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) + if (!anyTraceEnabled(NotificationTraceType)) return; // We do not have CodeLoc for the queue constructor, so we will have to create // a queue name with the queue ID to create an event; this step can be avoided @@ -704,9 +704,10 @@ void queue_impl::constructorNotification() { reinterpret_cast(getHandleRef())); // Also publish to TLS before notification. xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); - xptiNotifySubscribers( - detail::GSYCLStreamID, (uint16_t)xpti::trace_point_type_t::queue_create, - nullptr, TEvent, MInstanceID, static_cast("queue_create")); + xptiNotifySubscribers(detail::getActiveXPTIStreamID(), + (uint16_t)xpti::trace_point_type_t::queue_create, + nullptr, TEvent, MInstanceID, + static_cast("queue_create")); #endif } @@ -714,10 +715,11 @@ void queue_impl::destructorNotification() { #if XPTI_ENABLE_INSTRUMENTATION constexpr uint16_t NotificationTraceType = static_cast(xpti::trace_point_type_t::queue_destroy); - if (xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) { + if (anyTraceEnabled(NotificationTraceType)) { // Use the cached trace event, stream id and instance IDs for the // destructor - xptiNotifySubscribers(detail::GSYCLStreamID, NotificationTraceType, nullptr, + xptiNotifySubscribers(detail::getActiveXPTIStreamID(), + NotificationTraceType, nullptr, (xpti::trace_event_data_t *)MTraceEvent, MInstanceID, static_cast("queue_destroy")); xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 7ba97854839b5..e2e62cfcd531b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -575,7 +575,7 @@ Command::Command( return; // Obtain the stream ID so all commands can emit traces to that stream; // copying it to the member variable to avoid ABI breakage - MStreamID = detail::GSYCLStreamID; + MStreamID = getActiveXPTIStreamID(); #endif } @@ -2037,14 +2037,12 @@ void instrumentationAddExtraKernelMetadata( } } -void instrumentationFillCommonData(const std::string &KernelName, - const std::string &FuncName, - const std::string &FileName, uint64_t Line, - uint64_t Column, const void *const Address, - queue_impl *Queue, - std::optional &FromSource, - uint64_t &OutInstanceID, - xpti_td *&OutTraceEvent) { +void instrumentationFillCommonData( + xpti::stream_id_t StreamID, const std::string &KernelName, + const std::string &FuncName, const std::string &FileName, uint64_t Line, + uint64_t Column, const void *const Address, queue_impl *Queue, + std::optional &FromSource, uint64_t &OutInstanceID, + xpti_td *&OutTraceEvent) { // Get source file, line number information from the CommandGroup object // and create payload using name, address, and source info bool HasSourceInfo = !FileName.empty(); @@ -2072,7 +2070,7 @@ void instrumentationFillCommonData(const std::string &KernelName, } // We limit the metadata to only include the kernel name and device // information by default. - if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { + if (detail::isDebugStream(StreamID)) { if (FromSource.has_value()) { xpti::addMetadata(CmdTraceEvent, "from_source", FromSource.value()); } @@ -2122,7 +2120,7 @@ std::pair emitKernelInstrumentationData( ? CodeLoc.functionName() : std::string(); - instrumentationFillCommonData(KernelName, FuncName, FileName, + instrumentationFillCommonData(StreamID, KernelName, FuncName, FileName, CodeLoc.lineNumber(), CodeLoc.columnNumber(), Address, Queue, FromSource, InstanceID, CmdTraceEvent); @@ -2136,7 +2134,7 @@ std::pair emitKernelInstrumentationData( getQueueID(Queue)); // Add the additional metadata only if the debug information is subscribed // to; in this case, it is the kernel and its parameters. - if (xptiCheckTraceEnabled(detail::GSYCLDebugStreamID)) { + if (detail::isDebugStream(StreamID)) { instrumentationAddExtraKernelMetadata( CmdTraceEvent, NDRDesc, KernelBundleImplPtr, DeviceKernelInfo, SyclKernel, Queue, CGArgs); @@ -2180,10 +2178,10 @@ void ExecCGCommand::emitInstrumentationData() { FuncName = MCommandGroup->MFunctionName; xpti_td *CmdTraceEvent = nullptr; - instrumentationFillCommonData(KernelName, FuncName, MCommandGroup->MFileName, - MCommandGroup->MLine, MCommandGroup->MColumn, - MAddress, MQueue.get(), FromSource, MInstanceID, - CmdTraceEvent); + instrumentationFillCommonData(MStreamID, KernelName, FuncName, + MCommandGroup->MFileName, MCommandGroup->MLine, + MCommandGroup->MColumn, MAddress, MQueue.get(), + FromSource, MInstanceID, CmdTraceEvent); if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 3cac639d63532..9598eefab7b27 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -68,6 +68,22 @@ extern xpti::trace_event_data_t *GApiEvent; // We will pick a global constant so that the pointer in TLS never goes stale inline constexpr auto XPTI_QUEUE_INSTANCE_ID_KEY = "queue_id"; + +// Helper to check if xpti stream is debug. +inline bool isDebugStream(xpti::stream_id_t StreamID) { + return StreamID == detail::GSYCLDebugStreamID; +} + +inline uint8_t getActiveXPTIStreamID() { + return xptiCheckTraceEnabled(detail::GSYCLDebugStreamID) + ? detail::GSYCLDebugStreamID + : detail::GSYCLStreamID; +} + +inline bool anyTraceEnabled(uint16_t TraceType) { + return xptiCheckTraceEnabled(detail::GSYCLDebugStreamID, TraceType) || + xptiCheckTraceEnabled(detail::GSYCLStreamID, TraceType); +} #endif class XPTIRegistry { @@ -108,8 +124,9 @@ class XPTIRegistry { if (detail::GSYCLGraphEvent) { // The graph event is a global event and will be used as the parent for // all nodes (command groups, memory allocations, etc) - xptiNotifySubscribers(detail::GSYCLStreamID, xpti::trace_graph_create, - nullptr, detail::GSYCLGraphEvent, + xptiNotifySubscribers(detail::getActiveXPTIStreamID(), + xpti::trace_graph_create, nullptr, + detail::GSYCLGraphEvent, detail::GSYCLGraphEvent->instance_id, nullptr); } auto MemAllocEventTP = diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 71aa92124f1c8..25de5d1ca2b30 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -662,20 +662,20 @@ event handler::finalize() { ? nullptr : detail::event_impl::create_device_event(impl->get_queue()); -#ifdef XPTI_ENABLE_INSTRUMENTATION - // Only enable instrumentation if there are subscribes to the SYCL stream - const bool xptiEnabled = xptiCheckTraceEnabled(detail::GSYCLStreamID); -#endif auto EnqueueKernel = [&]() { #ifdef XPTI_ENABLE_INSTRUMENTATION xpti_td *CmdTraceEvent = nullptr; uint64_t InstanceID = 0; + auto StreamID = detail::getActiveXPTIStreamID(); + // Only enable instrumentation if there are subscribes to the SYCL + // stream + const bool xptiEnabled = xptiCheckTraceEnabled(StreamID); if (xptiEnabled) { std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( - detail::GSYCLStreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, + StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, *impl->MDeviceKernelInfoPtr, impl->get_queue_or_null(), impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs); - detail::emitInstrumentationGeneral(detail::GSYCLStreamID, InstanceID, + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); } @@ -697,11 +697,10 @@ event handler::finalize() { // Emit signal only when event is created if (!DiscardEvent) { detail::emitInstrumentationGeneral( - detail::GSYCLStreamID, InstanceID, CmdTraceEvent, - xpti::trace_signal, + StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, static_cast(ResultEvent->getHandle())); } - detail::emitInstrumentationGeneral(detail::GSYCLStreamID, InstanceID, + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_end, nullptr); } From a4de171b52e099c211480cc85c51687d8a442378 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 15 Sep 2025 23:41:43 -0700 Subject: [PATCH 12/13] Update documentation --- .../design/SYCLInstrumentationUsingXPTI.md | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md index 42126acc10bfe..d0c7c3e59869a 100644 --- a/sycl/doc/design/SYCLInstrumentationUsingXPTI.md +++ b/sycl/doc/design/SYCLInstrumentationUsingXPTI.md @@ -221,20 +221,9 @@ trace point that includes an event, a trace point type and a notification. can attached a per-instance user data during this notification call that *must* be guaranteed to be valid for the duration of the notification call. -- To support performance and debug streams, subscribing to the stream **"sycl.debug"** - allows the default streams to contain additional metadata when keeping overheads - to a minimum is not important - This document will outline the protocol for the streams of data being generated by the SYCL runtime. -## SYCL Stream `"sycl.debug"` Notification Signatures - -The "sycl.debug" stream is a dummy stream, when subscribed to, indicates to the SYCL -runtime that additional metadata can be propagated for each SYCL event. Many toolchains -like to keep the overheads low when subscribing to the data and this provides a mechanism -to get more data when keeping overheads low is not important. - ## SYCL Stream `"ur.call"` Notification Signatures | Trace Point Type | Parameter Description | Metadata | @@ -248,22 +237,26 @@ The `"ur.call.debug"` stream emits the same notifications as the `"ur.call"` str If a tool subscribes to both `"ur.call"` and `"ur.call.debug"`, only notifications from `"ur.call.debug"` will be delivered to avoid duplication. -## SYCL Stream `"sycl"` Notification Signatures +## SYCL Stream `"sycl"` and `"sycl.debug"` Notification Signatures All trace point types in bold provide semantic information about the graph, nodes and edges and the topology of the asynchronous task graphs created by the runtime. + +The `"sycl.debug"` stream emits the same notifications as the `"sycl"` stream, but with additional metadata. If toolchains want to keep the overhead low then subscribing to `"sycl"` stream is the right option, if toolchains want to get more data and keeping overheads low is not important then they should subscribe to `"sycl.debug"`. + +If a tool subscribes to both `"sycl"` and `"sycl.debug"`, only notifications from `"sycl.debug"` will be delivered to avoid duplication. | Trace Point Type | Parameter Description | Metadata | | :----------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | **`graph_create`** |
  • **trace_type**: `xpti::trace_point_type_t::graph_create` that marks the creation of an asynchronous graph.
  • **parent**: `nullptr`
  • **event**: The global asynchronous graph object ID. All other graph related events such as node and edge creation will always this ID as the parent ID.
  • **instance**: Unique ID related to the event, but not a correlation ID as there are other events to correlate to.
  • **user_data**: `nullptr`
  • SYCL runtime will always have one instance of a graph object with many disjoint subgraphs that get created during the execution of an application.
    | None | -| **`node_create`** |
  • **trace_type**: `xpti::trace_point_type_t::node_create` that marks the creation of a node in the graph, which could be a computational kernel or memory operation.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The unique ID that identifies the data parallel compute operation or memory operation.
  • **instance**: Unique ID related to the event, but not a correlation ID as there are other events to correlate to.
  • **user_data**: Command type that has been submitted through the command group handler, which could be one of: `command_group_node`, `memory_transfer_node`, `memory_allocation_node`, `sub_buffer_creation_node`, `memory_deallocation_node`, `host_acc_create_buffer_lock_node`, `host_acc_destroy_buffer_release_node` combined with the address of the command group object and represented as a string [`const char *`]
  • SYCL runtime will always have one instance of a graph object with many disjoint subgraphs that get created during the execution of an application.
    |
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`, `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`. The per-queue unique ID can be obtained by using `xptiGetStashedTuple` API call. See `queue_create` documentation for usage information.
  • Memory operations
  • `memory_object`, `offset`, `access_range`, `allocation_type`, `copy_from`, `copy_to`,`device_id`, `device_name`, `memory_size`, `src_memory_ptr`, `dest_memory_ptr`, `memory_ptr`, `value_set`. The per-queue unique ID can be obtained by using `xptiGetSTashedTuple` API call. See `queue_create` documentation for usage information. | +| **`node_create`** |
  • **trace_type**: `xpti::trace_point_type_t::node_create` that marks the creation of a node in the graph, which could be a computational kernel or memory operation.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The unique ID that identifies the data parallel compute operation or memory operation.
  • **instance**: Unique ID related to the event, but not a correlation ID as there are other events to correlate to.
  • **user_data**: Command type that has been submitted through the command group handler, which could be one of: `command_group_node`, `memory_transfer_node`, `memory_allocation_node`, `sub_buffer_creation_node`, `memory_deallocation_node`, `host_acc_create_buffer_lock_node`, `host_acc_destroy_buffer_release_node` combined with the address of the command group object and represented as a string [`const char *`]
  • SYCL runtime will always have one instance of a graph object with many disjoint subgraphs that get created during the execution of an application.
    | "sycl" stream:
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`. The per-queue unique ID can be obtained by using `xptiGetStashedTuple` API call. See `queue_create` documentation for usage information.
  • Memory operations
  • `memory_object`, `offset`, `access_range`, `allocation_type`, `copy_from`, `copy_to`,`device_id`, `device_name`, `memory_size`, `src_memory_ptr`, `dest_memory_ptr`, `memory_ptr`, `value_set`. The per-queue unique ID can be obtained by using `xptiGetSTashedTuple` API call. See `queue_create` documentation for usage information.

    Additional metadata on "sycl.debug" stream:
  • Computational Kernels
  • `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`. | | **`edge_create`** |
  • **trace_type**: `xpti::trace_point_type_t::graph_create` that marks the creation of an asynchronous graph.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The unique ID that identifies the dependence relationship between two operations.
  • **instance**: Unique ID related to the event, but not a correlation ID as there are other events to correlate to.
  • **user_data**: `nullptr`
  • Edges capture dependence relationships between computations or computations and memory operations.
    | `access_mode`, `memory_object`, `event` | | `task_begin` |
  • **trace_type**: `xpti::trace_point_type_t::task_begin` that marks the beginning of a task belonging to one of the nodes in the graph. When the trace event is for a kernel executing on a device other than the the CPU, this `task_begin` and corresponding `task_end` mark the submit call. To track the execution of the kernel on the device, the `trace_signal` event must be monitored to get the kernel event handle from which the execution statistics can be gathered.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the computation or memory operation kernel, which would be one of the nodes in the graph.
  • **instance**: Instance ID for the task that can be used to correlate it with the corresponding `task_end` trace event.
  • **user_data**: `nullptr`
  • | Same metadata defined for the node the trace task belongs to. | | `task_end` |
  • **trace_type**: `xpti::trace_point_type_t::task_end` that marks the end of a task belonging to one of the nodes in the graph. The specific task instance can be tacked through the instance ID parameter which helps correlate the `task_end` with the corresponding `task_begin`.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the computation or memory operation kernel, which would be one of the nodes in the graph.
  • **instance**: Instance ID for the task that can be used to correlate it with the corresponding `task_begin` trace event.
  • **user_data**: `nullptr`
  • | Same metadata defined for the node the trace task belongs to. | | `signal` |
  • **trace_type**: `xpti::trace_point_type_t::signal` that marks the an event that contains the `event` handle of an executing kernel on a device.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the computation or memory operation kernel, which would be one of the nodes in the graph.
  • **instance**: Instance ID for the task for which the signal has been generated.
  • **user_data**: Address of the kernel event that is returned by the device so the progress of the execution can be tracked.
  • | Same metadata defined for the node the trace task belongs to. | -| `wait_begin` |
  • **trace_type**: `xpti::trace_point_type_t::wait_begin` that marks the beginning of the wait on an `event`
  • **parent**: `nullptr`
  • **event**: The event ID will reflect the ID of the command group object submission that created this event, the queue or a new event based on the combination of the string "queue.wait" and the address of the event.
  • **instance**: Unique ID to allow the correlation of the `wait_begin` event with the `wait_end` event.
  • **user_data**: String indicating `queue.wait` and the address of the event sent in as `const char *`
  • Tracing the `queue.wait()` or `queue.wait_and_throw()` will capture the waiting on the action represented by the event object, which could be the execution of a kernel, completion of a memory operation, etc that is embedded in the command group handler. All wait events contain metadata that indicates the SYCL device on which the corresponding operation has been submitted. If the event is from a command group handler, then the source location information is available as well.
    | `sycl_device`, `sycl_device_type`, `sycl_device_name`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | -| `wait_end` |
  • **trace_type**: `xpti::trace_point_type_t::wait_end` that marks the beginning of the wait on an `event`
  • **parent**: `nullptr`
  • **event**: The event ID will reflect the ID of the command group object submission that created this event, the queue or a new event based on the combination of the string "queue.wait" and the address of the event.
  • **instance**: Unique ID to allow the correlation of the `wait_begin` event with the `wait_end` event.
  • **user_data**: String indicating `queue.wait` and the address of the event as `const char *`
  • | `sycl_device`, `sycl_device_type`, `sycl_device_name`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | -| `barrier_begin` |
  • **trace_type**: `xpti::trace_point_type_t::barrier_begin` that marks the beginning of a barrier while enqueuing a command group object
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the command group object that has encountered a barrier during the enqueue operation.
  • **instance**: Unique ID to allow the correlation of the `barrier_begin` event with the `barrier_end` event.
  • **user_data**: String indicating `enqueue.barrier` and the reason for the barrier as a `const char *`
  • The reason for the barrier could be one of `Buffer locked by host accessor`, `Blocked by host task` or `Unknown reason`.
    |
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`, `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no`
  • Memory operations
  • `memory_object`, `offset`, `access_range_start`, `access_range_end`, `allocation_type`, `copy_from`, `copy_to` | -| `barrier_end` |
  • **trace_type**: `xpti::trace_point_type_t::barrier_end` that marks the end of the barrier that is encountered during enqueue.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the command group object that has encountered a barrier during the enqueue operation.
  • **instance**: Unique ID to allow the correlation of the `barrier_begin` event with the `barrier_end` event.
  • **user_data**: String indicating `enqueue.barrier` and the reason for the barrier as a `const char *`
  • The reason for the barrier could be one of `Buffer locked by host accessor`, `Blocked by host task` or `Unknown reason`.
    |
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`, `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no`
  • Memory operations
  • `memory_object`, `offset`, `access_range_start`, `access_range_end`, `allocation_type`, `copy_from`, `copy_to` | -| `diagnostics` |
  • **trace_type**: `xpti::trace_point_type_t::diagnostics` that represents general purpose notifications. For example, it is emitted when an exception is thrown in SYCL runtime.
  • **parent**: Set to NULL.
  • **event**: The event ID will reflect the code location of notification origin, if available.
  • **instance**: An instance ID that records the number of times this code location has been seen.
  • **user_data**: String with diagnostic message as a `const char *`
  • | `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | +| `wait_begin` |
  • **trace_type**: `xpti::trace_point_type_t::wait_begin` that marks the beginning of the wait on an `event`
  • **parent**: `nullptr`
  • **event**: The event ID will reflect the ID of the command group object submission that created this event, the queue or a new event based on the combination of the string "queue.wait" and the address of the event.
  • **instance**: Unique ID to allow the correlation of the `wait_begin` event with the `wait_end` event.
  • **user_data**: String indicating `queue.wait` and the address of the event sent in as `const char *`
  • Tracing the `queue.wait()` or `queue.wait_and_throw()` will capture the waiting on the action represented by the event object, which could be the execution of a kernel, completion of a memory operation, etc that is embedded in the command group handler. All wait events contain metadata that indicates the SYCL device on which the corresponding operation has been submitted. If the event is from a command group handler, then the source location information is available as well.
    | "sycl.stream": `sycl_device`, `sycl_device_type`, `sycl_device_name`

    Additional metadata on "sycl.debug" stream: `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | +| `wait_end` |
  • **trace_type**: `xpti::trace_point_type_t::wait_end` that marks the beginning of the wait on an `event`
  • **parent**: `nullptr`
  • **event**: The event ID will reflect the ID of the command group object submission that created this event, the queue or a new event based on the combination of the string "queue.wait" and the address of the event.
  • **instance**: Unique ID to allow the correlation of the `wait_begin` event with the `wait_end` event.
  • **user_data**: String indicating `queue.wait` and the address of the event as `const char *`
  • | "sycl" stream: `sycl_device`, `sycl_device_type`, `sycl_device_name`

    Additional metadata on "sycl.debug" stream: `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | +| `barrier_begin` |
  • **trace_type**: `xpti::trace_point_type_t::barrier_begin` that marks the beginning of a barrier while enqueuing a command group object
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the command group object that has encountered a barrier during the enqueue operation.
  • **instance**: Unique ID to allow the correlation of the `barrier_begin` event with the `barrier_end` event.
  • **user_data**: String indicating `enqueue.barrier` and the reason for the barrier as a `const char *`
  • The reason for the barrier could be one of `Buffer locked by host accessor`, `Blocked by host task` or `Unknown reason`.
    | "sycl" stream:
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`
  • Memory operations
  • `memory_object`, `offset`, `access_range_start`, `access_range_end`, `allocation_type`, `copy_from`, `copy_to`

    Additional metadata on "sycl.debug" stream:
  • Computational Kernels
  • `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | +| `barrier_end` |
  • **trace_type**: `xpti::trace_point_type_t::barrier_end` that marks the end of the barrier that is encountered during enqueue.
  • **parent**: The global graph event that is created during the `graph_create` event.
  • **event**: The event ID will reflect the ID of the command group object that has encountered a barrier during the enqueue operation.
  • **instance**: Unique ID to allow the correlation of the `barrier_begin` event with the `barrier_end` event.
  • **user_data**: String indicating `enqueue.barrier` and the reason for the barrier as a `const char *`
  • The reason for the barrier could be one of `Buffer locked by host accessor`, `Blocked by host task` or `Unknown reason`.
    | "sycl" stream:
  • Computational Kernels
  • `sycl_device`, `sycl_device_type`, `sycl_device_name`, `kernel_name`
  • Memory operations
  • `memory_object`, `offset`, `access_range_start`, `access_range_end`, `allocation_type`, `copy_from`, `copy_to`

    Additional metadata on "sycl.debug" stream:
  • Computational Kernels
  • `from_source`, `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | +| `diagnostics` |
  • **trace_type**: `xpti::trace_point_type_t::diagnostics` that represents general purpose notifications. For example, it is emitted when an exception is thrown in SYCL runtime.
  • **parent**: Set to NULL.
  • **event**: The event ID will reflect the code location of notification origin, if available.
  • **instance**: An instance ID that records the number of times this code location has been seen.
  • **user_data**: String with diagnostic message as a `const char *`
  • | On "sycl.debug" stream: `sym_function_name`, `sym_source_file_name`, `sym_line_no`, `sym_column_no` | | `queue_create` |
  • **trace_type**: `xpti::trace_point_type_t::queue_create` that marks the creation of a queue, which could be a device or host queue.
  • **parent**: Set to NULL.
  • **event**: The event ID will reflect the code location of notification origin, if available.
  • **instance**: Will contain the instance ID of the queue, which is a per-queue unique identifier. For example, if the queue is created in a loop, the **event** will be the same as it happens at the same code location, but the **instance** will help differentiate between the different queues being created and used.
  • **user_data**: Not meaningful for this trace type. Could contain string with 'queue_create' or nullptr.
  • This signal is emitted only once for every queue object, notifies about successful queue creation (the signal is not emitted if any exception happens during queue creation).
    | `sycl_context`, `sycl_device_name`, `sycl_device`, `is_inorder`, `queue_handle`

    `queue_id` field has been deprecated and replaced with the **instance** information and supporting XPTI API calls (`xptiGetStashedTuple`). Using the **instance** information is the recommended approach.

    `char *key = 0;`

    `uint64_t value;`

    `if (xptiGetStashedTuple(&key, value) ==xpti::result_t::XPTI_RESULT_SUCCESS) {`

    `// key will contain "queue_id"`

    `// value will contain the per-queue unique ID`

    `}`

    `queue_handle` is absent for host queue since no backend object is used.

    | | `queue_destroy` |
  • **trace_type**: `xpti::trace_point_type_t::queue_destroy` that marks the destruction of a queue, which could be a device or host queue.
  • **parent**: Set to NULL.
  • **event**: The event ID will reflect the code location of notification origin, if available.
  • **instance**: Will contain the instance ID of the queue, which is a per-queue unique identifier.
  • **user_data**: Not meaningful for this trace type. Could contain string with 'queue_destroy' or nullptr.
  • This signal is emitted only once for every queue object, notifies about queue destruction. Contains the same metadata set for corresponding 'queue_create' signal. **event** and corresponding metadata will be destroyed right after notification.
    | `sycl_context`, `sycl_device_name`, `sycl_device`, `is_inorder`, `queue_id`, `queue_handle`

    `queue_id` field has been deprecated and replaced with the **instance** information and supporting XPTI API calls (`xptiGetStashedTuple`). Using the **instance** information is the recommended approach. `queue_handle` is absent for host queue since no backend object is used. | From f4b11b847aa88e4a06dc7796a6fcc79a0c0c7a96 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 16 Sep 2025 09:02:33 -0700 Subject: [PATCH 13/13] Register all callbacks for debug stream as it is a normal stream now --- .../xpti_trace/xptitest_subscriber/XPTISubscriber.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp index 7932fc4f59c61..952c5124d144a 100644 --- a/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp +++ b/sycl/unittests/xpti_trace/xptitest_subscriber/XPTISubscriber.cpp @@ -153,10 +153,17 @@ XPTI_CALLBACK_API void xptiTraceInit(unsigned int /*major_version*/, xptiRegisterCallback(StreamID, xpti::trace_queue_destroy, testCallback); xptiRegisterCallback(StreamID, xpti::trace_task_begin, testCallback); xptiRegisterCallback(StreamID, xpti::trace_task_end, testCallback); - // Register at least one callback to the debug stream to enable additional + // Register callbacks for the debug stream to enable additional // metadata that may be used for tests. uint8_t DebugStreamID = xptiRegisterStream("sycl.debug"); + xptiRegisterCallback(DebugStreamID, xpti::trace_diagnostics, testCallback); xptiRegisterCallback(DebugStreamID, xpti::trace_node_create, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_task_begin, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_task_end, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_queue_create, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_queue_destroy, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_task_begin, testCallback); + xptiRegisterCallback(DebugStreamID, xpti::trace_task_end, testCallback); } XPTI_CALLBACK_API void xptiTraceFinish(const char * /*StreamName*/) {}