Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 14 additions & 15 deletions libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ void XpuptiActivityProfilerSession::handleRuntimeActivity(
traceBuffer_.emplace_activity(
traceBuffer_.span, ActivityType::XPU_RUNTIME, std::string(activity->_name));
auto& runtime_activity = traceBuffer_.activities.back();
runtime_activity->startTime = activity->_start_timestamp;
runtime_activity->endTime = activity->_end_timestamp;
runtime_activity->startTime = convertTimeStampValue(activity->_start_timestamp);
runtime_activity->endTime = convertTimeStampValue(activity->_end_timestamp);
runtime_activity->id = activity->_correlation_id;
runtime_activity->device = activity->_process_id;
runtime_activity->resource = activity->_thread_id;
Expand Down Expand Up @@ -142,8 +142,8 @@ void XpuptiActivityProfilerSession::handleKernelActivity(
ActivityType::CONCURRENT_KERNEL,
std::string(activity->_name));
auto& kernel_activity = traceBuffer_.activities.back();
kernel_activity->startTime = activity->_start_timestamp;
kernel_activity->endTime = activity->_end_timestamp;
kernel_activity->startTime = convertTimeStampValue(activity->_start_timestamp);
kernel_activity->endTime = convertTimeStampValue(activity->_end_timestamp);
kernel_activity->id = activity->_correlation_id;
kernel_activity->device = getDeviceIdxFromUUID(activity->_device_uuid);
kernel_activity->resource = getMappedQueueId(activity->_sycl_queue_id);
Expand Down Expand Up @@ -185,11 +185,10 @@ inline std::string memcpyName(
ptiViewMemoryTypeToString(dst));
}

template <class pti_view_memory_record_type>
inline std::string bandwidth(pti_view_memory_record_type* activity) {
auto duration = activity->_end_timestamp - activity->_start_timestamp;
auto bytes = activity->_bytes;
return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration);
// Builds the text stored under the "memory bandwidth (GB/s)" metadata key.
//
// bytes:    byte count of the transfer; divided by 1024 three times.
// duration: elapsed time of the transfer; divided by 1000 three times
//           (presumably nanoseconds -> seconds; confirm against callers).
//
// Returns the placeholder "N/A" (including the surrounding double quotes)
// when duration is zero, otherwise the ratio rendered with fmt's default
// "{}" formatting.
inline std::string bandwidth(uint64_t bytes, uint64_t duration) {
  // No rate can be computed for a zero-length interval.
  if (duration == 0) {
    return "\"N/A\"";
  }

  // Keep the step-by-step divisions so the floating-point result is unchanged.
  const double scaledBytes = bytes / 1024.0 / 1024.0 / 1024.0;
  const double scaledDuration = duration / 1000.0 / 1000.0 / 1000.0;
  return fmt::format("{}", scaledBytes / scaledDuration);
}

void XpuptiActivityProfilerSession::handleMemcpyActivity(
Expand All @@ -205,8 +204,8 @@ void XpuptiActivityProfilerSession::handleMemcpyActivity(
memcpyName(
activity->_memcpy_type, activity->_mem_src, activity->_mem_dst));
auto& memcpy_activity = traceBuffer_.activities.back();
memcpy_activity->startTime = activity->_start_timestamp;
memcpy_activity->endTime = activity->_end_timestamp;
memcpy_activity->startTime = convertTimeStampValue(activity->_start_timestamp);
memcpy_activity->endTime = convertTimeStampValue(activity->_end_timestamp);
memcpy_activity->id = activity->_correlation_id;
memcpy_activity->device = getDeviceIdxFromUUID(activity->_device_uuid);
memcpy_activity->resource = getMappedQueueId(activity->_sycl_queue_id);
Expand All @@ -227,7 +226,7 @@ void XpuptiActivityProfilerSession::handleMemcpyActivity(
memcpy_activity->addMetadata("correlation", activity->_correlation_id);
memcpy_activity->addMetadata("memory opration id", activity->_mem_op_id);
memcpy_activity->addMetadata("bytes", activity->_bytes);
memcpy_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity));
memcpy_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity->_bytes, memcpy_activity->duration()));

checkTimestampOrder(&*memcpy_activity);
if (outOfRange(*memcpy_activity)) {
Expand All @@ -253,8 +252,8 @@ void XpuptiActivityProfilerSession::handleMemsetActivity(
fmt::format(
"Memset ({})", ptiViewMemoryTypeToString(activity->_mem_type)));
auto& memset_activity = traceBuffer_.activities.back();
memset_activity->startTime = activity->_start_timestamp;
memset_activity->endTime = activity->_end_timestamp;
memset_activity->startTime = convertTimeStampValue(activity->_start_timestamp);
memset_activity->endTime = convertTimeStampValue(activity->_end_timestamp);
memset_activity->id = activity->_correlation_id;
memset_activity->device = getDeviceIdxFromUUID(activity->_device_uuid);
memset_activity->resource = getMappedQueueId(activity->_sycl_queue_id);
Expand All @@ -275,7 +274,7 @@ void XpuptiActivityProfilerSession::handleMemsetActivity(
memset_activity->addMetadata("correlation", activity->_correlation_id);
memset_activity->addMetadata("memory opration id", activity->_mem_op_id);
memset_activity->addMetadata("bytes", activity->_bytes);
memset_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity));
memset_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity->_bytes, memset_activity->duration()));

checkTimestampOrder(&*memset_activity);
if (outOfRange(*memset_activity)) {
Expand Down
36 changes: 36 additions & 0 deletions libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,29 @@ XpuptiActivityProfilerSession::XpuptiActivityProfilerSession(
const libkineto::Config& config,
const std::set<ActivityType>& activity_types)
: xpti_(xpti), config_(config.clone()), activity_types_(activity_types) {
#if HAS_XPUPTI
enumDeviceUUIDs();
xpti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize());

// Set the timestamp callback used by XPU PTI so that it aligns with torch.
#ifdef _WIN32
XPUPTI_CALL(
ptiViewSetTimestampCallback([]() -> uint64_t {
auto system = std::chrono::time_point_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now());
return system.time_since_epoch().count();
}));
#else
use_xpupti_tsc = config.getTSCTimestampFlag();
if (use_xpupti_tsc) {
XPUPTI_CALL(
ptiViewSetTimestampCallback([]() -> uint64_t {
return getApproximateTime();
}));
}
#endif
#endif

xpti_.enableXpuptiActivities(activity_types_);
}

Expand Down Expand Up @@ -145,6 +166,21 @@ DeviceIndex_t XpuptiActivityProfilerSession::getDeviceIdxFromUUID(
return static_cast<DeviceIndex_t>(std::distance(deviceUUIDs_.begin(), it));
}

// Converts a raw timestamp value before it is stored on an activity.
//
// On Windows builds (_WIN32 defined) the value is returned unchanged.
// On other platforms the value is passed through the callable returned by
// get_time_converter() (the torch-defined conversion) when use_xpupti_tsc
// is set; otherwise it is returned unchanged.
uint64_t XpuptiActivityProfilerSession::convertTimeStampValue(
    const uint64_t timeStampValue) {
#if !defined(_WIN32)
  // Conversion only applies on non-Windows platforms with the TSC flag on.
  if (use_xpupti_tsc) {
    return get_time_converter()(timeStampValue);
  }
#endif
  // Every remaining path hands the input back untouched.
  return timeStampValue;
}

// =========== ActivityProfiler Public Methods ============= //
const std::set<ActivityType> kXpuTypes{
ActivityType::GPU_MEMCPY,
Expand Down
6 changes: 6 additions & 0 deletions libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <mutex>
#include <unordered_map>

#include "../../ApproximateClock.h"
#include "XpuptiProfilerMacros.h"

namespace KINETO_NAMESPACE {
Expand Down Expand Up @@ -74,7 +75,12 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession
// for profiling activity creation
DeviceIndex_t getDeviceIdxFromUUID(const uint8_t deviceUUID[16]);

// Time conversion. Internally uses the overridden torch converter.
uint64_t convertTimeStampValue(const uint64_t timeStampValue);

private:
// align semantics with use_cupti_tsc
bool use_xpupti_tsc{true};
static uint32_t iterationCount_;
static std::vector<std::array<unsigned char, 16>> deviceUUIDs_;
static std::vector<std::string> correlateRuntimeOps_;
Expand Down