diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp index 3003f4d3a..14fee7912 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp @@ -105,8 +105,8 @@ void XpuptiActivityProfilerSession::handleRuntimeActivity( traceBuffer_.emplace_activity( traceBuffer_.span, ActivityType::XPU_RUNTIME, std::string(activity->_name)); auto& runtime_activity = traceBuffer_.activities.back(); - runtime_activity->startTime = activity->_start_timestamp; - runtime_activity->endTime = activity->_end_timestamp; + runtime_activity->startTime = convertTimeStampValue(activity->_start_timestamp); + runtime_activity->endTime = convertTimeStampValue(activity->_end_timestamp); runtime_activity->id = activity->_correlation_id; runtime_activity->device = activity->_process_id; runtime_activity->resource = activity->_thread_id; @@ -142,8 +142,8 @@ void XpuptiActivityProfilerSession::handleKernelActivity( ActivityType::CONCURRENT_KERNEL, std::string(activity->_name)); auto& kernel_activity = traceBuffer_.activities.back(); - kernel_activity->startTime = activity->_start_timestamp; - kernel_activity->endTime = activity->_end_timestamp; + kernel_activity->startTime = convertTimeStampValue(activity->_start_timestamp); + kernel_activity->endTime = convertTimeStampValue(activity->_end_timestamp); kernel_activity->id = activity->_correlation_id; kernel_activity->device = getDeviceIdxFromUUID(activity->_device_uuid); kernel_activity->resource = getMappedQueueId(activity->_sycl_queue_id); @@ -185,11 +185,10 @@ inline std::string memcpyName( ptiViewMemoryTypeToString(dst)); } -template -inline std::string bandwidth(pti_view_memory_record_type* activity) { - auto duration = activity->_end_timestamp - activity->_start_timestamp; - auto bytes = activity->_bytes; - return duration == 0 ? 
"\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +inline std::string bandwidth(uint64_t bytes, uint64_t duration) { + auto calBytesGB = bytes / 1024.0 / 1024.0 / 1024.0; + auto calDurationS = duration / 1000.0 / 1000.0 / 1000.0; + return duration == 0 ? "\"N/A\"" : fmt::format("{}", calBytesGB / calDurationS); } void XpuptiActivityProfilerSession::handleMemcpyActivity( @@ -205,8 +204,8 @@ void XpuptiActivityProfilerSession::handleMemcpyActivity( memcpyName( activity->_memcpy_type, activity->_mem_src, activity->_mem_dst)); auto& memcpy_activity = traceBuffer_.activities.back(); - memcpy_activity->startTime = activity->_start_timestamp; - memcpy_activity->endTime = activity->_end_timestamp; + memcpy_activity->startTime = convertTimeStampValue(activity->_start_timestamp); + memcpy_activity->endTime = convertTimeStampValue(activity->_end_timestamp); memcpy_activity->id = activity->_correlation_id; memcpy_activity->device = getDeviceIdxFromUUID(activity->_device_uuid); memcpy_activity->resource = getMappedQueueId(activity->_sycl_queue_id); @@ -227,7 +226,7 @@ void XpuptiActivityProfilerSession::handleMemcpyActivity( memcpy_activity->addMetadata("correlation", activity->_correlation_id); memcpy_activity->addMetadata("memory opration id", activity->_mem_op_id); memcpy_activity->addMetadata("bytes", activity->_bytes); - memcpy_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity)); + memcpy_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity->_bytes, memcpy_activity->duration())); checkTimestampOrder(&*memcpy_activity); if (outOfRange(*memcpy_activity)) { @@ -253,8 +252,8 @@ void XpuptiActivityProfilerSession::handleMemsetActivity( fmt::format( "Memset ({})", ptiViewMemoryTypeToString(activity->_mem_type))); auto& memset_activity = traceBuffer_.activities.back(); - memset_activity->startTime = activity->_start_timestamp; - memset_activity->endTime = activity->_end_timestamp; + memset_activity->startTime = 
convertTimeStampValue(activity->_start_timestamp); + memset_activity->endTime = convertTimeStampValue(activity->_end_timestamp); memset_activity->id = activity->_correlation_id; memset_activity->device = getDeviceIdxFromUUID(activity->_device_uuid); memset_activity->resource = getMappedQueueId(activity->_sycl_queue_id); @@ -275,7 +274,7 @@ void XpuptiActivityProfilerSession::handleMemsetActivity( memset_activity->addMetadata("correlation", activity->_correlation_id); memset_activity->addMetadata("memory opration id", activity->_mem_op_id); memset_activity->addMetadata("bytes", activity->_bytes); - memset_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity)); + memset_activity->addMetadata("memory bandwidth (GB/s)", bandwidth(activity->_bytes, memset_activity->duration())); checkTimestampOrder(&*memset_activity); if (outOfRange(*memset_activity)) { diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp index c8a0ea33a..992b46a0c 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp @@ -25,8 +25,29 @@ XpuptiActivityProfilerSession::XpuptiActivityProfilerSession( const libkineto::Config& config, const std::set& activity_types) : xpti_(xpti), config_(config.clone()), activity_types_(activity_types) { +#if HAS_XPUPTI enumDeviceUUIDs(); xpti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize()); + + // Register a timestamp callback with XPU PTI so its timestamps align with torch: system_clock::now() on Windows; elsewhere getApproximateTime(), only when the config's TSC timestamp flag is set +#ifdef _WIN32 + XPUPTI_CALL( + ptiViewSetTimestampCallback([]() -> uint64_t { + auto system = std::chrono::time_point_cast( + std::chrono::system_clock::now()); + return system.time_since_epoch().count(); + })); +#else + use_xpupti_tsc = config.getTSCTimestampFlag(); + if (use_xpupti_tsc) { + XPUPTI_CALL( + ptiViewSetTimestampCallback([]() -> uint64_t { + return getApproximateTime(); + })); + } +#endif +#endif +
xpti_.enableXpuptiActivities(activity_types_); } @@ -145,6 +166,21 @@ DeviceIndex_t XpuptiActivityProfilerSession::getDeviceIdxFromUUID( return static_cast(std::distance(deviceUUIDs_.begin(), it)); } +// Converts a raw XPUPTI timestamp for the trace. On Windows, or when use_xpupti_tsc is false, the value is returned unchanged. +// Otherwise it is mapped through the converter returned by get_time_converter(). +uint64_t XpuptiActivityProfilerSession::convertTimeStampValue( + const uint64_t timeStampValue) { +#if defined(_WIN32) + return timeStampValue; +#else + if (use_xpupti_tsc){ + return get_time_converter()(timeStampValue); + } else { + return timeStampValue; + } +#endif +} + // =========== ActivityProfiler Public Methods ============= // const std::set kXpuTypes{ ActivityType::GPU_MEMCPY, diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h index 86a75e7a5..900a9c90f 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h +++ b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h @@ -3,6 +3,7 @@ #include #include +#include "../../ApproximateClock.h" #include "XpuptiProfilerMacros.h" namespace KINETO_NAMESPACE { @@ -74,7 +75,12 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession // for profiling activity creation DeviceIndex_t getDeviceIdxFromUUID(const uint8_t deviceUUID[16]); + // Converts a raw XPUPTI timestamp; internally uses the overridden torch time converter + uint64_t convertTimeStampValue(const uint64_t timeStampValue); + private: + // Whether XPUPTI timestamps go through the time converter; aligns semantics with use_cupti_tsc (defaults to true) + bool use_xpupti_tsc{true}; static uint32_t iterationCount_; static std::vector> deviceUUIDs_; static std::vector correlateRuntimeOps_;