From ca1eedb9214bb12fa906180575894ffb78de8ac7 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Thu, 12 Sep 2024 18:38:05 -0700 Subject: [PATCH 01/16] Add CUPTI/RoCM versions to traces (#985) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/985 Because of the differences that are emerging between different versions, it would be useful in the metadata we could see which third-party library version we are using. We add them to our kineto traces in this diff. Reviewed By: aaronenyeshi Differential Revision: D62538511 fbshipit-source-id: 813af45c1d2e82002ca7b4b7f3788407f13c254c --- libkineto/src/CuptiActivityProfiler.cpp | 26 ++++++++++++++++++++----- libkineto/src/CuptiActivityProfiler.h | 24 +++++++++++++++-------- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 1509de00f..286d5b359 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -193,7 +193,7 @@ std::ostream& operator<<(std::ostream& oss, const CuptiActivityProfiler::ErrorCo void CuptiActivityProfiler::transferCpuTrace( std::unique_ptr cpuTrace) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); const string& trace_name = cpuTrace->span.name; if (currentRunloopState_ != RunloopState::CollectTrace && currentRunloopState_ != RunloopState::ProcessTrace) { @@ -248,6 +248,12 @@ void CuptiActivityProfiler::logGpuVersions() { "cuda_runtime_version", std::to_string(cudaRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "cuda_driver_version", std::to_string(cudaDriverVersion)); + addVersionMetadata( + "cupti_version", std::to_string(cuptiVersion)); + addVersionMetadata( + "cuda_runtime_version", std::to_string(cudaRuntimeVersion)); + addVersionMetadata( + "cuda_driver_version", std::to_string(cudaDriverVersion)); #elif defined(HAS_ROCTRACER) uint32_t majorVersion = roctracer_version_major(); @@ -267,6 +273,13 @@ void CuptiActivityProfiler::logGpuVersions() { "hip_runtime_version", std::to_string(hipRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "hip_driver_version", std::to_string(hipDriverVersion)); + addVersionMetadata( + "roctracer_version", roctracerVersion); + addVersionMetadata( + "hip_runtime_version", std::to_string(hipRuntimeVersion)); + addVersionMetadata( + "hip_driver_version", std::to_string(hipDriverVersion)); + #endif } @@ -274,6 +287,9 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { LOG(INFO) << "Processing " << traceBuffers_->cpu.size() << " CPU buffers"; VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - " << captureWindowEndTime_; + for (auto& pair : versionMetadata_) { + addMetadata(pair.first, pair.second); + } logger.handleTraceStart(metadata_); setCpuActivityPresent(false); setGpuActivityPresent(false); @@ -948,7 +964,7 @@ void CuptiActivityProfiler::configureChildProfilers() { void CuptiActivityProfiler::configure( const Config& config, const time_point& now) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); if (isActive()) { LOG(WARNING) << "CuptiActivityProfiler already busy, terminating"; return; @@ -1171,7 +1187,7 @@ const time_point CuptiActivityProfiler::performRunLoopStep( if (cupti_.stopCollection) { // Go to process trace to clear any outstanding buffers etc - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); stopTraceInternal(now); resetInternal(); LOG(ERROR) << "State: Warmup stopped by CUPTI. 
(Buffer size configured is " << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; @@ -1230,7 +1246,7 @@ const time_point CuptiActivityProfiler::performRunLoopStep( } #endif // HAS_CUPTI || HAS_ROCTRACER - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); stopTraceInternal(now); VLOG_IF(0, collection_done) << "Reached profile end time"; UST_LOGGER_MARK_COMPLETED(kCollectionStage); @@ -1254,7 +1270,7 @@ const time_point CuptiActivityProfiler::performRunLoopStep( } // FIXME: Probably want to allow interruption here // for quickly handling trace request via synchronous API - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); processTraceInternal(*logger_); UST_LOGGER_MARK_COMPLETED(kPostProcessingStage); resetInternal(); diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index ff8c70d6b..0669be2d9 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -155,23 +155,23 @@ class CuptiActivityProfiler { // Synchronous control API void startTrace( const std::chrono::time_point& now) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); startTraceInternal(now); } void stopTrace(const std::chrono::time_point& now) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); stopTraceInternal(now); } // Process CPU and GPU traces void processTrace(ActivityLogger& logger) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); processTraceInternal(logger); } void reset() { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); resetInternal(); } @@ -197,7 +197,7 @@ class CuptiActivityProfiler { // as key, because that's what CUPTI records. int32_t tid = threadId(); int32_t pid = processId(); - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); recordThreadInfo(sysTid, tid, pid); } @@ -215,13 +215,18 @@ class CuptiActivityProfiler { } void addMetadata(const std::string& key, const std::string& value) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); metadata_[key] = value; } + void addVersionMetadata(const std::string& key, const std::string& value) { + std::lock_guard guard(mutex_); + versionMetadata_[key] = value; + } + void addChildActivityProfiler( std::unique_ptr profiler) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); profilers_.push_back(std::move(profiler)); } @@ -472,7 +477,7 @@ class CuptiActivityProfiler { // *************************************************************************** // Mutex to protect non-atomic access to below state - std::mutex mutex_; + std::recursive_mutex mutex_; // Runloop phase std::atomic currentRunloopState_{RunloopState::WaitForRequest}; @@ -528,6 +533,9 @@ class CuptiActivityProfiler { // Trace metadata std::unordered_map metadata_; + // Version metadata + std::unordered_map versionMetadata_; + // child activity profilers std::vector> profilers_; From 79be4704ca3d04728ef51bd19300d9e0fc99003c Mon Sep 17 00:00:00 2001 From: Valentin Andrei Date: Thu, 12 Sep 2024 20:36:23 -0700 Subject: [PATCH 02/16] Modify CUDA test to attempt overlapping D2H transfers Summary: Distributed checkpointing for GenAI requires very expensive memory downloads from the GPU which can block the trainer thread if it happens that it issues a new D2H transfer. For example, we want that model parameters and optimizer state downloads to overlap with compute. 
However if for some reason the forward pass thread or the backward pass issue a D2H transfer, it will have to wait until the checkpoint download was completed. This code is a test program for Kineto that issues CUDA kernels, memory copies and UVM accesses in a configurable way. This change enables us to issue multiple GPU D2H downloads to host memory using multiple streams on multiple threads. Previously the D2H downloads were very short because we downloaded a single output value of 4 bytes. With the change we download an entire buffer. Reviewed By: xerothermic Differential Revision: D62601073 fbshipit-source-id: ed192723403787f37d45bf63d39e1a768df4a1d3 --- libkineto/stress_test/kineto_stress_test.cpp | 9 +++- .../stress_test/random_ops_stress_test.cu | 29 ++++++++--- libkineto/stress_test/tensor_cache.cu | 52 +++++-------------- libkineto/stress_test/tensor_cache.cuh | 1 + 4 files changed, 44 insertions(+), 47 deletions(-) diff --git a/libkineto/stress_test/kineto_stress_test.cpp b/libkineto/stress_test/kineto_stress_test.cpp index ecf02adf2..3b51ba698 100644 --- a/libkineto/stress_test/kineto_stress_test.cpp +++ b/libkineto/stress_test/kineto_stress_test.cpp @@ -182,7 +182,14 @@ void create_cuda_streams(stress_test_args& test_args) { if (test_args.use_memcpy_stream) { test_args.memcpy_streams = (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__); + if (i % 2 != 0) { + checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__); + } else { + int leastPriority = 0; + int greatestPriority = 0; + checkCudaStatus(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority), __LINE__); + checkCudaStatus(cudaStreamCreateWithPriority(test_args.memcpy_streams + i, cudaStreamNonBlocking, leastPriority), __LINE__); + } } } diff --git a/libkineto/stress_test/random_ops_stress_test.cu b/libkineto/stress_test/random_ops_stress_test.cu index c25c0e60e..5d6018a2c 100644 --- a/libkineto/stress_test/random_ops_stress_test.cu +++ b/libkineto/stress_test/random_ops_stress_test.cu @@ -14,6 +14,7 @@ namespace kineto_stress_test { +#define CUDA_API_PER_THREAD_DEFAULT_STREAM #define RNG_SEED 2049 // NCCL variables buffers @@ -123,15 +124,15 @@ void run_stress_test( } else { v_streams = (cudaStream_t*)malloc(test_args.num_cuda_streams * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_cuda_streams; ++i) { - checkCudaStatus(cudaStreamCreate(v_streams + i), __LINE__); + checkCudaStatus(cudaStreamCreateWithFlags(v_streams + i, cudaStreamNonBlocking), __LINE__); } if (test_args.use_memcpy_stream) { - checkCudaStatus(cudaStreamCreate(&memcpy_stream), __LINE__); + checkCudaStatus(cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking), __LINE__); } if (test_args.use_uvm_stream) { - checkCudaStatus(cudaStreamCreate(&uvm_stream), __LINE__); + checkCudaStatus(cudaStreamCreateWithFlags(&uvm_stream, cudaStreamNonBlocking), __LINE__); } } @@ -268,17 +269,29 @@ void run_stress_test( szTransfer, cudaMemcpyDeviceToDevice), __LINE__); } - // Simulate output download - if (p_memory_pool[pair_idx].b_copy_d2h) { + // Simulate checkpoint download. 
The odd workers will have higher stream priorities + // but lower number of transactions + bool enable_d2h_copy = p_memory_pool[pair_idx].b_copy_d2h; + if (thread_id % 2 != 0) { + if (rand_r(&rng_state) % 100 < 97) { + enable_d2h_copy = false; + } + } + + if (enable_d2h_copy) { + // checkCudaStatus(cudaStreamSynchronize(current_stream), __LINE__); uint32_t rand_index = rand_r(&rng_state) % p_memory_pool[pair_idx].n_elements; checkCudaStatus( cudaMemcpyAsync( - h_output + i, - p_memory_pool[pair_idx].d_C + rand_index, - sizeof(float), + p_memory_pool[pair_idx].h_C, + p_memory_pool[pair_idx].d_C, + p_memory_pool[pair_idx].n_elements * sizeof(float), cudaMemcpyDeviceToHost, current_memcpy_stream), __LINE__); + uint32_t rand_idx_out = rand_r(&rng_state) % test_args.num_operations; + // checkCudaStatus(cudaStreamSynchronize(current_memcpy_stream), __LINE__); + h_output[rand_idx_out] = p_memory_pool[pair_idx].h_C[rand_index]; } // Get memory during execution diff --git a/libkineto/stress_test/tensor_cache.cu b/libkineto/stress_test/tensor_cache.cu index 9f1c104ab..8de3a33c0 100644 --- a/libkineto/stress_test/tensor_cache.cu +++ b/libkineto/stress_test/tensor_cache.cu @@ -13,6 +13,7 @@ namespace kineto_stress_test { +#define CUDA_API_PER_THREAD_DEFAULT_STREAM #define RNG_SEED 1025 // A kernel that fills a device buffer with random values @@ -92,8 +93,11 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t // Simulate output download if (((float)(rand() % 32767) / 32767.0) < cache_args.prob_d2h) { p_memory_pool[i].b_copy_d2h = true; + checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + simple_lcg_host(p_memory_pool[i].h_C, num_elements); } else { p_memory_pool[i].b_copy_d2h = false; + p_memory_pool[i].h_C = NULL; } // Now we have a new tensor pair @@ -151,42 +155,6 @@ void re_initialize_buffer_values() { } void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream) { -// Older CUDA versions don't know about async malloc and free -#if defined(CUDA_VERSION) && CUDA_VERSION > 11000 && defined(ASYNC_MALLOC) - - checkCudaStatus( - cudaFreeAsync(tensor_pair->d_A, stream), - __LINE__); - checkCudaStatus( - cudaFreeAsync(tensor_pair->d_B, stream), - __LINE__); - checkCudaStatus( - cudaFreeAsync(tensor_pair->d_C, stream), - __LINE__); - - // Allocate device buffers - uint32_t num_elements = tensor_pair->n_elements; - checkCudaStatus( - cudaMallocAsync( - &tensor_pair->d_A, - num_elements * sizeof(float), - stream), - __LINE__); - checkCudaStatus( - cudaMallocAsync( - &tensor_pair->d_B, - num_elements * sizeof(float), - stream), - __LINE__); - checkCudaStatus( - cudaMallocAsync( - &tensor_pair->d_C, - num_elements * sizeof(float), - stream), - __LINE__); - -#else - checkCudaStatus(cudaFree(tensor_pair->d_A), __LINE__); checkCudaStatus(cudaFree(tensor_pair->d_B), __LINE__); checkCudaStatus(cudaFree(tensor_pair->d_C), __LINE__); @@ -203,8 +171,6 @@ void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream num_elements * sizeof(float)), __LINE__); -#endif // CUDA_VERSION >= 11000 - if (tensor_pair->b_copy_h2d) { checkCudaStatus(cudaFreeHost(tensor_pair->h_A), __LINE__); checkCudaStatus(cudaFreeHost(tensor_pair->h_B), __LINE__); @@ -215,6 +181,12 @@ void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream simple_lcg_host(tensor_pair->h_A, num_elements); simple_lcg_host(tensor_pair->h_B, num_elements); } + + if (tensor_pair->b_copy_d2h) { + 
checkCudaStatus(cudaFreeHost(tensor_pair->h_C), __LINE__); + checkCudaStatus(cudaHostAlloc(&tensor_pair->h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + simple_lcg_host(tensor_pair->h_C, num_elements); + } } void free_tensor_cache() { @@ -231,6 +203,10 @@ void free_tensor_cache() { if (p_memory_pool[i].h_B) { checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__); } + + if (p_memory_pool[i].h_C) { + checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_C), __LINE__); + } } } diff --git a/libkineto/stress_test/tensor_cache.cuh b/libkineto/stress_test/tensor_cache.cuh index f6c79d76e..bcd0082c3 100644 --- a/libkineto/stress_test/tensor_cache.cuh +++ b/libkineto/stress_test/tensor_cache.cuh @@ -42,6 +42,7 @@ struct tensor_pair { // Host buffers float* h_A; float* h_B; + float* h_C; }; // The memory pool object From 2f7ce6f5475bba4e630e4d43504b8c2d5bd390da Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Fri, 13 Sep 2024 12:17:33 -0700 Subject: [PATCH 03/16] Extend CPU User Annotations to End of Profile (#986) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/986 If a CPU User Annotation doesn't end by the time the profile ends, the annotation is marked as a 0-length event. This can be annoying to look at because it seems like profiler just never got the annotation event when it did. Lets set the end time to the end of profiling. Reviewed By: aaronenyeshi Differential Revision: D62604717 fbshipit-source-id: 34cb06b87c3c369601e1e6df859f61377b8198f6 --- libkineto/src/CuptiActivityProfiler.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 286d5b359..e02c325c1 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -397,6 +397,9 @@ void CuptiActivityProfiler::processCpuTrace( const std::unique_ptr>::value, "handleActivity is unsafe and relies on the caller to maintain not " "only lifetime but also address stability."); + if (act->type() == ActivityType::USER_ANNOTATION && act->duration()<=0){ + act->endTime = captureWindowEndTime_; + } logger.handleActivity(*act); } clientActivityTraceMap_[act->correlationId()] = &span_pair; From 3d355d17e15d0fe647e86146434e6edd68025f74 Mon Sep 17 00:00:00 2001 From: Valentin Andrei Date: Fri, 13 Sep 2024 16:50:04 -0700 Subject: [PATCH 04/16] Overlapping D2H transfer example Summary: This diff instantiates 6 CPU threads that schedule CUDA kernels and memory copies between CPU and GPU. For each thread there are 2 CUDA streams: one for CUDA kernels, one for memory copies. For every kernel call, we instantiate in advance a pair of inputs and outputs on the host and on the device. The CUDA kernel needs the following inputs: - A buffer input - B buffer input - C buffer output The memory copies are as follows: - A is in pinned memory and copied via a H2D - B is in pageable memory and copied via H2D - 50% of C output buffers are in pinned and 50% are in pageable The D2H to H2D ratio in the current config is somwhere around 2:1 3 out of 6 threads are issuing transactions on the lowest CUDA stream priority and the rest of them are using default priority. 
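For reference, a minimal sketch of the stream and buffer setup this configuration exercises (illustrative only, not part of the diff; the buffer size and single-pair layout are assumptions rather than the actual test parameters):

    // Create a low-priority, non-blocking memcpy stream and overlap a
    // checkpoint-style D2H download with work on a default-priority compute stream.
    #include <cuda_runtime.h>

    void sketch_overlapped_d2h() {
      int leastPriority = 0;
      int greatestPriority = 0;
      cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
      (void)greatestPriority;

      cudaStream_t memcpy_stream;
      cudaStreamCreateWithPriority(&memcpy_stream, cudaStreamNonBlocking, leastPriority);

      const size_t num_elements = 1 << 20;  // assumed size, not the test's value
      float* h_C = nullptr;
      float* d_C = nullptr;
      cudaHostAlloc(&h_C, num_elements * sizeof(float), cudaHostAllocDefault);  // pinned
      cudaMalloc(&d_C, num_elements * sizeof(float));

      // A pinned destination lets the copy overlap with compute; a pageable
      // (malloc'd) destination forces an extra host-side staging copy instead.
      cudaMemcpyAsync(h_C, d_C, num_elements * sizeof(float),
                      cudaMemcpyDeviceToHost, memcpy_stream);
      cudaStreamSynchronize(memcpy_stream);

      cudaFree(d_C);
      cudaFreeHost(h_C);
      cudaStreamDestroy(memcpy_stream);
    }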
Reviewed By: xerothermic Differential Revision: D62670279 fbshipit-source-id: e3bca9af984eeae4bce865c35663bfd53ef97ba0 --- libkineto/stress_test/kineto_stress_test.cpp | 1 + .../stress_test/random_ops_stress_test.cu | 5 ++++ libkineto/stress_test/stress_test_dense.json | 30 +++++++++---------- libkineto/stress_test/tensor_cache.cu | 25 +++++++++++++--- libkineto/stress_test/tensor_cache.cuh | 1 + 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/libkineto/stress_test/kineto_stress_test.cpp b/libkineto/stress_test/kineto_stress_test.cpp index 3b51ba698..6871b294f 100644 --- a/libkineto/stress_test/kineto_stress_test.cpp +++ b/libkineto/stress_test/kineto_stress_test.cpp @@ -182,6 +182,7 @@ void create_cuda_streams(stress_test_args& test_args) { if (test_args.use_memcpy_stream) { test_args.memcpy_streams = (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_workers; ++i) { + // We want to test the effect of CUDA stream priorities on the order of memory transfers. if (i % 2 != 0) { checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__); } else { diff --git a/libkineto/stress_test/random_ops_stress_test.cu b/libkineto/stress_test/random_ops_stress_test.cu index 5d6018a2c..f04f67333 100644 --- a/libkineto/stress_test/random_ops_stress_test.cu +++ b/libkineto/stress_test/random_ops_stress_test.cu @@ -278,6 +278,11 @@ void run_stress_test( } } + // Tehchnically we should wait for the kernels to complete before downloading + // using a stream synchronization on the compute stream. But if we want to generate + // multiple overlapping transfers, we need to remove the synchronization. That means + // the downloaded tensors may not have correct data. 
+ if (enable_d2h_copy) { // checkCudaStatus(cudaStreamSynchronize(current_stream), __LINE__); uint32_t rand_index = rand_r(&rng_state) % p_memory_pool[pair_idx].n_elements; diff --git a/libkineto/stress_test/stress_test_dense.json b/libkineto/stress_test/stress_test_dense.json index 44365b988..4b09174fd 100644 --- a/libkineto/stress_test/stress_test_dense.json +++ b/libkineto/stress_test/stress_test_dense.json @@ -1,18 +1,18 @@ { "test_args": { - "num_operations": 500000, - "num_cuda_streams": 4, + "num_operations": 50000, + "num_cuda_streams": 1, "prob_cuda_malloc": 0.00, - "min_iters_kernel": 100, - "max_iters_kernel": 101, + "min_iters_kernel": 20, + "max_iters_kernel": 25, "memset_prob": 0.00, - "min_idle_us": 0, - "max_idle_us": 1, + "min_idle_us": 5, + "max_idle_us": 10, "simulate_host_time": false, - "num_workers": 1, + "num_workers": 6, "use_uvm_buffers": false, "uvm_kernel_prob": 0.05, - "parallel_uvm_alloc": true, + "parallel_uvm_alloc": false, "uvm_len": 1342177280, "is_multi_rank": false, "num_ranks": 0, @@ -20,18 +20,18 @@ "num_iters_nccl_sync": 10000, "pre_alloc_streams": true, "use_memcpy_stream": true, - "use_uvm_stream": true, - "monitor_mem_usage": true, - "trace_length_us": 500000, + "use_uvm_stream": false, + "monitor_mem_usage": false, + "trace_length_us": 2000000, "cupti_buffer_mb": 3 }, "cache_args": { "sz_cache_KB": 16000000, "sz_GPU_memory_KB": 16777216, - "sz_min_tensor_KB": 2048, - "sz_max_tensor_KB": 4096, - "prob_h2d": 0.0, - "prob_d2h": 0.00, + "sz_min_tensor_KB": 16384, + "sz_max_tensor_KB": 32768, + "prob_h2d": 0.1, + "prob_d2h": 0.99, "num_increments": 1, "num_pairs_per_increment": 10 } diff --git a/libkineto/stress_test/tensor_cache.cu b/libkineto/stress_test/tensor_cache.cu index 8de3a33c0..7b1805f5d 100644 --- a/libkineto/stress_test/tensor_cache.cu +++ b/libkineto/stress_test/tensor_cache.cu @@ -80,7 +80,8 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t if (((float)(rand() % 32767) / 32767.0) < cache_args.prob_h2d) { p_memory_pool[i].b_copy_h2d = true; checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_A, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); - checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_B, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + // checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_B, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + p_memory_pool[i].h_B = (float*)malloc(sizeof(float) * num_elements); simple_lcg_host(p_memory_pool[i].h_A, num_elements); simple_lcg_host(p_memory_pool[i].h_B, num_elements); @@ -93,7 +94,14 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t // Simulate output download if (((float)(rand() % 32767) / 32767.0) < cache_args.prob_d2h) { p_memory_pool[i].b_copy_d2h = true; - checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + // Make 50% of the D2H on pageable and 50% on pinned memory + if (rand() % 2 == 1) { + checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + p_memory_pool[i].h_C_pinned = true; + } else { + p_memory_pool[i].h_C = (float*)malloc(sizeof(float) * num_elements); + p_memory_pool[i].h_C_pinned = false; + } simple_lcg_host(p_memory_pool[i].h_C, num_elements); } else { p_memory_pool[i].b_copy_d2h = false; @@ -198,20 +206,29 @@ void free_tensor_cache() { if (p_memory_pool[i].b_copy_h2d) { if (p_memory_pool[i].h_A) { checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_A), 
__LINE__); + p_memory_pool[i].h_A = NULL; } if (p_memory_pool[i].h_B) { - checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__); + //checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__); + free(p_memory_pool[i].h_B); + p_memory_pool[i].h_B = NULL; } if (p_memory_pool[i].h_C) { - checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_C), __LINE__); + if (p_memory_pool[i].h_C_pinned) { + checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_C), __LINE__); + } else { + free(p_memory_pool[i].h_C); + } + p_memory_pool[i].h_C = NULL; } } } if (p_memory_pool) { free(p_memory_pool); + p_memory_pool = NULL; } size_t mem_free = 0; diff --git a/libkineto/stress_test/tensor_cache.cuh b/libkineto/stress_test/tensor_cache.cuh index bcd0082c3..4317b0528 100644 --- a/libkineto/stress_test/tensor_cache.cuh +++ b/libkineto/stress_test/tensor_cache.cuh @@ -43,6 +43,7 @@ struct tensor_pair { float* h_A; float* h_B; float* h_C; + bool h_C_pinned; }; // The memory pool object From 45cc65aff5390b0bf79b0f700f503bea2463734d Mon Sep 17 00:00:00 2001 From: "Chen, Zejun" Date: Thu, 19 Sep 2024 19:18:22 -0700 Subject: [PATCH 05/16] add error handling for xpu profiler (#989) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/989 Reviewed By: sraikund16 Differential Revision: D63052607 Pulled By: briancoutinho fbshipit-source-id: f8e27e1c8a44f1bf5dd850835f82b5136008a48b --- libkineto/src/init.cpp | 11 ++++++++--- .../src/plugin/xpupti/XpuptiProfilerMacros.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index 55ef76b33..cc9d19040 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -203,10 +203,15 @@ void libkineto_init(bool cpuOnly, bool logOnError) { libkineto::api().registerProfilerFactory([]() -> std::unique_ptr { auto returnCode = ptiViewGPULocalAvailable(); if (returnCode != PTI_SUCCESS) { - std::string errCode = std::to_string(returnCode); - std::string errMsg( + std::string errPrefixMsg( "Fail to enable Kineto Profiler on XPU due to error code: "); - throw std::runtime_error(errMsg + errCode); + errPrefixMsg = errPrefixMsg + std::to_string(returnCode); +#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 9 + std::string errMsg(ptiResultTypeToString(returnCode)); + throw std::runtime_error(errPrefixMsg + std::string(". The detailed error message is: ") + errMsg); +#else + throw std::runtime_error(errPrefixMsg); +#endif } return std::make_unique(); }); diff --git a/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h b/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h index a1eb54881..ab4cac151 100644 --- a/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h +++ b/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h @@ -11,6 +11,20 @@ namespace KINETO_NAMESPACE { using namespace libkineto; +#if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 9 +#define XPUPTI_CALL(returnCode) \ + { \ + if (returnCode != PTI_SUCCESS) { \ + std::string funcMsg(__func__); \ + std::string codeMsg = std::to_string(returnCode); \ + std::string HeadMsg("Kineto Profiler on XPU got error from function "); \ + std::string Msg(". The error code is "); \ + std::string detailMsg(". 
The detailed error message is "); \ + detailMsg = detailMsg + std::string(ptiResultTypeToString(returnCode)); \ + throw std::runtime_error(HeadMsg + funcMsg + Msg + codeMsg + detailMsg); \ + } \ + } +#else #define XPUPTI_CALL(returnCode) \ { \ if (returnCode != PTI_SUCCESS) { \ @@ -21,6 +35,7 @@ using namespace libkineto; throw std::runtime_error(HeadMsg + funcMsg + Msg + codeMsg); \ } \ } +#endif class XpuptiActivityApi; using DeviceIndex_t = int8_t; From 0a8763e19aadf535eeb8891c05e364a5a9943e3a Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Mon, 23 Sep 2024 15:23:41 -0700 Subject: [PATCH 06/16] Align Roctracer to TSC Clock (#991) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/991 Right now we align Roctracer events to system clock blindly regardless of what we are using in torch.profiler. We should use a clock based on what is defined instead. This wont fix overlapping kernel events since we do a static offset when aligning but it will help make sure that kernel events always happen after kernel launches Reviewed By: aaronenyeshi, briancoutinho Differential Revision: D62984793 fbshipit-source-id: 4495a83de98dc3fb752754898588b93f4850e7a4 --- libkineto/src/CuptiActivityProfiler.cpp | 18 ++++++++++++++++++ libkineto/src/RoctracerActivityApi.cpp | 19 ++++++++----------- libkineto/src/RoctracerActivityApi.h | 2 ++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index e02c325c1..2735c602c 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -103,6 +103,22 @@ std::function& get_time_converter() { }; return _time_converter; } +#ifdef HAS_ROCTRACER +timestamp_t getTimeOffset() { + int64_t t0, t00; + timespec t1; + t0 = libkineto::getApproximateTime(); + clock_gettime(CLOCK_MONOTONIC, &t1); + t00 = libkineto::getApproximateTime(); + + // Confvert to ns (if necessary) + t0 = libkineto::get_time_converter()(t0); + t00 = libkineto::get_time_converter()(t00); + + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC domain (in ns). 
+ return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); +} +#endif #ifdef HAS_CUPTI bool& use_cupti_tsc() { @@ -340,6 +356,8 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { #ifdef HAS_ROCTRACER if (!cpuOnly_) { VLOG(0) << "Retrieving GPU activity buffers"; + timestamp_t offset = getTimeOffset(); + cupti_.setTimeOffset(offset); const int count = cupti_.processActivities( std::bind(&CuptiActivityProfiler::handleRoctracerActivity, this, std::placeholders::_1, &logger), std::bind(&CuptiActivityProfiler::handleCorrelationActivity, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3)); diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp index 51392a084..ec4928962 100644 --- a/libkineto/src/RoctracerActivityApi.cpp +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -8,11 +8,12 @@ #include "RoctracerActivityApi.h" +#include "ApproximateClock.h" #include #include #include #include - +#include "Logger.h" #include "Demangle.h" #include "output_base.h" #include "ThreadUtil.h" @@ -65,20 +66,16 @@ inline bool RoctracerActivityApi::isLogged(libkineto::ActivityType atype) { return activityMaskSnapshot_ & (1 << static_cast(atype)); } +void RoctracerActivityApi::setTimeOffset(timestamp_t toffset) { + toffset_ = toffset; +} + int RoctracerActivityApi::processActivities( std::function handler, std::function correlationHandler) { // Find offset to map from monotonic clock to system clock. // This will break time-ordering of events but is status quo. - timespec t0, t1, t00; - clock_gettime(CLOCK_REALTIME, &t0); - clock_gettime(CLOCK_MONOTONIC, &t1); - clock_gettime(CLOCK_REALTIME, &t00); - - const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1); - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC domain (in ns). - int count = 0; // Process all external correlations pairs @@ -125,8 +122,8 @@ int RoctracerActivityApi::processActivities( } if (!filtered) { // Convert the begin and end timestamps from monotonic clock to system clock. - item->begin = item->begin + toffset; - item->end = item->end + toffset; + item->begin = item->begin + toffset_; + item->end = item->end + toffset_; handler(item); ++count; } diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h index a02c2acf0..e66c9a1e7 100644 --- a/libkineto/src/RoctracerActivityApi.h +++ b/libkineto/src/RoctracerActivityApi.h @@ -52,6 +52,7 @@ class RoctracerActivityApi { const std::set& selected_activities); void clearActivities(); void teardownContext() {} + void setTimeOffset(timestamp_t toffset); virtual int processActivities( std::function handler, @@ -63,6 +64,7 @@ class RoctracerActivityApi { private: bool registered_{false}; + timestamp_t toffset_{0}; // Enabled Activity Filters uint32_t activityMask_{0}; From 0aacc09b085eea5c20b964edd8410f1189dcbdb7 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Wed, 25 Sep 2024 10:28:11 -0700 Subject: [PATCH 07/16] Add 1ns buffer to Roctracer Events (#992) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/992 As reported in https://github.com/ROCm/roctracer/issues/105, there is an issue where event starts and ends can "tie". This can cause a visual issue in the traces. Lets add a tiny buffer so the events are separate. At the single nanosecond level, the timings are inaccurate anyways so it doesn't really hurt to add this buffer in the meanwhile. 
Remove/wrap in ifdef once it is issue is resolved Reviewed By: aaronenyeshi Differential Revision: D63296093 fbshipit-source-id: 09e313e55bbee65f5e6a4974dc52b3e0df4d5922 --- libkineto/src/RoctracerActivity.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h index c4cbfe9bc..677e3507f 100644 --- a/libkineto/src/RoctracerActivity.h +++ b/libkineto/src/RoctracerActivity.h @@ -101,6 +101,15 @@ struct GpuActivity : public RoctracerActivity { void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; + // Add small buffer to fix visual error created by https://github.com/ROCm/roctracer/issues/105 + // Once this is resolved we can use ifdef to handle having this buffer or not based on version + int64_t timestamp() const override { + return activity_.begin + 1; + } + int64_t duration() const override { + return activity_.end - (activity_.begin + 1); + } + private: ActivityType type_; }; From b5c85daac1ee123aa7f04eb6f2bc71363f429e68 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Thu, 26 Sep 2024 11:02:13 -0700 Subject: [PATCH 08/16] Fix Bandwidth Calculations in JSON (#993) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/993 We are using TSC timestamps as durations instead of the conversion to seconds. We should never call the start/end bare, only use the duration() and timstamp() functions Reviewed By: sanrise, davidberard98 Differential Revision: D63422014 fbshipit-source-id: 98964b44cc30e9d7a88bf340277630b4a44e69db --- libkineto/src/CuptiActivity.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libkineto/src/CuptiActivity.cpp b/libkineto/src/CuptiActivity.cpp index 28fccd995..d25916321 100644 --- a/libkineto/src/CuptiActivity.cpp +++ b/libkineto/src/CuptiActivity.cpp @@ -13,6 +13,8 @@ #include "Demangle.h" #include "DeviceProperties.h" #include "output_base.h" +#include "Logger.h" + namespace KINETO_NAMESPACE { @@ -167,7 +169,7 @@ inline const std::string GpuActivity::metadataJson() const "bytes": {}, "memory bandwidth (GB/s)": {})JSON", memcpy.deviceId, memcpy.contextId, memcpy.streamId, memcpy.correlationId, - memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); + memcpy.bytes, bandwidth(memcpy.bytes, duration())); // clang-format on } @@ -194,7 +196,7 @@ inline const std::string GpuActivity::metadataJson() cons memcpy.srcDeviceId, memcpy.deviceId, memcpy.dstDeviceId, memcpy.srcContextId, memcpy.contextId, memcpy.dstContextId, memcpy.streamId, memcpy.correlationId, - memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); + memcpy.bytes, bandwidth(memcpy.bytes, duration())); // clang-format on } @@ -220,7 +222,7 @@ inline const std::string GpuActivity::metadataJson() const "bytes": {}, "memory bandwidth (GB/s)": {})JSON", memset.deviceId, memset.contextId, memset.streamId, memset.correlationId, - memset.bytes, bandwidth(memset.bytes, memset.end - memset.start)); + memset.bytes, bandwidth(memset.bytes, duration())); // clang-format on } From 86f1debd1463a9ff1e5c48c6dec411293d8b9c68 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Tue, 1 Oct 2024 21:03:54 -0700 Subject: [PATCH 09/16] Don't call `getenv` in side threads (#984) Summary: Calling `getenv` on side threads is dangerous as it can potentially segfault if the main thread is in the middle of setting environment variables: https://github.com/pytorch/pytorch/issues/134596 This PR only calls `getenv` only once during the first call of 
`isDaemonEnvVarSet()`, which is called from `init`. Pull Request resolved: https://github.com/pytorch/kineto/pull/984 Reviewed By: sraikund16 Differential Revision: D62152169 Pulled By: malfet fbshipit-source-id: 28dff07cb9775b004580749805b6437dba978eeb Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --- libkineto/include/Config.h | 4 ++++ libkineto/src/Config.cpp | 13 ++++++++++++- libkineto/src/init.cpp | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libkineto/include/Config.h b/libkineto/include/Config.h index eba4f3c90..b9db56fd8 100644 --- a/libkineto/include/Config.h +++ b/libkineto/include/Config.h @@ -502,4 +502,8 @@ class Config : public AbstractConfig { constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON"; +#if __linux__ +bool isDaemonEnvVarSet(); +#endif + } // namespace libkineto diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index 346f910ec..dc588c250 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -243,10 +243,21 @@ Config::Config() factories->addFeatureConfigs(*this); } #if __linux__ - enableIpcFabric_ = (getenv(kUseDaemonEnvVar) != nullptr); + enableIpcFabric_ = libkineto::isDaemonEnvVarSet(); #endif } +#if __linux__ +bool isDaemonEnvVarSet() { + static bool rc = [] { + void *ptr = getenv(kUseDaemonEnvVar); + return ptr != nullptr; + }(); + return rc; +} +#endif + + std::shared_ptr Config::getStaticObjectsLifetimeHandle() { return configFactories(); } diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index cc9d19040..5d9f9c3cd 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -131,7 +131,7 @@ void libkineto_init(bool cpuOnly, bool logOnError) { // Factory to connect to open source daemon if present #if __linux__ - if (getenv(kUseDaemonEnvVar) != nullptr) { + if (libkineto::isDaemonEnvVarSet()) { LOG(INFO) << "Registering daemon config loader, cpuOnly = " << cpuOnly; DaemonConfigLoader::registerFactory(); From 78737f1f909e5ee1592054e3a18d8c0e865a68ff Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Wed, 2 Oct 2024 12:06:47 -0700 Subject: [PATCH 10/16] Do Not Cache PID/TID in Logging (#994) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/994 S451588 was caused by the LOG macro caching values that would then be copied to other processes via forking. In general, we should probably use fork handlers to clear out said variables, but from a hygiene point of view we should also not be changing control flow based on logging. For this reason, the pid/tid retrieval in logging should get the cached variable if it exists, but never do the caching itself. 
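As a sketch of the fork-handler alternative mentioned above (not part of this diff; the names are hypothetical), a child-side handler would simply clear the cached IDs so the next lookup re-queries the OS:

    #include <pthread.h>
    #include <unistd.h>

    static pid_t cached_pid = 0;  // analogous to the cached _pid in ThreadUtil.cpp below

    // Runs in the child immediately after fork(); forces getpid() to be called
    // again instead of reusing the value cached by the parent.
    static void clear_cached_ids_in_child() {
      cached_pid = 0;
    }

    static void install_fork_handler() {
      pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr,
                     /*child=*/clear_cached_ids_in_child);
    }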
Reviewed By: aaronenyeshi Differential Revision: D63668265 fbshipit-source-id: 6817c743248056464213be28f562d07752ec3283 --- libkineto/include/ThreadUtil.h | 4 ++-- libkineto/src/Logger.cpp | 2 +- libkineto/src/ThreadUtil.cpp | 26 ++++++++++++++++++-------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index 12b37da55..29ffc0616 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -15,12 +15,12 @@ namespace libkineto { -int32_t systemThreadId(); +int32_t systemThreadId(bool cache=true); int32_t threadId(); bool setThreadName(const std::string& name); std::string getThreadName(); -int32_t processId(); +int32_t processId(bool cache=true); std::string processName(int32_t pid); // Return a list of pids and process names for the current process diff --git a/libkineto/src/Logger.cpp b/libkineto/src/Logger.cpp index d4886506f..7a4b771d9 100644 --- a/libkineto/src/Logger.cpp +++ b/libkineto/src/Logger.cpp @@ -39,7 +39,7 @@ Logger::Logger(int severity, int line, const char* filePath, int errnum) std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); const char* file = strrchr(filePath, '/'); buf_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(tt)) << " " - << processId() << ":" << systemThreadId() << " " + << processId(false) << ":" << systemThreadId(false) << " " << (file ? file + 1 : filePath) << ":" << line << "] "; } diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index 3fdc22b74..56da5d0c8 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -39,28 +39,38 @@ thread_local int32_t _tid = 0; thread_local int32_t _sysTid = 0; } -int32_t processId() { +int32_t processId(bool cache) { + int32_t pid = 0; if (!_pid) { #ifndef _WIN32 - _pid = (int32_t)getpid(); + pid = (int32_t)getpid(); #else - _pid = (int32_t)GetCurrentProcessId(); + pid = (int32_t)GetCurrentProcessId(); #endif + if (cache) { + _pid = pid; + } + return pid; } return _pid; } -int32_t systemThreadId() { +int32_t systemThreadId(bool cache) { + int32_t sysTid = 0; if (!_sysTid) { #ifdef __APPLE__ - _sysTid = (int32_t)syscall(SYS_thread_selfid); + sysTid = (int32_t)syscall(SYS_thread_selfid); #elif defined _WIN32 - _sysTid = (int32_t)GetCurrentThreadId(); + sysTid = (int32_t)GetCurrentThreadId(); #elif defined __FreeBSD__ - syscall(SYS_thr_self, &_sysTid); + syscall(SYS_thr_self, &sysTid); #else - _sysTid = (int32_t)syscall(SYS_gettid); + sysTid = (int32_t)syscall(SYS_gettid); #endif + if (cache) { + _sysTid = sysTid; + } + return sysTid; } return _sysTid; } From 40eb0bb735ed4e868d1d117faed19066a610ac6e Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Thu, 3 Oct 2024 13:15:16 -0700 Subject: [PATCH 11/16] Remove Stream from HIP Launches (#995) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/995 We would like for AMD traces to match CUDA matches as closely as possible. Right now we are adding stream to the CPU launches whereas in CUDA we do not. This will make zoomer try to process these events when they shouldn't. To make matters worse, the stream id that is embedded in the cpu event don't even match the GPU event. For this reason, lets get rid of them all together. 
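For illustration (not part of this diff; the values are made up), a HIP memcpy launch now carries only copy-specific metadata with no "stream" key, matching what the CUDA runtime launch events emit:

    #include <fmt/format.h>
    #include <string>

    // Mirrors the trimmed format string in RoctracerActivity_inl.h below.
    std::string example_memcpy_launch_metadata() {
      return fmt::format(R"JSON(
        "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "size": "{}", "kind": "{}")JSON",
        /*cid=*/193, /*correlation=*/4096, /*src=*/1, /*dst=*/0, /*size=*/16384, /*kind=*/2);
    }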
Reviewed By: aaronenyeshi Differential Revision: D63798661 fbshipit-source-id: 0802846565b275565329e9ddcdbd7d882880b0aa --- libkineto/src/RoctracerActivity_inl.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index dea799812..c76f6fb12 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -170,12 +170,10 @@ inline const std::string RuntimeActivity::metadataJson() con return fmt::format(R"JSON( {}"cid": {}, "correlation": {}, - "stream": "{}", "grid": [{}, {}, {}], "block": [{}, {}, {}], "shared memory": {})JSON", kernel, raw().cid, raw().id, - reinterpret_cast(raw().stream), raw().gridX, raw().gridY, raw().gridZ, raw().workgroupX, raw().workgroupY, raw().workgroupZ, raw().groupSegmentSize); @@ -183,15 +181,9 @@ inline const std::string RuntimeActivity::metadataJson() con template<> inline const std::string RuntimeActivity::metadataJson() const { - std::string stream = ""; - if ((raw().cid == HIP_API_ID_hipMemcpyAsync) || (raw().cid == HIP_API_ID_hipMemcpyWithStream)) { - stream = fmt::format(R"JSON( - "stream": "{}", )JSON", - reinterpret_cast(raw().stream)); - } return fmt::format(R"JSON( - {}"cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "size": "{}", "kind": "{}")JSON", - stream, raw().cid, raw().id, raw().src, raw().dst, raw().size, fmt::underlying(raw().kind)); + "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "size": "{}", "kind": "{}")JSON", + raw().cid, raw().id, raw().src, raw().dst, raw().size, fmt::underlying(raw().kind)); } template<> From d260fc03ba474766b950a4c27f4aba2de2b9e145 Mon Sep 17 00:00:00 2001 From: Ritvij Saxena Date: Mon, 7 Oct 2024 13:52:10 -0700 Subject: [PATCH 12/16] Applying CLANGFORMAT formatting Reviewed By: zertosh Differential Revision: D63984399 fbshipit-source-id: a97bc930cde48a234a20aede5a3f98290cca5c61 --- libkineto/include/AbstractConfig.h | 7 +- libkineto/include/ActivityProfilerInterface.h | 23 +- libkineto/include/ActivityType.h | 70 ++-- libkineto/include/Config.h | 9 +- libkineto/include/GenericTraceActivity.h | 11 +- libkineto/include/IActivityProfiler.h | 30 +- libkineto/include/ILoggerObserver.h | 10 +- libkineto/include/LoggingAPI.h | 4 +- libkineto/include/ThreadUtil.h | 4 +- libkineto/include/TraceSpan.h | 12 +- libkineto/include/libkineto.h | 26 +- libkineto/include/output_base.h | 12 +- libkineto/include/time_since_epoch.h | 9 +- .../sample_programs/kineto_cupti_profiler.cpp | 24 +- .../sample_programs/kineto_playground.cpp | 5 +- .../sample_programs/kineto_playground.cu | 20 +- .../sample_programs/kineto_playground.cuh | 2 +- libkineto/src/AbstractConfig.cpp | 4 +- libkineto/src/ActivityBuffers.h | 5 +- libkineto/src/ActivityLoggerFactory.h | 14 +- libkineto/src/ActivityProfilerController.cpp | 67 ++-- libkineto/src/ActivityProfilerController.h | 21 +- libkineto/src/ActivityProfilerProxy.cpp | 23 +- libkineto/src/ActivityProfilerProxy.h | 17 +- libkineto/src/ActivityTrace.h | 4 +- libkineto/src/ActivityType.cpp | 57 ++-- libkineto/src/ApproximateClock.h | 4 +- libkineto/src/Config.cpp | 63 ++-- libkineto/src/ConfigLoader.cpp | 45 +-- libkineto/src/ConfigLoader.h | 13 +- libkineto/src/CuptiActivity.cpp | 76 ++--- libkineto/src/CuptiActivity.h | 169 ++++++---- libkineto/src/CuptiActivityApi.cpp | 75 +++-- libkineto/src/CuptiActivityApi.h | 17 +- libkineto/src/CuptiActivityBuffer.h | 5 +- libkineto/src/CuptiActivityProfiler.cpp | 248 +++++++------- 
libkineto/src/CuptiActivityProfiler.h | 146 ++++----- libkineto/src/CuptiCallbackApi.cpp | 107 +++--- libkineto/src/CuptiCallbackApi.h | 44 ++- libkineto/src/CuptiEventApi.cpp | 3 +- libkineto/src/CuptiNvPerfMetric.cpp | 196 ++++++----- libkineto/src/CuptiNvPerfMetric.h | 23 +- libkineto/src/CuptiRangeProfiler.cpp | 88 +++-- libkineto/src/CuptiRangeProfiler.h | 7 +- libkineto/src/CuptiRangeProfilerApi.cpp | 12 +- libkineto/src/CuptiRangeProfilerApi.h | 26 +- libkineto/src/CuptiRangeProfilerConfig.cpp | 30 +- libkineto/src/CuptiRangeProfilerConfig.h | 16 +- libkineto/src/DaemonConfigLoader.cpp | 23 +- libkineto/src/DaemonConfigLoader.h | 4 +- libkineto/src/DeviceProperties.cpp | 64 ++-- libkineto/src/DeviceUtil.h | 46 +-- libkineto/src/EventProfiler.cpp | 11 +- libkineto/src/EventProfilerController.cpp | 49 ++- libkineto/src/GenericTraceActivity.cpp | 6 +- libkineto/src/ILoggerObserver.cpp | 25 +- libkineto/src/InvariantViolations.h | 10 +- libkineto/src/IpcFabricConfigClient.cpp | 67 ++-- libkineto/src/IpcFabricConfigClient.h | 5 +- libkineto/src/Logger.cpp | 19 +- libkineto/src/Logger.h | 30 +- libkineto/src/LoggerCollector.h | 7 +- libkineto/src/RoctracerActivity.h | 101 ++++-- libkineto/src/RoctracerActivityApi.cpp | 56 ++-- libkineto/src/RoctracerActivityApi.h | 25 +- libkineto/src/RoctracerActivity_inl.h | 122 ++++--- libkineto/src/RoctracerLogger.cpp | 280 ++++++++-------- libkineto/src/RoctracerLogger.h | 190 +++++++---- libkineto/src/SampleListener.h | 5 +- libkineto/src/ScopeExit.h | 5 +- libkineto/src/ThreadUtil.cpp | 47 +-- libkineto/src/WeakSymbols.cpp | 8 +- libkineto/src/cupti_strings.cpp | 20 +- libkineto/src/init.cpp | 53 +-- libkineto/src/output_csv.cpp | 5 +- libkineto/src/output_csv.h | 3 +- libkineto/src/output_json.cpp | 123 ++++--- libkineto/src/output_json.h | 34 +- libkineto/src/output_membuf.h | 13 +- .../src/plugin/xpupti/XpuptiActivityApi.cpp | 10 +- .../src/plugin/xpupti/XpuptiActivityApi.h | 9 +- .../plugin/xpupti/XpuptiActivityHandlers.cpp | 18 +- .../plugin/xpupti/XpuptiActivityProfiler.cpp | 54 ++-- .../plugin/xpupti/XpuptiActivityProfiler.h | 11 +- .../src/plugin/xpupti/XpuptiProfilerMacros.h | 22 +- libkineto/stress_test/kineto_stress_test.cpp | 306 ++++++++++++------ .../stress_test/random_ops_stress_test.cu | 262 +++++++++------ .../stress_test/random_ops_stress_test.cuh | 159 +++++---- libkineto/stress_test/tensor_cache.cu | 90 ++++-- libkineto/stress_test/tensor_cache.cuh | 47 +-- libkineto/stress_test/utils.h | 46 +-- libkineto/test/ConfigTest.cpp | 76 +++-- libkineto/test/CuptiActivityProfilerTest.cpp | 123 ++++--- libkineto/test/CuptiCallbackApiTest.cpp | 8 +- libkineto/test/CuptiProfilerApiTest.cu | 67 ++-- libkineto/test/CuptiRangeProfilerApiTest.cpp | 41 ++- .../test/CuptiRangeProfilerConfigTest.cpp | 25 +- libkineto/test/CuptiRangeProfilerTest.cpp | 66 ++-- libkineto/test/CuptiRangeProfilerTestUtil.h | 14 +- libkineto/test/CuptiStringsTest.cpp | 10 +- libkineto/test/EventProfilerTest.cpp | 17 +- libkineto/test/LoggerObserverTest.cpp | 29 +- libkineto/test/MockActivitySubProfiler.cpp | 25 +- libkineto/test/MockActivitySubProfiler.h | 64 ++-- .../test/RoctracerActivityProfilerTest.cpp | 236 ++++++++------ 105 files changed, 2882 insertions(+), 2312 deletions(-) diff --git a/libkineto/include/AbstractConfig.h b/libkineto/include/AbstractConfig.h index 85420699f..9a7d66def 100644 --- a/libkineto/include/AbstractConfig.h +++ b/libkineto/include/AbstractConfig.h @@ -80,8 +80,11 @@ class AbstractConfig { // multiple options. 
// Throw std::invalid_argument if automatic correction can not be made. // - // @param fallbackProfileStartTime Specify a fallback profile start timestamp in case it was never specified by the client - virtual void validate(const std::chrono::time_point& fallbackProfileStartTime) = 0; + // @param fallbackProfileStartTime Specify a fallback profile start timestamp + // in case it was never specified by the client + virtual void validate( + const std::chrono::time_point& + fallbackProfileStartTime) = 0; // TODO: Separate out each profiler type into features? virtual void printActivityProfilerConfig(std::ostream& s) const; diff --git a/libkineto/include/ActivityProfilerInterface.h b/libkineto/include/ActivityProfilerInterface.h index 18711c542..a9af8198b 100644 --- a/libkineto/include/ActivityProfilerInterface.h +++ b/libkineto/include/ActivityProfilerInterface.h @@ -13,8 +13,8 @@ #include #include -#include "ActivityType.h" #include "ActivityTraceInterface.h" +#include "ActivityType.h" #include "IActivityProfiler.h" namespace libkineto { @@ -24,7 +24,6 @@ struct CpuTraceBuffer; class Config; class ActivityProfilerInterface { - public: virtual ~ActivityProfilerInterface() {} @@ -32,7 +31,7 @@ class ActivityProfilerInterface { virtual bool isInitialized() { return false; } - virtual bool isActive(){ + virtual bool isActive() { return false; } @@ -56,8 +55,7 @@ class ActivityProfilerInterface { const std::string& configStr = "") {} // Toggle GPU tracing as a trace is running to omit certain parts of a graph - virtual void toggleCollectionDynamic( - const bool enable) {} + virtual void toggleCollectionDynamic(const bool enable) {} // Start recording, potentially reusing any buffers allocated since // prepareTrace was called. @@ -75,14 +73,13 @@ class ActivityProfilerInterface { // *** TraceActivity API *** // FIXME: Pass activityProfiler interface into clientInterface? - virtual void pushCorrelationId(uint64_t id){} - virtual void popCorrelationId(){} - virtual void transferCpuTrace( - std::unique_ptr traceBuffer){} + virtual void pushCorrelationId(uint64_t id) {} + virtual void popCorrelationId() {} + virtual void transferCpuTrace(std::unique_ptr traceBuffer) {} // Correlation ids for user defined spans - virtual void pushUserCorrelationId(uint64_t){} - virtual void popUserCorrelationId(){} + virtual void pushUserCorrelationId(uint64_t) {} + virtual void popUserCorrelationId() {} // Saves information for the current thread to be used in profiler output // Client must record any new kernel thread where the activity has occured. @@ -90,7 +87,9 @@ class ActivityProfilerInterface { // Record trace metadata, currently supporting only string key and values, // values with the same key are overwritten - virtual void addMetadata(const std::string& key, const std::string& value) = 0; + virtual void addMetadata( + const std::string& key, + const std::string& value) = 0; // Add a child activity profiler, this enables frameworks in the application // to enable custom framework events. diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 4fb094dd6..84887c0b5 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -9,46 +9,47 @@ #pragma once #include -#include #include +#include namespace libkineto { // Note : All activity types are not enabled by default. 
Please add them // at correct position in the enum enum class ActivityType { - // Activity types enabled by default - CPU_OP = 0, // cpu side ops - USER_ANNOTATION, - GPU_USER_ANNOTATION, - GPU_MEMCPY, - GPU_MEMSET, - CONCURRENT_KERNEL, // on-device kernels - EXTERNAL_CORRELATION, - CUDA_RUNTIME, // host side cuda runtime events - CUDA_DRIVER, // host side cuda driver events - CPU_INSTANT_EVENT, // host side point-like events - PYTHON_FUNCTION, - OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. - MTIA_RUNTIME, // host side MTIA runtime events - MTIA_CCP_EVENTS, // MTIA ondevice CCP events - CUDA_SYNC, // synchronization events between runtime and kernels - - // Optional Activity types - GLOW_RUNTIME, // host side glow runtime events - CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics - HPU_OP, // HPU host side runtime event - XPU_RUNTIME, // host side xpu runtime events - COLLECTIVE_COMM, // collective communication - MTIA_WORKLOADD, // MTIA workloadd events - - // PRIVATEUSE1 Activity types are used for custom backends. - // The corresponding device type is `DeviceType::PrivateUse1` in PyTorch. - PRIVATEUSE1_RUNTIME, // host side privateUse1 runtime events - PRIVATEUSE1_DRIVER, // host side privateUse1 driver events - - ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. - OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME, + // Activity types enabled by default + CPU_OP = 0, // cpu side ops + USER_ANNOTATION, + GPU_USER_ANNOTATION, + GPU_MEMCPY, + GPU_MEMSET, + CONCURRENT_KERNEL, // on-device kernels + EXTERNAL_CORRELATION, + CUDA_RUNTIME, // host side cuda runtime events + CUDA_DRIVER, // host side cuda driver events + CPU_INSTANT_EVENT, // host side point-like events + PYTHON_FUNCTION, + OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. + MTIA_RUNTIME, // host side MTIA runtime events + MTIA_CCP_EVENTS, // MTIA ondevice CCP events + CUDA_SYNC, // synchronization events between runtime and kernels + + // Optional Activity types + GLOW_RUNTIME, // host side glow runtime events + CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics + HPU_OP, // HPU host side runtime event + XPU_RUNTIME, // host side xpu runtime events + COLLECTIVE_COMM, // collective communication + MTIA_WORKLOADD, // MTIA workloadd events + + // PRIVATEUSE1 Activity types are used for custom backends. + // The corresponding device type is `DeviceType::PrivateUse1` in PyTorch. + PRIVATEUSE1_RUNTIME, // host side privateUse1 runtime events + PRIVATEUSE1_DRIVER, // host side privateUse1 driver events + + ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add + // your new type before it. 
+ OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME, }; const char* toString(ActivityType t); @@ -56,7 +57,8 @@ ActivityType toActivityType(const std::string& str); // Return an array of all activity types except COUNT constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT; -constexpr int defaultActivityTypeCount = (int)ActivityType::OPTIONAL_ACTIVITY_TYPE_START; +constexpr int defaultActivityTypeCount = + (int)ActivityType::OPTIONAL_ACTIVITY_TYPE_START; const std::array activityTypes(); const std::array defaultActivityTypes(); diff --git a/libkineto/include/Config.h b/libkineto/include/Config.h index b9db56fd8..aabe5fafb 100644 --- a/libkineto/include/Config.h +++ b/libkineto/include/Config.h @@ -45,7 +45,7 @@ class Config : public AbstractConfig { bool activityProfilerEnabled() const { return activityProfilerEnabled_ || - activitiesOnDemandTimestamp_.time_since_epoch().count() > 0; + activitiesOnDemandTimestamp_.time_since_epoch().count() > 0; } // Log activitiy trace to this file @@ -353,7 +353,7 @@ class Config : public AbstractConfig { void printActivityProfilerConfig(std::ostream& s) const override; void setActivityDependentConfig() override; - + void validate(const std::chrono::time_point& fallbackProfileStartTime) override; @@ -369,7 +369,7 @@ class Config : public AbstractConfig { // correct destruction order can be ensured. static std::shared_ptr getStaticObjectsLifetimeHandle(); - bool getTSCTimestampFlag() const{ + bool getTSCTimestampFlag() const { return useTSCTimestamp_; } @@ -447,7 +447,8 @@ class Config : public AbstractConfig { bool activitiesCudaSyncWaitEvents_; // Enable Profiler Config Options - // Temporarily disable shape collection until we re-roll out the feature for on-demand cases + // Temporarily disable shape collection until we re-roll out the feature for + // on-demand cases bool enableReportInputShapes_{false}; bool enableProfileMemory_{false}; bool enableWithStack_{false}; diff --git a/libkineto/include/GenericTraceActivity.h b/libkineto/include/GenericTraceActivity.h index 9b798ed15..1e7bc6ba4 100644 --- a/libkineto/include/GenericTraceActivity.h +++ b/libkineto/include/GenericTraceActivity.h @@ -9,11 +9,11 @@ #pragma once #include +#include #include #include #include #include -#include #include "ITraceActivity.h" #include "ThreadUtil.h" @@ -25,7 +25,8 @@ namespace libkineto { constexpr unsigned int kLinkFwdBwd = 1; constexpr unsigned int kLinkAsyncCpuGpu = 2; -// @lint-ignore-every CLANGTIDY cppcoreguidelines-non-private-member-variables-in-classes +// @lint-ignore-every CLANGTIDY +// cppcoreguidelines-non-private-member-variables-in-classes // @lint-ignore-every CLANGTIDY cppcoreguidelines-pro-type-member-init class GenericTraceActivity : public ITraceActivity { public: @@ -33,7 +34,9 @@ class GenericTraceActivity : public ITraceActivity { : activityType(ActivityType::ENUM_COUNT), traceSpan_(nullptr) {} GenericTraceActivity( - const TraceSpan& trace, ActivityType type, const std::string& name) + const TraceSpan& trace, + ActivityType type, + const std::string& name) : activityType(type), activityName(name), traceSpan_(&trace) {} int64_t deviceId() const override { @@ -132,7 +135,7 @@ class GenericTraceActivity : public ITraceActivity { ActivityType activityType; std::string activityName; struct Flow { - Flow(): id(0), type(0), start(0) {} + Flow() : id(0), type(0), start(0) {} // Ids must be unique within each type uint32_t id : 27; // Type will be used to connect flows between profilers, as diff --git a/libkineto/include/IActivityProfiler.h 
b/libkineto/include/IActivityProfiler.h index 2e2df024b..50860df86 100644 --- a/libkineto/include/IActivityProfiler.h +++ b/libkineto/include/IActivityProfiler.h @@ -51,10 +51,10 @@ struct DeviceInfo { const std::string& name, const std::string& label) : id(id), sortIndex(sortIndex), name(name), label(label) {} - int64_t id; // process id - int64_t sortIndex; // position in trace view - const std::string name; // process name - const std::string label; // device label + int64_t id; // process id + int64_t sortIndex; // position in trace view + const std::string name; // process name + const std::string label; // device label }; /* ResourceInfo: @@ -67,21 +67,20 @@ struct ResourceInfo { int64_t sortIndex, const std::string& name) : id(id), sortIndex(sortIndex), deviceId(deviceId), name(name) {} - int64_t id; // resource id - int64_t sortIndex; // position in trace view - int64_t deviceId; // id of device which owns this resource (specified in DeviceInfo.id) + int64_t id; // resource id + int64_t sortIndex; // position in trace view + int64_t deviceId; // id of device which owns this resource (specified in + // DeviceInfo.id) const std::string name; // resource name }; -using getLinkedActivityCallback = - std::function; +using getLinkedActivityCallback = std::function; /* IActivityProfilerSession: * an opaque object that can be used by a high level profiler to * start/stop and return trace events. */ class IActivityProfilerSession { - public: virtual ~IActivityProfilerSession() {} @@ -101,9 +100,11 @@ class IActivityProfilerSession { // processes trace activities using logger virtual void processTrace(ActivityLogger& logger) = 0; - virtual void processTrace(ActivityLogger& logger, - getLinkedActivityCallback /*getLinkedActivity*/, - int64_t /*startTime*/, int64_t /*endTime*/) { + virtual void processTrace( + ActivityLogger& logger, + getLinkedActivityCallback /*getLinkedActivity*/, + int64_t /*startTime*/, + int64_t /*endTime*/) { processTrace(logger); } @@ -129,7 +130,6 @@ class IActivityProfilerSession { TraceStatus status_ = TraceStatus::READY; }; - /* Activity Profiler Plugins: * These allow other frameworks to integrate into Kineto's primariy * activity profiler. While the primary activity profiler handles @@ -137,9 +137,7 @@ class IActivityProfilerSession { * can become source of new trace activity types. 
*/ class IActivityProfiler { - public: - virtual ~IActivityProfiler() {} // name of profiler diff --git a/libkineto/include/ILoggerObserver.h b/libkineto/include/ILoggerObserver.h index fd9af555e..d20fb1a89 100644 --- a/libkineto/include/ILoggerObserver.h +++ b/libkineto/include/ILoggerObserver.h @@ -36,13 +36,14 @@ enum LoggerOutputType { const char* toString(LoggerOutputType t); LoggerOutputType toLoggerOutputType(const std::string& str); -constexpr int LoggerTypeCount = (int) LoggerOutputType::ENUM_COUNT; +constexpr int LoggerTypeCount = (int)LoggerOutputType::ENUM_COUNT; class ILoggerObserver { public: virtual ~ILoggerObserver() = default; virtual void write(const std::string& message, LoggerOutputType ot) = 0; - virtual const std::map> extractCollectorMetadata() = 0; + virtual const std::map> + extractCollectorMetadata() = 0; virtual void reset() = 0; virtual void addDevice(const int64_t device) = 0; virtual void setTraceDurationMS(const int64_t duration) = 0; @@ -51,8 +52,9 @@ class ILoggerObserver { virtual void setGroupTraceID(const std::string&) {} virtual void addDestination(const std::string& dest) = 0; virtual void setTriggerOnDemand() {} - virtual void addMetadata(const std::string& key, const std::string& value) = 0; - + virtual void addMetadata( + const std::string& key, + const std::string& value) = 0; }; } // namespace libkineto diff --git a/libkineto/include/LoggingAPI.h b/libkineto/include/LoggingAPI.h index 6d6701689..cc3ac2b27 100644 --- a/libkineto/include/LoggingAPI.h +++ b/libkineto/include/LoggingAPI.h @@ -9,6 +9,6 @@ #pragma once namespace libkineto { - int getLogSeverityLevel(); - void setLogSeverityLevel(int level); +int getLogSeverityLevel(); +void setLogSeverityLevel(int level); } // namespace libkineto diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index 29ffc0616..4178ae4a1 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -15,12 +15,12 @@ namespace libkineto { -int32_t systemThreadId(bool cache=true); +int32_t systemThreadId(bool cache = true); int32_t threadId(); bool setThreadName(const std::string& name); std::string getThreadName(); -int32_t processId(bool cache=true); +int32_t processId(bool cache = true); std::string processName(int32_t pid); // Return a list of pids and process names for the current process diff --git a/libkineto/include/TraceSpan.h b/libkineto/include/TraceSpan.h index 7e5546f7f..cc62a2aec 100644 --- a/libkineto/include/TraceSpan.h +++ b/libkineto/include/TraceSpan.h @@ -16,17 +16,13 @@ namespace libkineto { struct TraceSpan { TraceSpan() = delete; - TraceSpan( - int64_t startTime, int64_t endTime, std::string name) - : startTime(startTime), endTime(endTime), name(std::move(name)) { - } - TraceSpan( - int opCount, int it, std::string name, std::string prefix) + TraceSpan(int64_t startTime, int64_t endTime, std::string name) + : startTime(startTime), endTime(endTime), name(std::move(name)) {} + TraceSpan(int opCount, int it, std::string name, std::string prefix) : opCount(opCount), iteration(it), name(std::move(name)), - prefix(std::move(prefix)) { - } + prefix(std::move(prefix)) {} // FIXME: change to duration? 
int64_t startTime{0}; diff --git a/libkineto/include/libkineto.h b/libkineto/include/libkineto.h index 425a06148..6fc571b34 100644 --- a/libkineto/include/libkineto.h +++ b/libkineto/include/libkineto.h @@ -12,32 +12,32 @@ #include #include +#include #include #include #include -#include #include +#include #include #include -#include #include "ActivityProfilerInterface.h" +#include "ActivityTraceInterface.h" #include "ActivityType.h" #include "ClientInterface.h" #include "GenericTraceActivity.h" -#include "TraceSpan.h" #include "IActivityProfiler.h" -#include "ActivityTraceInterface.h" #include "ILoggerObserver.h" #include "LoggingAPI.h" +#include "TraceSpan.h" #include "ThreadUtil.h" extern "C" { - void suppressLibkinetoLogMessages(); - int InitializeInjection(void); - void libkineto_init(bool cpuOnly, bool logOnError); - bool hasTestEnvVar(); +void suppressLibkinetoLogMessages(); +int InitializeInjection(void); +void libkineto_init(bool cpuOnly, bool logOnError); +bool hasTestEnvVar(); } namespace libkineto { @@ -68,14 +68,12 @@ struct CpuTraceBuffer { }; using ChildActivityProfilerFactory = - std::function()>; + std::function()>; class LibkinetoApi { public: - explicit LibkinetoApi(ConfigLoader& configLoader) - : configLoader_(configLoader) { - } + : configLoader_(configLoader) {} // Called by client that supports tracing API. // libkineto can still function without this. @@ -124,8 +122,7 @@ class LibkinetoApi { return configLoader_; } - void registerProfilerFactory( - ChildActivityProfilerFactory factory) { + void registerProfilerFactory(ChildActivityProfilerFactory factory) { if (isProfilerInitialized()) { activityProfiler_->addChildActivityProfiler(factory()); } else { @@ -134,7 +131,6 @@ class LibkinetoApi { } private: - void initChildActivityProfilers() { if (!isProfilerInitialized()) { return; diff --git a/libkineto/include/output_base.h b/libkineto/include/output_base.h index 9997f8d28..c5ef3d1e6 100644 --- a/libkineto/include/output_base.h +++ b/libkineto/include/output_base.h @@ -16,13 +16,13 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "IActivityProfiler.h" #include "GenericTraceActivity.h" +#include "IActivityProfiler.h" #include "ThreadUtil.h" #include "TraceSpan.h" namespace KINETO_NAMESPACE { - struct ActivityBuffers; +struct ActivityBuffers; } namespace libkineto { @@ -36,7 +36,6 @@ constexpr int64_t kExceedMaxPid = 5000000; class ActivityLogger { public: - virtual ~ActivityLogger() = default; struct OverheadInfo { @@ -44,9 +43,7 @@ class ActivityLogger { const std::string name; }; - virtual void handleDeviceInfo( - const DeviceInfo &info, - uint64_t time) = 0; + virtual void handleDeviceInfo(const DeviceInfo& info, uint64_t time) = 0; virtual void handleResourceInfo(const ResourceInfo& info, int64_t time) = 0; @@ -54,8 +51,7 @@ class ActivityLogger { virtual void handleTraceSpan(const TraceSpan& span) = 0; - virtual void handleActivity( - const libkineto::ITraceActivity& activity) = 0; + virtual void handleActivity(const libkineto::ITraceActivity& activity) = 0; virtual void handleGenericActivity( const libkineto::GenericTraceActivity& activity) = 0; diff --git a/libkineto/include/time_since_epoch.h b/libkineto/include/time_since_epoch.h index 8204ba4af..17faccec6 100644 --- a/libkineto/include/time_since_epoch.h +++ b/libkineto/include/time_since_epoch.h @@ -12,11 +12,10 @@ namespace libkineto { template -inline int64_t timeSinceEpoch( - const std::chrono::time_point& t) { - return std::chrono::duration_cast( - 
t.time_since_epoch()) - .count(); +inline int64_t timeSinceEpoch(const std::chrono::time_point& t) { + return std::chrono::duration_cast( + t.time_since_epoch()) + .count(); } } // namespace libkineto diff --git a/libkineto/sample_programs/kineto_cupti_profiler.cpp b/libkineto/sample_programs/kineto_cupti_profiler.cpp index 6c1d65252..b097847d6 100644 --- a/libkineto/sample_programs/kineto_cupti_profiler.cpp +++ b/libkineto/sample_programs/kineto_cupti_profiler.cpp @@ -8,10 +8,10 @@ #include #include -#include #include -#include #include +#include +#include #include @@ -27,21 +27,22 @@ int main() { // Kineto config std::set types_cupti_prof = { - libkineto::ActivityType::CUDA_PROFILER_RANGE, + libkineto::ActivityType::CUDA_PROFILER_RANGE, }; - //libkineto_init(false, true); + // libkineto_init(false, true); libkineto::api().initProfilerIfRegistered(); // Use a special kineto__cuda_core_flop metric that counts individual - // CUDA core floating point instructions by operation type (fma,fadd,fmul,dadd ...) - // You can also use kineto__tensor_core_insts or any metric - // or any metric defined by CUPTI Profiler below + // CUDA core floating point instructions by operation type (fma,fadd,fmul,dadd + // ...) You can also use kineto__tensor_core_insts or any metric or any metric + // defined by CUPTI Profiler below // https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler - std::string profiler_config = "ACTIVITIES_WARMUP_PERIOD_SECS=0\n " - "CUPTI_PROFILER_METRICS=kineto__cuda_core_flops\n " - "CUPTI_PROFILER_ENABLE_PER_KERNEL=true"; + std::string profiler_config = + "ACTIVITIES_WARMUP_PERIOD_SECS=0\n " + "CUPTI_PROFILER_METRICS=kineto__cuda_core_flops\n " + "CUPTI_PROFILER_ENABLE_PER_KERNEL=true"; auto& profiler = libkineto::api().activityProfiler(); profiler.prepareTrace(types_cupti_prof, profiler_config); @@ -55,7 +56,8 @@ int main() { basicMemcpyFromDevice(); auto trace = profiler.stopTrace(); - std::cout << "Stopped and processed trace. Got " << trace->activities()->size() << " activities."; + std::cout << "Stopped and processed trace. Got " + << trace->activities()->size() << " activities."; trace->save(kFileName); return 0; } diff --git a/libkineto/sample_programs/kineto_playground.cpp b/libkineto/sample_programs/kineto_playground.cpp index b8feb24e4..2c8349626 100644 --- a/libkineto/sample_programs/kineto_playground.cpp +++ b/libkineto/sample_programs/kineto_playground.cpp @@ -8,8 +8,8 @@ #include #include -#include #include +#include #include @@ -43,7 +43,8 @@ int main() { std::cout << "Stop Trace" << std::endl; auto trace = profiler.stopTrace(); - std::cout << "Stopped and processed trace. Got " << trace->activities()->size() << " activities."; + std::cout << "Stopped and processed trace. Got " + << trace->activities()->size() << " activities."; trace->save(kFileName); return 0; } diff --git a/libkineto/sample_programs/kineto_playground.cu b/libkineto/sample_programs/kineto_playground.cu index e6a693eaa..973a62117 100644 --- a/libkineto/sample_programs/kineto_playground.cu +++ b/libkineto/sample_programs/kineto_playground.cu @@ -10,18 +10,19 @@ #include "kineto_playground.cuh" - namespace kineto { void warmup(void) { - // Inititalizing CUDA can take a while which we normally do not want to see in Kineto traces. - // This is done in various ways that take Kineto as dependency. 
This is our way of doing warmup - // for kineto_playground - size_t bytes = 1000; - float* mem = NULL; - auto error = cudaMalloc(&mem, bytes); + // Inititalizing CUDA can take a while which we normally do not want to see in + // Kineto traces. This is done in various ways that take Kineto as dependency. + // This is our way of doing warmup for kineto_playground + size_t bytes = 1000; + float* mem = NULL; + auto error = cudaMalloc(&mem, bytes); if (error != cudaSuccess) { - printf("cudaMalloc failed during kineto_playground warmup. error code: %d", error); + printf( + "cudaMalloc failed during kineto_playground warmup. error code: %d", + error); return; } @@ -52,7 +53,6 @@ void basicMemcpyToDevice(void) { } void basicMemcpyFromDevice(void) { - size_t size = num * sizeof(float); cudaError_t err; @@ -85,7 +85,7 @@ void compute(void) { int threadsPerBlock = 256; int blocksPerGrid = (num + threadsPerBlock - 1) / threadsPerBlock; for (int i = 0; i < 10; i++) { - square<<>> (dA, num); + square<<>>(dA, num); } } diff --git a/libkineto/sample_programs/kineto_playground.cuh b/libkineto/sample_programs/kineto_playground.cuh index 7100417c2..4e72402b5 100644 --- a/libkineto/sample_programs/kineto_playground.cuh +++ b/libkineto/sample_programs/kineto_playground.cuh @@ -27,4 +27,4 @@ void playground(void); // Run a simple elementwise kernel void compute(void); -} +} // namespace kineto diff --git a/libkineto/src/AbstractConfig.cpp b/libkineto/src/AbstractConfig.cpp index 4b69bcd7d..932066f62 100644 --- a/libkineto/src/AbstractConfig.cpp +++ b/libkineto/src/AbstractConfig.cpp @@ -8,8 +8,8 @@ #include "AbstractConfig.h" -#include #include +#include #include #include "Logger.h" @@ -158,7 +158,7 @@ bool AbstractConfig::parse(const string& conf) { } } catch (const std::exception& e) { LOG(ERROR) << "Failed to parse config: " << e.what() - << " ; line: "<< line; + << " ; line: " << line; return false; } if (!handled) { diff --git a/libkineto/src/ActivityBuffers.h b/libkineto/src/ActivityBuffers.h index 109eeefa4..a962ccbe3 100644 --- a/libkineto/src/ActivityBuffers.h +++ b/libkineto/src/ActivityBuffers.h @@ -8,12 +8,11 @@ #pragma once - #include #include -#include "libkineto.h" #include "CuptiActivityBuffer.h" +#include "libkineto.h" namespace KINETO_NAMESPACE { @@ -22,7 +21,7 @@ struct ActivityBuffers { std::unique_ptr gpu; // Add a wrapper object to the underlying struct stored in the buffer - template + template const ITraceActivity& addActivityWrapper(const T& act) { wrappers_.push_back(std::make_unique(act)); return *wrappers_.back().get(); diff --git a/libkineto/src/ActivityLoggerFactory.h b/libkineto/src/ActivityLoggerFactory.h index 7ff7b921a..597445ea8 100644 --- a/libkineto/src/ActivityLoggerFactory.h +++ b/libkineto/src/ActivityLoggerFactory.h @@ -8,9 +8,9 @@ #pragma once +#include #include #include -#include #include #include #include @@ -20,10 +20,9 @@ namespace KINETO_NAMESPACE { class ActivityLogger; class ActivityLoggerFactory { - public: using FactoryFunc = - std::function(const std::string& url)>; + std::function(const std::string& url)>; // Add logger factory for a protocol prefix void addProtocol(const std::string& protocol, FactoryFunc f) { @@ -38,16 +37,15 @@ class ActivityLoggerFactory { return it->second(stripProtocol(url)); } throw std::invalid_argument(fmt::format( - "No logger registered for the {} protocol prefix", - protocol)); + "No logger registered for the {} protocol prefix", protocol)); return nullptr; } private: static std::string tolower(std::string s) { - 
std::transform(s.begin(), s.end(), s.begin(), - [](unsigned char c) { return std::tolower(c); } - ); + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { + return std::tolower(c); + }); return s; } diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index a2840c9ef..99c629cfb 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -43,7 +43,8 @@ void ActivityProfilerController::setLoggerCollectorFactory( #endif // !USE_GOOGLE_LOG ActivityProfilerController::ActivityProfilerController( - ConfigLoader& configLoader, bool cpuOnly) + ConfigLoader& configLoader, + bool cpuOnly) : configLoader_(configLoader) { // Initialize ChromeTraceBaseTime first of all. ChromeTraceBaseTime::singleton().init(); @@ -70,8 +71,7 @@ ActivityProfilerController::ActivityProfilerController( } ActivityProfilerController::~ActivityProfilerController() { - configLoader_.removeHandler( - ConfigLoader::ConfigKind::ActivityProfiler, this); + configLoader_.removeHandler(ConfigLoader::ConfigKind::ActivityProfiler, this); if (profilerThread_) { // signaling termination of the profiler loop stopRunloop_ = true; @@ -90,7 +90,7 @@ ActivityProfilerController::~ActivityProfilerController() { static ActivityLoggerFactory initLoggerFactory() { ActivityLoggerFactory factory; factory.addProtocol("file", [](const std::string& url) { - return std::unique_ptr(new ChromeTraceLogger(url)); + return std::unique_ptr(new ChromeTraceLogger(url)); }); return factory; } @@ -101,7 +101,8 @@ static ActivityLoggerFactory& loggerFactory() { } void ActivityProfilerController::addLoggerFactory( - const std::string& protocol, ActivityLoggerFactory::FactoryFunc factory) { + const std::string& protocol, + ActivityLoggerFactory::FactoryFunc factory) { loggerFactory().addProtocol(protocol, factory); } @@ -112,13 +113,15 @@ static std::unique_ptr makeLogger(const Config& config) { return loggerFactory().makeLogger(config.activitiesLogUrl()); } -static std::unique_ptr& invariantViolationsLoggerFactory() { +static std::unique_ptr& +invariantViolationsLoggerFactory() { static std::unique_ptr factory = nullptr; return factory; } void ActivityProfilerController::setInvariantViolationsLoggerFactory( - const std::function()>& factory) { + const std::function()>& + factory) { invariantViolationsLoggerFactory() = factory(); } @@ -140,13 +143,16 @@ bool ActivityProfilerController::shouldActivateTimestampConfig( } // Note on now + Config::kControllerIntervalMsecs: // Profiler interval does not align perfectly up to startTime - warmup. - // Waiting until the next tick won't allow sufficient time for the profiler to warm up. - // So check if we are very close to the warmup time and trigger warmup. - if (now + Config::kControllerIntervalMsecs - >= (asyncRequestConfig_->requestTimestamp() - asyncRequestConfig_->activitiesWarmupDuration())) { - LOG(INFO) << "Received on-demand activity trace request by " - << " profile timestamp = " - << asyncRequestConfig_->requestTimestamp().time_since_epoch().count(); + // Waiting until the next tick won't allow sufficient time for the profiler to + // warm up. So check if we are very close to the warmup time and trigger + // warmup. 
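
// Illustrative sketch (not part of this diff; the numbers are hypothetical):
// the check below starts warmup one controller tick early so the profiler is
// already warm at the requested start time. For a request at t=10s, a 2s
// warmup and a 250ms controller tick, warmup begins once
// now + 250ms >= 10s - 2s, i.e. at roughly t=7.75s.
#include <chrono>
inline bool shouldStartWarmup(
    std::chrono::system_clock::time_point now,
    std::chrono::system_clock::time_point requestTime,
    std::chrono::milliseconds warmupDuration,
    std::chrono::milliseconds controllerInterval) {
  // Mirrors the condition used by shouldActivateTimestampConfig().
  return now + controllerInterval >= requestTime - warmupDuration;
}
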
+ if (now + Config::kControllerIntervalMsecs >= + (asyncRequestConfig_->requestTimestamp() - + asyncRequestConfig_->activitiesWarmupDuration())) { + LOG(INFO) + << "Received on-demand activity trace request by " + << " profile timestamp = " + << asyncRequestConfig_->requestTimestamp().time_since_epoch().count(); return true; } return false; @@ -164,24 +170,25 @@ bool ActivityProfilerController::shouldActivateIterationConfig( } LOG(INFO) << "Received on-demand activity trace request by " - " profile start iteration = " + " profile start iteration = " << asyncRequestConfig_->profileStartIteration() << ", current iteration = " << currentIter; // Re-calculate the start iter if requested iteration is in the past. if (currentIter > rootIter) { - auto newProfileStart = currentIter + - asyncRequestConfig_->activitiesWarmupIterations(); + auto newProfileStart = + currentIter + asyncRequestConfig_->activitiesWarmupIterations(); // Use Start Iteration Round Up if it is present. if (asyncRequestConfig_->profileStartIterationRoundUp() > 0) { // round up to nearest multiple auto divisor = asyncRequestConfig_->profileStartIterationRoundUp(); auto rem = newProfileStart % divisor; newProfileStart += ((rem == 0) ? 0 : divisor - rem); - LOG(INFO) << "Rounding up profiler start iteration to : " << newProfileStart; + LOG(INFO) << "Rounding up profiler start iteration to : " + << newProfileStart; asyncRequestConfig_->setProfileStartIteration(newProfileStart); if (currentIter != asyncRequestConfig_->startIterationIncludingWarmup()) { - // Ex. Current 9, start 8, warmup 5, roundup 100. Resolves new start to 100, - // with warmup starting at 95. So don't start now. + // Ex. Current 9, start 8, warmup 5, roundup 100. Resolves new start to + // 100, with warmup starting at 95. So don't start now. return false; } } else { @@ -224,8 +231,8 @@ void ActivityProfilerController::profilerLoop() { if (profiler_->isActive()) { next_wakeup_time = profiler_->performRunLoopStep(now, next_wakeup_time); VLOG(1) << "Profiler loop: " - << duration_cast(system_clock::now() - now).count() - << "ms"; + << duration_cast(system_clock::now() - now).count() + << "ms"; } } @@ -324,19 +331,21 @@ void ActivityProfilerController::startTrace() { profiler_->startTrace(std::chrono::system_clock::now()); } -std::unique_ptr ActivityProfilerController::stopTrace() { +std::unique_ptr +ActivityProfilerController::stopTrace() { profiler_->stopTrace(std::chrono::system_clock::now()); UST_LOGGER_MARK_COMPLETED(kCollectionStage); auto logger = std::make_unique(profiler_->config()); profiler_->processTrace(*logger); - // Will follow up with another patch for logging URLs when ActivityTrace is moved. + // Will follow up with another patch for logging URLs when ActivityTrace is + // moved. UST_LOGGER_MARK_COMPLETED(kPostProcessingStage); // Logger Metadata contains a map of LOGs collected in Kineto // logger_level -> List of log lines // This will be added into the trace as metadata. 
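
// Illustrative sketch (not part of this diff): the start-iteration round-up in
// shouldActivateIterationConfig() above. A requested start that is already in
// the past moves to currentIter + warmupIterations and is then rounded up to
// the next multiple of PROFILE_START_ITERATION_ROUNDUP. Using the numbers from
// the comment above (current 9, warmup 5, roundup 100): 9 + 5 = 14, which
// rounds up to 100, so warmup begins at iteration 95.
#include <cstdint>
inline int64_t roundedUpStartIteration(
    int64_t currentIter,
    int64_t warmupIterations,
    int64_t roundUp) {
  int64_t start = currentIter + warmupIterations;
  int64_t rem = start % roundUp;
  return start + (rem == 0 ? 0 : roundUp - rem);
}
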
- std::unordered_map> - loggerMD = profiler_->getLoggerMetadata(); + std::unordered_map> loggerMD = + profiler_->getLoggerMetadata(); logger->setLoggerMetadata(std::move(loggerMD)); profiler_->reset(); @@ -344,7 +353,8 @@ std::unique_ptr ActivityProfilerController::stopTrace() } void ActivityProfilerController::addMetadata( - const std::string& key, const std::string& value) { + const std::string& key, + const std::string& value) { profiler_->addMetadata(key, value); } @@ -354,7 +364,8 @@ void ActivityProfilerController::logInvariantViolation( const std::string& error, const std::string& group_profile_id) { if (invariantViolationsLoggerFactory()) { - invariantViolationsLoggerFactory()->logInvariantViolation(profile_id, assertion, error, group_profile_id); + invariantViolationsLoggerFactory()->logInvariantViolation( + profile_id, assertion, error, group_profile_id); } } diff --git a/libkineto/src/ActivityProfilerController.h b/libkineto/src/ActivityProfilerController.h index f294d6b79..47d21b245 100644 --- a/libkineto/src/ActivityProfilerController.h +++ b/libkineto/src/ActivityProfilerController.h @@ -18,13 +18,13 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude #include "ActivityLoggerFactory.h" -#include "CuptiActivityProfiler.h" #include "ActivityProfilerInterface.h" #include "ActivityTraceInterface.h" #include "ConfigLoader.h" #include "CuptiActivityApi.h" -#include "LoggerCollector.h" +#include "CuptiActivityProfiler.h" #include "InvariantViolations.h" +#include "LoggerCollector.h" namespace KINETO_NAMESPACE { @@ -49,7 +49,8 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { ActivityLoggerFactory::FactoryFunc factory); static void setInvariantViolationsLoggerFactory( - const std::function()>& factory); + const std::function()>& + factory); // These API are used for On-Demand Tracing. 
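
// Illustrative usage (not part of this diff): metadata added through the public
// profiler interface flows through this controller's addMetadata() and is
// attached to the exported trace. The key/value strings below are made up.
#include <libkineto.h>
inline void tagCurrentTrace() {
  auto& profiler = libkineto::api().activityProfiler();
  profiler.addMetadata("job_id", "12345");    // values must be strings
  profiler.addMetadata("world_size", "8");
}
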
bool canAcceptConfig() override; @@ -67,8 +68,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { return profiler_->isActive(); } - void transferCpuTrace( - std::unique_ptr cpuTrace) { + void transferCpuTrace(std::unique_ptr cpuTrace) { return profiler_->transferCpuTrace(std::move(cpuTrace)); } @@ -76,18 +76,17 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { profiler_->recordThreadInfo(); } - void addChildActivityProfiler( - std::unique_ptr profiler) { + void addChildActivityProfiler(std::unique_ptr profiler) { profiler_->addChildActivityProfiler(std::move(profiler)); } void addMetadata(const std::string& key, const std::string& value); void logInvariantViolation( - const std::string& profile_id, - const std::string& assertion, - const std::string& error, - const std::string& group_profile_id = ""); + const std::string& profile_id, + const std::string& assertion, + const std::string& error, + const std::string& group_profile_id = ""); void pushCorrelationId(uint64_t id) { profiler_->pushCorrelationId(id); diff --git a/libkineto/src/ActivityProfilerProxy.cpp b/libkineto/src/ActivityProfilerProxy.cpp index 31880c73a..8400fd052 100644 --- a/libkineto/src/ActivityProfilerProxy.cpp +++ b/libkineto/src/ActivityProfilerProxy.cpp @@ -8,17 +8,17 @@ #include "ActivityProfilerProxy.h" +#include #include "ActivityProfilerController.h" #include "Config.h" #include "Logger.h" -#include namespace KINETO_NAMESPACE { ActivityProfilerProxy::ActivityProfilerProxy( - bool cpuOnly, ConfigLoader& configLoader) - : cpuOnly_(cpuOnly), configLoader_(configLoader) { -} + bool cpuOnly, + ConfigLoader& configLoader) + : cpuOnly_(cpuOnly), configLoader_(configLoader) {} ActivityProfilerProxy::~ActivityProfilerProxy() { delete controller_; @@ -70,7 +70,7 @@ void ActivityProfilerProxy::prepareTrace( controller_->prepareTrace(config); } -void ActivityProfilerProxy::toggleCollectionDynamic(const bool enable){ +void ActivityProfilerProxy::toggleCollectionDynamic(const bool enable) { controller_->toggleCollectionDynamic(enable); } @@ -78,8 +78,7 @@ void ActivityProfilerProxy::startTrace() { controller_->startTrace(); } -std::unique_ptr -ActivityProfilerProxy::stopTrace() { +std::unique_ptr ActivityProfilerProxy::stopTrace() { return controller_->stopTrace(); } @@ -108,12 +107,13 @@ void ActivityProfilerProxy::popUserCorrelationId() { } void ActivityProfilerProxy::transferCpuTrace( - std::unique_ptr traceBuffer) { + std::unique_ptr traceBuffer) { controller_->transferCpuTrace(std::move(traceBuffer)); } void ActivityProfilerProxy::addMetadata( - const std::string& key, const std::string& value) { + const std::string& key, + const std::string& value) { controller_->addMetadata(key, value); } @@ -131,7 +131,8 @@ void ActivityProfilerProxy::logInvariantViolation( const std::string& assertion, const std::string& error, const std::string& group_profile_id) { - controller_->logInvariantViolation(profile_id, assertion, error, group_profile_id); + controller_->logInvariantViolation( + profile_id, assertion, error, group_profile_id); } -} // namespace libkineto +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityProfilerProxy.h b/libkineto/src/ActivityProfilerProxy.h index 35875c602..5c11f0842 100644 --- a/libkineto/src/ActivityProfilerProxy.h +++ b/libkineto/src/ActivityProfilerProxy.h @@ -18,9 +18,9 @@ #include "ITraceActivity.h" namespace libkineto { - // previous declaration is struct so this one must be too. 
- struct CpuTraceBuffer; -} +// previous declaration is struct so this one must be too. +struct CpuTraceBuffer; +} // namespace libkineto namespace KINETO_NAMESPACE { @@ -31,7 +31,6 @@ class Config; class ConfigLoader; class ActivityProfilerProxy : public ActivityProfilerInterface { - public: ActivityProfilerProxy(bool cpuOnly, ConfigLoader& configLoader); ~ActivityProfilerProxy() override; @@ -52,9 +51,8 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { const std::set& activityTypes, const std::string& configStr = "") override; - void toggleCollectionDynamic( - const bool enable) override; - + void toggleCollectionDynamic(const bool enable) override; + void startTrace() override; void step() override; std::unique_ptr stopTrace() override; @@ -65,8 +63,7 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { void pushUserCorrelationId(uint64_t id) override; void popUserCorrelationId() override; - void transferCpuTrace( - std::unique_ptr traceBuffer) override; + void transferCpuTrace(std::unique_ptr traceBuffer) override; void addMetadata(const std::string& key, const std::string& value) override; @@ -85,4 +82,4 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { ActivityProfilerController* controller_{nullptr}; }; -} // namespace libkineto +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityTrace.h b/libkineto/src/ActivityTrace.h index b107d15f0..d7e6d0c31 100644 --- a/libkineto/src/ActivityTrace.h +++ b/libkineto/src/ActivityTrace.h @@ -23,9 +23,7 @@ class ActivityTrace : public ActivityTraceInterface { ActivityTrace( std::unique_ptr tmpLogger, const ActivityLoggerFactory& factory) - : memLogger_(std::move(tmpLogger)), - loggerFactory_(factory) { - } + : memLogger_(std::move(tmpLogger)), loggerFactory_(factory) {} const std::vector* activities() override { return memLogger_->traceActivities(); diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp index 91a5341a6..1b1cda8a5 100644 --- a/libkineto/src/ActivityType.cpp +++ b/libkineto/src/ActivityType.cpp @@ -17,36 +17,35 @@ struct ActivityTypeName { ActivityType type; }; -static constexpr std::array map{{ - {"cpu_op", ActivityType::CPU_OP}, - {"user_annotation", ActivityType::USER_ANNOTATION}, - {"gpu_user_annotation", ActivityType::GPU_USER_ANNOTATION}, - {"gpu_memcpy", ActivityType::GPU_MEMCPY}, - {"gpu_memset", ActivityType::GPU_MEMSET}, - {"kernel", ActivityType::CONCURRENT_KERNEL}, - {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, - {"cuda_runtime", ActivityType::CUDA_RUNTIME}, - {"cuda_driver", ActivityType::CUDA_DRIVER}, - {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, - {"python_function", ActivityType::PYTHON_FUNCTION}, - {"overhead", ActivityType::OVERHEAD}, - {"mtia_runtime", ActivityType::MTIA_RUNTIME}, - {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS}, - {"cuda_sync", ActivityType::CUDA_SYNC}, - {"glow_runtime", ActivityType::GLOW_RUNTIME}, - {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, - {"hpu_op", ActivityType::HPU_OP}, - {"xpu_runtime", ActivityType::XPU_RUNTIME}, - {"collective_comm", ActivityType::COLLECTIVE_COMM}, - {"mtia_workloadd", ActivityType::MTIA_WORKLOADD}, - {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME}, - {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER}, - {"ENUM_COUNT", ActivityType::ENUM_COUNT} -}}; +static constexpr std::array map{ + {{"cpu_op", ActivityType::CPU_OP}, + {"user_annotation", ActivityType::USER_ANNOTATION}, + {"gpu_user_annotation", 
ActivityType::GPU_USER_ANNOTATION}, + {"gpu_memcpy", ActivityType::GPU_MEMCPY}, + {"gpu_memset", ActivityType::GPU_MEMSET}, + {"kernel", ActivityType::CONCURRENT_KERNEL}, + {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, + {"cuda_runtime", ActivityType::CUDA_RUNTIME}, + {"cuda_driver", ActivityType::CUDA_DRIVER}, + {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, + {"python_function", ActivityType::PYTHON_FUNCTION}, + {"overhead", ActivityType::OVERHEAD}, + {"mtia_runtime", ActivityType::MTIA_RUNTIME}, + {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS}, + {"cuda_sync", ActivityType::CUDA_SYNC}, + {"glow_runtime", ActivityType::GLOW_RUNTIME}, + {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, + {"hpu_op", ActivityType::HPU_OP}, + {"xpu_runtime", ActivityType::XPU_RUNTIME}, + {"collective_comm", ActivityType::COLLECTIVE_COMM}, + {"mtia_workloadd", ActivityType::MTIA_WORKLOADD}, + {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME}, + {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER}, + {"ENUM_COUNT", ActivityType::ENUM_COUNT}}}; static constexpr bool matchingOrder(int idx = 0) { return map[idx].type == ActivityType::ENUM_COUNT || - ((idx == (int) map[idx].type) && matchingOrder(idx + 1)); + ((idx == (int)map[idx].type) && matchingOrder(idx + 1)); } static_assert(matchingOrder(), "ActivityTypeName map is out of order"); @@ -71,7 +70,8 @@ const std::array activityTypes() { return res; } -const std::array defaultActivityTypes() { +const std::array +defaultActivityTypes() { std::array res; for (int i = 0; i < defaultActivityTypeCount; i++) { res[i] = map[i].type; @@ -79,5 +79,4 @@ const std::array defaultActivityTypes() return res; } - } // namespace libkineto diff --git a/libkineto/src/ApproximateClock.h b/libkineto/src/ApproximateClock.h index 991085e4f..6907415fc 100644 --- a/libkineto/src/ApproximateClock.h +++ b/libkineto/src/ApproximateClock.h @@ -20,7 +20,6 @@ namespace libkineto { - #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) #define KINETO_RDTSC #if defined(_MSC_VER) @@ -37,7 +36,6 @@ namespace libkineto { #endif #endif - #if defined(_MSC_VER) && !defined(__clang__) #define KINETO_UNUSED __pragma(warning(suppress : 4100 4101)) #else @@ -86,7 +84,7 @@ inline auto getApproximateTime() { using approx_time_t = decltype(getApproximateTime()); static_assert( - std::is_same_v || + std::is_same_v || std::is_same_v, "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index dc588c250..18103440b 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include "Logger.h" #include "ThreadUtil.h" @@ -73,14 +73,18 @@ constexpr char kActivityTypesKey[] = "ACTIVITY_TYPES"; constexpr char kActivitiesLogFileKey[] = "ACTIVITIES_LOG_FILE"; constexpr char kActivitiesDurationKey[] = "ACTIVITIES_DURATION_SECS"; constexpr char kActivitiesDurationMsecsKey[] = "ACTIVITIES_DURATION_MSECS"; -constexpr char kActivitiesWarmupDurationSecsKey[] = "ACTIVITIES_WARMUP_PERIOD_SECS"; +constexpr char kActivitiesWarmupDurationSecsKey[] = + "ACTIVITIES_WARMUP_PERIOD_SECS"; constexpr char kActivitiesMaxGpuBufferSizeKey[] = "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB"; -constexpr char kActivitiesDisplayCudaSyncWaitEvents[] = "ACTIVITIES_DISPLAY_CUDA_SYNC_WAIT_EVENTS"; +constexpr char kActivitiesDisplayCudaSyncWaitEvents[] = + 
"ACTIVITIES_DISPLAY_CUDA_SYNC_WAIT_EVENTS"; // Client Interface -// TODO: keep supporting these older config options, deprecate in the future using replacements. -constexpr char kClientInterfaceEnableOpInputsCollection[] = "CLIENT_INTERFACE_ENABLE_OP_INPUTS_COLLECTION"; +// TODO: keep supporting these older config options, deprecate in the future +// using replacements. +constexpr char kClientInterfaceEnableOpInputsCollection[] = + "CLIENT_INTERFACE_ENABLE_OP_INPUTS_COLLECTION"; constexpr char kPythonStackTrace[] = "PYTHON_STACK_TRACE"; // Profiler Config Options constexpr char kProfileReportInputShapes[] = "PROFILE_REPORT_INPUT_SHAPES"; @@ -135,9 +139,7 @@ constexpr char kProfileStartIterationRoundUpKey[] = "PROFILE_START_ITERATION_ROUNDUP"; constexpr char kRequestTraceID[] = "REQUEST_TRACE_ID"; -constexpr char kRequestGroupTraceID[] = - "REQUEST_GROUP_TRACE_ID"; - +constexpr char kRequestGroupTraceID[] = "REQUEST_GROUP_TRACE_ID"; // Enable on-demand trigger via kill -USR2 // When triggered in this way, /tmp/libkineto.conf will be used as config. @@ -147,7 +149,8 @@ constexpr char kEnableSigUsr2Key[] = "ENABLE_SIGUSR2"; // and disable thrift communication with dynolog daemon constexpr char kEnableIpcFabricKey[] = "ENABLE_IPC_FABRIC"; // Period to pull on-demand config from dynolog daemon -constexpr char kOnDemandConfigUpdateIntervalSecsKey[] = "ON_DEMAND_CONFIG_UPDATE_INTERVAL_SECS"; +constexpr char kOnDemandConfigUpdateIntervalSecsKey[] = + "ON_DEMAND_CONFIG_UPDATE_INTERVAL_SECS"; // Verbose log level // The actual glog is not used and --v and --vmodule has no effect. @@ -235,7 +238,8 @@ Config::Config() requestTimestamp_(milliseconds(0)), enableSigUsr2_(false), enableIpcFabric_(false), - onDemandConfigUpdateIntervalSecs_(kDefaultOnDemandConfigUpdateIntervalSecs), + onDemandConfigUpdateIntervalSecs_( + kDefaultOnDemandConfigUpdateIntervalSecs), cuptiDeviceBufferSize_(kDefaultCuptiDeviceBufferSize), cuptiDeviceBufferPoolLimit_(kDefaultCuptiDeviceBufferPoolLimit) { auto factories = configFactories(); @@ -250,14 +254,13 @@ Config::Config() #if __linux__ bool isDaemonEnvVarSet() { static bool rc = [] { - void *ptr = getenv(kUseDaemonEnvVar); - return ptr != nullptr; + void* ptr = getenv(kUseDaemonEnvVar); + return ptr != nullptr; }(); return rc; } #endif - std::shared_ptr Config::getStaticObjectsLifetimeHandle() { return configFactories(); } @@ -385,11 +388,13 @@ bool Config::handleOption(const std::string& name, std::string& val) { activitiesLogUrl_ = fmt::format("file://{}", val); size_t jidx = activitiesLogUrl_.find(".pt.trace.json"); if (jidx != std::string::npos) { - activitiesLogUrl_.replace(jidx, 14, fmt::format("_{}.pt.trace.json", processId())); + activitiesLogUrl_.replace( + jidx, 14, fmt::format("_{}.pt.trace.json", processId())); } else { jidx = activitiesLogUrl_.find(".json"); if (jidx != std::string::npos) { - activitiesLogUrl_.replace(jidx, 5, fmt::format("_{}.json", processId())); + activitiesLogUrl_.replace( + jidx, 5, fmt::format("_{}.json", processId())); } } activitiesOnDemandTimestamp_ = timestamp(); @@ -429,8 +434,7 @@ bool Config::handleOption(const std::string& name, std::string& val) { // Common else if (!name.compare(kRequestTimestampKey)) { - LOG(INFO) << kRequestTimestampKey - << " has been deprecated - please use " + LOG(INFO) << kRequestTimestampKey << " has been deprecated - please use " << kProfileStartTimeKey; requestTimestamp_ = handleRequestTimestamp(toInt64(val)); } else if (!name.compare(kProfileStartTimeKey)) { @@ -513,12 +517,13 @@ void 
Config::validate( VLOG(0) << "No explicit timestamp has been set. " << "Defaulting it to now + activitiesWarmupDuration with a buffer of double the period of the monitoring thread."; - profileStartTime_ = fallbackProfileStartTime + - activitiesWarmupDuration() + 2 * Config::kControllerIntervalMsecs; + profileStartTime_ = fallbackProfileStartTime + activitiesWarmupDuration() + + 2 * Config::kControllerIntervalMsecs; } if (profileStartIterationRoundUp_ == 0) { - // setting to 0 will mess up modulo arithmetic, set it to -1 so it has no effect + // setting to 0 will mess up modulo arithmetic, set it to -1 so it has no + // effect LOG(WARNING) << "Profiler start iteration round up should be >= 1."; profileStartIterationRoundUp_ = -1; } @@ -543,16 +548,18 @@ void Config::printActivityProfilerConfig(std::ostream& s) const { s << " Log file: " << activitiesLogFile() << std::endl; if (hasProfileStartIteration()) { s << " Trace start Iteration: " << profileStartIteration() << std::endl; - s << " Trace warmup Iterations: " << activitiesWarmupIterations() << std::endl; - s << " Trace profile Iterations: " << activitiesRunIterations() << std::endl; + s << " Trace warmup Iterations: " << activitiesWarmupIterations() + << std::endl; + s << " Trace profile Iterations: " << activitiesRunIterations() + << std::endl; if (profileStartIterationRoundUp() > 0) { - s << " Trace start iteration roundup : " << profileStartIterationRoundUp() - << std::endl; + s << " Trace start iteration roundup : " + << profileStartIterationRoundUp() << std::endl; } } else if (hasProfileStartTime()) { std::time_t t_c = system_clock::to_time_t(requestTimestamp()); s << " Trace start time: " - << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(t_c)); + << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(t_c)); s << " Trace duration: " << activitiesDuration().count() << "ms" << std::endl; s << " Warmup duration: " << activitiesWarmupDuration().count() << "s" @@ -566,13 +573,13 @@ void Config::printActivityProfilerConfig(std::ostream& s) const { for (const auto& activity : selectedActivityTypes_) { activities.push_back(toString(activity)); } - s << " Enabled activities: " - << fmt::format("{}", fmt::join(activities, ",")) << std::endl; + s << " Enabled activities: " << fmt::format("{}", fmt::join(activities, ",")) + << std::endl; AbstractConfig::printActivityProfilerConfig(s); } -void Config::setActivityDependentConfig(){ +void Config::setActivityDependentConfig() { AbstractConfig::setActivityDependentConfig(); } diff --git a/libkineto/src/ConfigLoader.cpp b/libkineto/src/ConfigLoader.cpp index 72fb04e4a..42b0b8163 100644 --- a/libkineto/src/ConfigLoader.cpp +++ b/libkineto/src/ConfigLoader.cpp @@ -26,7 +26,6 @@ using namespace std::chrono; namespace KINETO_NAMESPACE { - constexpr char kConfigFileEnvVar[] = "KINETO_CONFIG"; #ifdef __linux__ constexpr char kConfigFile[] = "/etc/libkineto.conf"; @@ -90,7 +89,9 @@ static void setupSignalHandler(bool enableSigUsr2) { } // return an empty string if reading gets any errors. Otherwise a config string. -static std::string readConfigFromConfigFile(const char* filename, bool verbose=true) { +static std::string readConfigFromConfigFile( + const char* filename, + bool verbose = true) { // Read whole file into a string. 
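
// Illustrative sketch (not part of this diff): the config text parsed here is
// plain KEY=VALUE lines. The same format is accepted from /etc/libkineto.conf,
// from the file named by the KINETO_CONFIG environment variable, and from the
// on-demand /tmp/libkineto.conf that is read when SIGUSR2 triggers a trace.
// The keys below appear in Config.cpp above; the values are made-up examples.
constexpr const char* kExampleLibkinetoConf =
    "ACTIVITIES_WARMUP_PERIOD_SECS=5\n"
    "ACTIVITIES_DURATION_SECS=10\n"
    "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB=128\n"
    "ACTIVITIES_LOG_FILE=/tmp/kineto_trace.json\n";
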
std::ifstream file(filename); std::string conf; @@ -109,7 +110,8 @@ static std::string readConfigFromConfigFile(const char* filename, bool verbose=t static std::function()>& daemonConfigLoaderFactory() { - static std::function()> factory = nullptr; + static std::function()> factory = + nullptr; return factory; } @@ -144,12 +146,11 @@ int ConfigLoader::contextCountForGpu(uint32_t device) { ConfigLoader::ConfigLoader() : configUpdateIntervalSecs_(kConfigUpdateIntervalSecs), - // on-demand config will be overwritten by the value read from the regular config - // so the initial value is not important + // on-demand config will be overwritten by the value read from the regular + // config so the initial value is not important onDemandConfigUpdateIntervalSecs_(kConfigUpdateIntervalSecs), stopFlag_(false), - onDemandSignal_(false) { -} + onDemandSignal_(false) {} void ConfigLoader::startThread() { if (!updateThread_) { @@ -218,7 +219,7 @@ const char* ConfigLoader::customConfigFileName() { return getenv(kConfigFileEnvVar); } -const std::string ConfigLoader::getConfString(){ +const std::string ConfigLoader::getConfString() { return readConfigFromConfigFile(configFileName(), false); } @@ -241,8 +242,7 @@ void ConfigLoader::updateBaseConfig() { } setupSignalHandler(config_->sigUsr2Enabled()); SET_LOG_VERBOSITY_LEVEL( - config_->verboseLogLevel(), - config_->verboseLogModules()); + config_->verboseLogLevel(), config_->verboseLogModules()); VLOG(0) << "Detected base config change"; } } @@ -250,11 +250,11 @@ void ConfigLoader::updateBaseConfig() { void ConfigLoader::configureFromSignal( time_point now, Config& config) { - LOG(INFO) << "Received on-demand profiling signal, " - << "reading config from " << kOnDemandConfigFile; + LOG(INFO) << "Received on-demand profiling signal, " << "reading config from " + << kOnDemandConfigFile; // Reset start time to 0 in order to compute new default start time - const std::string config_str = "PROFILE_START_TIME=0\n" - + readConfigFromConfigFile(kOnDemandConfigFile); + const std::string config_str = + "PROFILE_START_TIME=0\n" + readConfigFromConfigFile(kOnDemandConfigFile); config.parse(config_str); config.setSignalDefaults(); notifyHandlers(config); @@ -287,16 +287,19 @@ void ConfigLoader::updateConfigThread() { // Besides, on-demand update frequency can be configured via base config. // initialze with some time buffer in the past - auto prev_config_load_time = system_clock::now() - configUpdateIntervalSecs_ * 2; + auto prev_config_load_time = + system_clock::now() - configUpdateIntervalSecs_ * 2; auto prev_on_demand_load_time = prev_config_load_time; auto onDemandConfig = std::make_unique(); // This can potentially sleep for long periods of time, so allow // the destructor to wake it to avoid a 5-minute long destruct period. 
for (;;) { - auto interval = std::min( - configUpdateIntervalSecs_ + prev_config_load_time, - onDemandConfigUpdateIntervalSecs_ + prev_on_demand_load_time) - system_clock::now(); + auto interval = + std::min( + configUpdateIntervalSecs_ + prev_config_load_time, + onDemandConfigUpdateIntervalSecs_ + prev_on_demand_load_time) - + system_clock::now(); if (interval.count() > 0) { std::unique_lock lock(updateThreadMutex_); updateThreadCondVar_.wait_for(lock, interval); @@ -307,13 +310,15 @@ void ConfigLoader::updateConfigThread() { auto now = system_clock::now(); if (now > prev_config_load_time + configUpdateIntervalSecs_) { updateBaseConfig(); - onDemandConfigUpdateIntervalSecs_ = config_->onDemandConfigUpdateIntervalSecs(); + onDemandConfigUpdateIntervalSecs_ = + config_->onDemandConfigUpdateIntervalSecs(); prev_config_load_time = now; } if (onDemandSignal_.exchange(false)) { onDemandConfig = config_->clone(); configureFromSignal(now, *onDemandConfig); - } else if (now > prev_on_demand_load_time + onDemandConfigUpdateIntervalSecs_) { + } else if ( + now > prev_on_demand_load_time + onDemandConfigUpdateIntervalSecs_) { onDemandConfig = std::make_unique(); configureFromDaemon(now, *onDemandConfig); prev_on_demand_load_time = now; diff --git a/libkineto/src/ConfigLoader.h b/libkineto/src/ConfigLoader.h index ce5ffe96d..84366495c 100644 --- a/libkineto/src/ConfigLoader.h +++ b/libkineto/src/ConfigLoader.h @@ -24,7 +24,7 @@ #include "ILoggerObserver.h" namespace libkineto { - class LibkinetoApi; +class LibkinetoApi; } namespace KINETO_NAMESPACE { @@ -34,14 +34,9 @@ class IDaemonConfigLoader; class ConfigLoader { public: - static ConfigLoader& instance(); - enum ConfigKind { - ActivityProfiler = 0, - EventProfiler, - NumConfigKinds - }; + enum ConfigKind { ActivityProfiler = 0, EventProfiler, NumConfigKinds }; struct ConfigHandler { virtual ~ConfigHandler() {} @@ -57,8 +52,8 @@ class ConfigLoader { void removeHandler(ConfigKind kind, ConfigHandler* handler) { std::lock_guard lock(updateThreadMutex_); - auto it = std::find( - handlers_[kind].begin(), handlers_[kind].end(), handler); + auto it = + std::find(handlers_[kind].begin(), handlers_[kind].end(), handler); if (it != handlers_[kind].end()) { handlers_[kind].erase(it); } diff --git a/libkineto/src/CuptiActivity.cpp b/libkineto/src/CuptiActivity.cpp index d25916321..0bed9f40a 100644 --- a/libkineto/src/CuptiActivity.cpp +++ b/libkineto/src/CuptiActivity.cpp @@ -12,9 +12,8 @@ #include "Demangle.h" #include "DeviceProperties.h" -#include "output_base.h" #include "Logger.h" - +#include "output_base.h" namespace KINETO_NAMESPACE { @@ -23,12 +22,12 @@ using namespace libkineto; // forward declaration uint32_t contextIdtoDeviceId(uint32_t contextId); -template<> +template <> inline const std::string GpuActivity::name() const { return demangle(raw().name); } -template<> +template <> inline ActivityType GpuActivity::type() const { return ActivityType::CONCURRENT_KERNEL; } @@ -39,23 +38,22 @@ inline bool isWaitEventSync(CUpti_ActivitySynchronizationType type) { inline bool isEventSync(CUpti_ActivitySynchronizationType type) { return ( - type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE || - type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT); + type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE || + type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT); } inline std::string eventSyncInfo( const CUpti_ActivitySynchronization& act, int32_t srcStream, - int32_t srcCorrId - ) { - return fmt::format(R"JSON( + int32_t 
srcCorrId) { + return fmt::format( + R"JSON( "wait_on_stream": {}, "wait_on_cuda_event_record_corr_id": {}, "wait_on_cuda_event_id": {},)JSON", srcStream, srcCorrId, - act.cudaEventId - ); + act.cudaEventId); } inline const std::string CudaSyncActivity::name() const { @@ -95,7 +93,7 @@ inline const std::string CudaSyncActivity::metadataJson() const { return ""; } -template +template inline void GpuActivity::log(ActivityLogger& logger) const { logger.handleActivity(*this); } @@ -106,8 +104,9 @@ constexpr int64_t us(int64_t timestamp) { return timestamp / 1000; } -template<> -inline const std::string GpuActivity::metadataJson() const { +template <> +inline const std::string GpuActivity::metadataJson() + const { const CUpti_ActivityKernel4& kernel = raw(); float blocksPerSmVal = blocksPerSm(kernel); float warpsPerSmVal = warpsPerSm(kernel); @@ -136,7 +135,6 @@ inline const std::string GpuActivity::metadataJson() cons // clang-format on } - inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) { return fmt::format( "Memcpy {} ({} -> {})", @@ -145,12 +143,12 @@ inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) { memoryKindString((CUpti_ActivityMemoryKind)dst)); } -template<> +template <> inline ActivityType GpuActivity::type() const { return ActivityType::GPU_MEMCPY; } -template<> +template <> inline const std::string GpuActivity::name() const { return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); } @@ -159,8 +157,9 @@ inline std::string bandwidth(uint64_t bytes, uint64_t duration) { return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); } -template<> -inline const std::string GpuActivity::metadataJson() const { +template <> +inline const std::string GpuActivity::metadataJson() + const { const CUpti_ActivityMemcpy& memcpy = raw(); // clang-format off return fmt::format(R"JSON( @@ -173,19 +172,19 @@ inline const std::string GpuActivity::metadataJson() const // clang-format on } - -template<> +template <> inline ActivityType GpuActivity::type() const { return ActivityType::GPU_MEMCPY; } -template<> +template <> inline const std::string GpuActivity::name() const { return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); } -template<> -inline const std::string GpuActivity::metadataJson() const { +template <> +inline const std::string GpuActivity::metadataJson() + const { const CUpti_ActivityMemcpy2& memcpy = raw(); // clang-format off return fmt::format(R"JSON( @@ -200,20 +199,21 @@ inline const std::string GpuActivity::metadataJson() cons // clang-format on } -template<> +template <> inline const std::string GpuActivity::name() const { const char* memory_kind = - memoryKindString((CUpti_ActivityMemoryKind)raw().memoryKind); + memoryKindString((CUpti_ActivityMemoryKind)raw().memoryKind); return fmt::format("Memset ({})", memory_kind); } -template<> +template <> inline ActivityType GpuActivity::type() const { return ActivityType::GPU_MEMSET; } -template<> -inline const std::string GpuActivity::metadataJson() const { +template <> +inline const std::string GpuActivity::metadataJson() + const { const CUpti_ActivityMemset& memset = raw(); // clang-format off return fmt::format(R"JSON( @@ -268,17 +268,19 @@ inline bool RuntimeActivity::flowStart() const { } inline const std::string RuntimeActivity::metadataJson() const { - return fmt::format(R"JSON( + return fmt::format( + R"JSON( "cbid": {}, "correlation": {})JSON", - activity_.cbid, activity_.correlationId); + activity_.cbid, + activity_.correlationId); } inline bool 
isKernelLaunchApi(const CUpti_ActivityAPI& activity_) { return activity_.cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel #if defined(CUDA_VERSION) && CUDA_VERSION >= 11060 - || activity_.cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx + || activity_.cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx #endif - ; + ; } inline bool DriverActivity::flowStart() const { @@ -286,9 +288,11 @@ inline bool DriverActivity::flowStart() const { } inline const std::string DriverActivity::metadataJson() const { - return fmt::format(R"JSON( + return fmt::format( + R"JSON( "cbid": {}, "correlation": {})JSON", - activity_.cbid, activity_.correlationId); + activity_.cbid, + activity_.correlationId); } inline const std::string DriverActivity::name() const { @@ -306,7 +310,7 @@ inline const std::string DriverActivity::name() const { } } -template +template inline const std::string GpuActivity::metadataJson() const { return ""; } diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h index d7c9d6b34..c5ff6816b 100644 --- a/libkineto/src/CuptiActivity.h +++ b/libkineto/src/CuptiActivity.h @@ -12,14 +12,14 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ITraceActivity.h" +#include "ApproximateClock.h" #include "GenericTraceActivity.h" +#include "ITraceActivity.h" #include "ThreadUtil.h" #include "cupti_strings.h" -#include "ApproximateClock.h" namespace libkineto { - class ActivityLogger; +class ActivityLogger; } namespace KINETO_NAMESPACE { @@ -27,7 +27,7 @@ namespace KINETO_NAMESPACE { using namespace libkineto; struct TraceSpan; -// This function allows us to activate/deactivate TSC CUPTI callbacks +// This function allows us to activate/deactivate TSC CUPTI callbacks // via a killswitch bool& use_cupti_tsc(); @@ -36,7 +36,7 @@ bool& use_cupti_tsc(); // using the ITraceActivity interface and logged via ActivityLogger. 
// Abstract base class, templated on Cupti activity type -template +template struct CuptiActivity : public ITraceActivity { explicit CuptiActivity(const T* activity, const ITraceActivity* linked) : activity_(*activity), linked_(linked) {} @@ -44,36 +44,51 @@ struct CuptiActivity : public ITraceActivity { // we use the default system clock so no conversion needed same for all // ifdefs below int64_t timestamp() const override { - #if defined(_WIN32) || CUDA_VERSION < 11060 +#if defined(_WIN32) || CUDA_VERSION < 11060 return activity_.start; - #else - if (use_cupti_tsc()){ +#else + if (use_cupti_tsc()) { return get_time_converter()(activity_.start); } else { return activity_.start; } - #endif +#endif } int64_t duration() const override { - #if defined(_WIN32) || CUDA_VERSION < 11060 +#if defined(_WIN32) || CUDA_VERSION < 11060 return activity_.end - activity_.start; - #else - if (use_cupti_tsc()){ - return get_time_converter()(activity_.end) - get_time_converter()(activity_.start); +#else + if (use_cupti_tsc()) { + return get_time_converter()(activity_.end) - + get_time_converter()(activity_.start); } else { return activity_.end - activity_.start; } - #endif +#endif } // TODO(T107507796): Deprecate ITraceActivity - int64_t correlationId() const override {return 0;} - int32_t getThreadId() const override {return 0;} - const ITraceActivity* linkedActivity() const override {return linked_;} - int flowType() const override {return kLinkAsyncCpuGpu;} - int flowId() const override {return correlationId();} - const T& raw() const {return activity_;} - const TraceSpan* traceSpan() const override {return nullptr;} + int64_t correlationId() const override { + return 0; + } + int32_t getThreadId() const override { + return 0; + } + const ITraceActivity* linkedActivity() const override { + return linked_; + } + int flowType() const override { + return kLinkAsyncCpuGpu; + } + int flowId() const override { + return correlationId(); + } + const T& raw() const { + return activity_; + } + const TraceSpan* traceSpan() const override { + return nullptr; + } protected: const T& activity_; @@ -87,12 +102,22 @@ struct RuntimeActivity : public CuptiActivity { const ITraceActivity* linked, int32_t threadId) : CuptiActivity(activity, linked), threadId_(threadId) {} - int64_t correlationId() const override {return activity_.correlationId;} - int64_t deviceId() const override {return processId();} - int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::CUDA_RUNTIME;} + int64_t correlationId() const override { + return activity_.correlationId; + } + int64_t deviceId() const override { + return processId(); + } + int64_t resourceId() const override { + return threadId_; + } + ActivityType type() const override { + return ActivityType::CUDA_RUNTIME; + } bool flowStart() const override; - const std::string name() const override {return runtimeCbidName(activity_.cbid);} + const std::string name() const override { + return runtimeCbidName(activity_.cbid); + } void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; @@ -107,10 +132,18 @@ struct DriverActivity : public CuptiActivity { const ITraceActivity* linked, int32_t threadId) : CuptiActivity(activity, linked), threadId_(threadId) {} - int64_t correlationId() const override {return activity_.correlationId;} - int64_t deviceId() const override {return processId();} - int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return 
ActivityType::CUDA_DRIVER;} + int64_t correlationId() const override { + return activity_.correlationId; + } + int64_t deviceId() const override { + return processId(); + } + int64_t resourceId() const override { + return threadId_; + } + ActivityType type() const override { + return ActivityType::CUDA_DRIVER; + } bool flowStart() const override; const std::string name() const override; void log(ActivityLogger& logger) const override; @@ -125,40 +158,48 @@ struct OverheadActivity : public CuptiActivity { explicit OverheadActivity( const CUpti_ActivityOverhead* activity, const ITraceActivity* linked, - int32_t threadId=0) + int32_t threadId = 0) : CuptiActivity(activity, linked), threadId_(threadId) {} - int64_t timestamp() const override { - #if defined(_WIN32) || CUDA_VERSION < 11060 +#if defined(_WIN32) || CUDA_VERSION < 11060 return activity_.start; - #else - if (use_cupti_tsc()){ +#else + if (use_cupti_tsc()) { return get_time_converter()(activity_.start); } else { return activity_.start; } - #endif +#endif } int64_t duration() const override { - #if defined(_WIN32) || CUDA_VERSION < 11060 +#if defined(_WIN32) || CUDA_VERSION < 11060 return activity_.end - activity_.start; - #else - if (use_cupti_tsc()){ - return get_time_converter()(activity_.end) - get_time_converter()(activity_.start); +#else + if (use_cupti_tsc()) { + return get_time_converter()(activity_.end) - + get_time_converter()(activity_.start); } else { return activity_.end - activity_.start; } - #endif +#endif } // TODO: Update this with PID ordering - int64_t deviceId() const override {return -1;} - int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::OVERHEAD;} + int64_t deviceId() const override { + return -1; + } + int64_t resourceId() const override { + return threadId_; + } + ActivityType type() const override { + return ActivityType::OVERHEAD; + } bool flowStart() const override; - const std::string name() const override {return overheadKindString(activity_.overheadKind);} + const std::string name() const override { + return overheadKindString(activity_.overheadKind); + } void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; @@ -176,15 +217,23 @@ struct CudaSyncActivity : public CuptiActivity { : CuptiActivity(activity, linked), srcStream_(srcStream), srcCorrId_(srcCorrId) {} - int64_t correlationId() const override {return raw().correlationId;} + int64_t correlationId() const override { + return raw().correlationId; + } int64_t deviceId() const override; int64_t resourceId() const override; - ActivityType type() const override {return ActivityType::CUDA_SYNC;} - bool flowStart() const override {return false;} + ActivityType type() const override { + return ActivityType::CUDA_SYNC; + } + bool flowStart() const override { + return false; + } const std::string name() const override; void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; - const CUpti_ActivitySynchronization& raw() const {return CuptiActivity::raw();} + const CUpti_ActivitySynchronization& raw() const { + return CuptiActivity::raw(); + } private: const int32_t srcStream_; @@ -193,19 +242,29 @@ struct CudaSyncActivity : public CuptiActivity { // Base class for GPU activities. // Can also be instantiated directly. 
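
// Illustrative sketch (not part of this diff): the timestamp()/duration()
// overrides above convert raw CUPTI timestamps only when the TSC killswitch
// use_cupti_tsc() is enabled; otherwise the record's start/end values are
// returned unchanged. get_time_converter() is the helper referenced above.
inline int64_t toTraceTimestamp(uint64_t cuptiTimestamp) {
  return use_cupti_tsc() ? get_time_converter()(cuptiTimestamp)
                         : static_cast<int64_t>(cuptiTimestamp);
}
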
-template +template struct GpuActivity : public CuptiActivity { explicit GpuActivity(const T* activity, const ITraceActivity* linked) : CuptiActivity(activity, linked) {} - int64_t correlationId() const override {return raw().correlationId;} - int64_t deviceId() const override {return raw().deviceId;} - int64_t resourceId() const override {return raw().streamId;} + int64_t correlationId() const override { + return raw().correlationId; + } + int64_t deviceId() const override { + return raw().deviceId; + } + int64_t resourceId() const override { + return raw().streamId; + } ActivityType type() const override; - bool flowStart() const override {return false;} + bool flowStart() const override { + return false; + } const std::string name() const override; void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; - const T& raw() const {return CuptiActivity::raw();} + const T& raw() const { + return CuptiActivity::raw(); + } }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiActivityApi.cpp b/libkineto/src/CuptiActivityApi.cpp index 154492bb3..116355a37 100644 --- a/libkineto/src/CuptiActivityApi.cpp +++ b/libkineto/src/CuptiActivityApi.cpp @@ -13,9 +13,9 @@ #include #include -#include "Logger.h" #include "Config.h" #include "DeviceUtil.h" +#include "Logger.h" using namespace std::chrono; @@ -41,18 +41,22 @@ inline bool cuptiLazyInit_() { inline void reenableCuptiCallbacks_(std::shared_ptr& cbapi_) { // Re-enable callbacks from the past if they exist. LOG(INFO) << "Re-enable previous CUPTI callbacks - Starting"; - VLOG(1) << " CUPTI subscriber before reinit:" << cbapi_->getCuptiSubscriber(); + VLOG(1) << " CUPTI subscriber before reinit:" + << cbapi_->getCuptiSubscriber(); cbapi_->initCallbackApi(); if (cbapi_->initSuccess()) { - VLOG(1) << " CUPTI subscriber after reinit:" << cbapi_->getCuptiSubscriber(); + VLOG(1) << " CUPTI subscriber after reinit:" + << cbapi_->getCuptiSubscriber(); bool status = cbapi_->reenableCallbacks(); if (!status) { - LOG(WARNING) << "Re-enable previous CUPTI callbacks - Failed to reenableCallbacks"; + LOG(WARNING) + << "Re-enable previous CUPTI callbacks - Failed to reenableCallbacks"; } else { LOG(INFO) << "Re-enable previous CUPTI callbacks - Successful"; } } else { - LOG(WARNING) << "Re-enable previous CUPTI callbacks - Failed to initCallbackApi"; + LOG(WARNING) + << "Re-enable previous CUPTI callbacks - Failed to initCallbackApi"; } } #endif @@ -68,14 +72,14 @@ void CuptiActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { return; } VLOG(2) << "pushCorrelationID(" << id << ")"; - switch(type) { + switch (type) { case Default: CUPTI_CALL(cuptiActivityPushExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, id)); - break; + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, id)); + break; case User: CUPTI_CALL(cuptiActivityPushExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, id)); + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, id)); } #endif } @@ -85,14 +89,14 @@ void CuptiActivityApi::popCorrelationID(CorrelationFlowType type) { if (!singleton().externalCorrelationEnabled_) { return; } - switch(type) { + switch (type) { case Default: CUPTI_CALL(cuptiActivityPopExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, nullptr)); - break; + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, nullptr)); + break; case User: CUPTI_CALL(cuptiActivityPopExternalCorrelationId( - CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, nullptr)); + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, nullptr)); } 
#endif } @@ -121,14 +125,16 @@ void CuptiActivityApi::setMaxBufferSize(int size) { void CuptiActivityApi::setDeviceBufferSize(size_t size) { #ifdef HAS_CUPTI size_t valueSize = sizeof(size_t); - CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &valueSize, &size)); + CUPTI_CALL(cuptiActivitySetAttribute( + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &valueSize, &size)); #endif } void CuptiActivityApi::setDeviceBufferPoolLimit(size_t limit) { #ifdef HAS_CUPTI size_t valueSize = sizeof(size_t); - CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &valueSize, &limit)); + CUPTI_CALL(cuptiActivitySetAttribute( + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &valueSize, &limit)); #endif } @@ -155,14 +161,15 @@ void CUPTIAPI CuptiActivityApi::bufferRequestedTrampoline( } void CuptiActivityApi::bufferRequested( - uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + uint8_t** buffer, + size_t* size, + size_t* maxNumRecords) { std::lock_guard guard(mutex_); if (allocatedGpuTraceBuffers_.size() >= maxGpuBufferCount_) { stopCollection = true; LOG(WARNING) << "Exceeded max GPU buffer count (" - << allocatedGpuTraceBuffers_.size() - << " > " << maxGpuBufferCount_ - << ") - terminating tracing"; + << allocatedGpuTraceBuffers_.size() << " > " + << maxGpuBufferCount_ << ") - terminating tracing"; } auto buf = std::make_unique(kBufSize); @@ -175,8 +182,7 @@ void CuptiActivityApi::bufferRequested( } #endif -std::unique_ptr -CuptiActivityApi::activityBuffers() { +std::unique_ptr CuptiActivityApi::activityBuffers() { { std::lock_guard guard(mutex_); if (allocatedGpuTraceBuffers_.empty()) { @@ -272,12 +278,11 @@ void CuptiActivityApi::bufferCompleted( uint8_t* buffer, size_t /* unused */, size_t validSize) { - std::lock_guard guard(mutex_); auto it = allocatedGpuTraceBuffers_.find(buffer); if (it == allocatedGpuTraceBuffers_.end()) { LOG(ERROR) << "bufferCompleted called with unknown buffer: " - << (void*) buffer; + << (void*)buffer; return; } @@ -312,8 +317,8 @@ void CuptiActivityApi::enableCuptiActivities( } cbapi_.reset(); - CUPTI_CALL( - cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline)); + CUPTI_CALL(cuptiActivityRegisterCallbacks( + bufferRequestedTrampoline, bufferCompletedTrampoline)); externalCorrelationEnabled_ = false; for (const auto& activity : selected_activities) { @@ -365,7 +370,8 @@ void CuptiActivityApi::disableCuptiActivities( CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); } if (activity == ActivityType::EXTERNAL_CORRELATION) { - CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + CUPTI_CALL( + cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); } if (activity == ActivityType::CUDA_SYNC) { CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); @@ -392,7 +398,8 @@ void CuptiActivityApi::teardownContext() { if (cuptiTearDown_()) { LOG(INFO) << "teardownCupti starting"; - // PyTorch Profiler is synchronous, so teardown needs to be run async in this thread. + // PyTorch Profiler is synchronous, so teardown needs to be run async in + // this thread. 
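    // Editorial sketch, not part of the original patch: the thread below
    // subscribes the CUPTI runtime/driver callback domains, force-flushes any
    // outstanding activity buffers, sets teardownCupti_ = 1, and then waits on
    // finalizeCond_ until the flag is cleared again, presumably after a CUPTI
    // exit callback has run cuptiFinalize (see callback_switchboard in
    // CuptiCallbackApi.cpp).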
std::thread teardownThread([&] { auto cbapi_ = CuptiCallbackApi::singleton(); if (!cbapi_->initSuccess()) { @@ -402,21 +409,25 @@ void CuptiActivityApi::teardownContext() { return; } } - // Subscribe callbacks to call cuptiFinalize in the exit callback of these APIs + // Subscribe callbacks to call cuptiFinalize in the exit callback of these + // APIs bool status = cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_RUNTIME_API); - status = status && cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API); + status = + status && cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API); if (!status) { - LOG(WARNING) << "CUPTI Callback failed to enable for domain, skipping teardown"; + LOG(WARNING) + << "CUPTI Callback failed to enable for domain, skipping teardown"; return; } // Force Flush before finalize CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); - LOG(INFO) << " CUPTI subscriber before finalize:" << cbapi_->getCuptiSubscriber(); + LOG(INFO) << " CUPTI subscriber before finalize:" + << cbapi_->getCuptiSubscriber(); teardownCupti_ = 1; std::unique_lock lck(finalizeMutex_); - finalizeCond_.wait(lck, [&]{return teardownCupti_ == 0;}); + finalizeCond_.wait(lck, [&] { return teardownCupti_ == 0; }); lck.unlock(); LOG(INFO) << "teardownCupti complete"; diff --git a/libkineto/src/CuptiActivityApi.h b/libkineto/src/CuptiActivityApi.h index 2e771374d..e9ee6d56c 100644 --- a/libkineto/src/CuptiActivityApi.h +++ b/libkineto/src/CuptiActivityApi.h @@ -28,7 +28,6 @@ #include "CuptiCallbackApi.h" #endif - namespace KINETO_NAMESPACE { using namespace libkineto; @@ -39,10 +38,7 @@ using CUpti_Activity = void; class CuptiActivityApi { public: - enum CorrelationFlowType { - Default, - User - }; + enum CorrelationFlowType { Default, User }; // Control Variables shared with CuptiCallbackApi for teardown std::atomic teardownCupti_{0}; std::mutex finalizeMutex_; @@ -59,10 +55,9 @@ class CuptiActivityApi { static void pushCorrelationID(int id, CorrelationFlowType type); static void popCorrelationID(CorrelationFlowType type); - void enableCuptiActivities( - const std::set& selected_activities); + void enableCuptiActivities(const std::set& selected_activities); void disableCuptiActivities( - const std::set& selected_activities); + const std::set& selected_activities); void clearActivities(); void teardownContext(); @@ -97,8 +92,10 @@ class CuptiActivityApi { uint8_t* buf, size_t validSize, std::function handler); - static void CUPTIAPI - bufferRequestedTrampoline(uint8_t** buffer, size_t* size, size_t* maxNumRecords); + static void CUPTIAPI bufferRequestedTrampoline( + uint8_t** buffer, + size_t* size, + size_t* maxNumRecords); static void CUPTIAPI bufferCompletedTrampoline( CUcontext ctx, uint32_t streamId, diff --git a/libkineto/src/CuptiActivityBuffer.h b/libkineto/src/CuptiActivityBuffer.h index 771d3323f..901bb2b6d 100644 --- a/libkineto/src/CuptiActivityBuffer.h +++ b/libkineto/src/CuptiActivityBuffer.h @@ -8,12 +8,12 @@ #pragma once -#include #include +#include +#include #include #include #include -#include #include #include "ITraceActivity.h" @@ -44,7 +44,6 @@ class CuptiActivityBuffer { } private: - std::vector buf_; size_t size_; diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 2735c602c..8e68ac6a2 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -7,20 +7,20 @@ */ #include "CuptiActivityProfiler.h" -#include "ApproximateClock.h" #include #include #include #include #include 
#include +#include #include #include #include #include #include #include -#include +#include "ApproximateClock.h" #ifdef HAS_CUPTI #include @@ -32,8 +32,8 @@ #include "DeviceUtil.h" #include "time_since_epoch.h" #ifdef HAS_CUPTI -#include "CuptiActivity.h" #include "CuptiActivity.cpp" +#include "CuptiActivity.h" #include "CuptiActivityApi.h" #endif // HAS_CUPTI #ifdef HAS_ROCTRACER @@ -41,8 +41,8 @@ #include "RoctracerActivityApi.h" #include "RoctracerLogger.h" #endif -#include "output_base.h" #include "ActivityBuffers.h" +#include "output_base.h" #include "Logger.h" #include "ThreadUtil.h" @@ -59,13 +59,11 @@ struct CtxEventPair { } }; -template<> +template <> struct std::hash { std::size_t operator()(const CtxEventPair& c) const { return KINETO_NAMESPACE::detail::hash_combine( - std::hash()(c.ctx), - std::hash()(c.eventId) - ); + std::hash()(c.ctx), std::hash()(c.eventId)); } }; @@ -90,17 +88,16 @@ std::unordered_map& ctxToDeviceId() { return ctxToDeviceId_; } -} +} // namespace namespace KINETO_NAMESPACE { -// Sets the timestamp converter. If nothing is set then the converter just returns the -// input. For this reason, until we add profiler impl of passing in TSC converter we just -// need to guard the callback itself +// Sets the timestamp converter. If nothing is set then the converter just +// returns the input. For this reason, until we add profiler impl of passing in +// TSC converter we just need to guard the callback itself std::function& get_time_converter() { - static std::function _time_converter = [](approx_time_t t) { - return t; - }; + static std::function _time_converter = + [](approx_time_t t) { return t; }; return _time_converter; } #ifdef HAS_ROCTRACER @@ -110,12 +107,13 @@ timestamp_t getTimeOffset() { t0 = libkineto::getApproximateTime(); clock_gettime(CLOCK_MONOTONIC, &t1); t00 = libkineto::getApproximateTime(); - + // Confvert to ns (if necessary) t0 = libkineto::get_time_converter()(t0); t00 = libkineto::get_time_converter()(t00); - - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC domain (in ns). + + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC + // domain (in ns). 
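  // Editorial note, not part of the original patch: (t0 >> 1) + (t00 >> 1) is
  // an overflow-safe midpoint of the two approximate-clock reads bracketing
  // the clock_gettime() call (roughly (t0 + t00) / 2), so the expression below
  // is the signed offset of the approximate-time clock relative to
  // CLOCK_MONOTONIC, in nanoseconds.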
return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); } #endif @@ -193,10 +191,13 @@ bool ConfigDerivedState::isCollectionDone( return false; } -std::ostream& operator<<(std::ostream& oss, const CuptiActivityProfiler::ErrorCounts& ecs) { +std::ostream& operator<<( + std::ostream& oss, + const CuptiActivityProfiler::ErrorCounts& ecs) { oss << "Out-of-range = " << ecs.out_of_range_events << ", Blocklisted runtime = " << ecs.blocklisted_runtime_events - << ", Invalid ext correlations = " << ecs.invalid_external_correlation_events + << ", Invalid ext correlations = " + << ecs.invalid_external_correlation_events << ", CPU GPU out-of-order = " << ecs.gpu_and_cpu_op_out_of_order #if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) << ", Unexpected CUDA events = " << ecs.unexepected_cuda_events @@ -258,18 +259,15 @@ void CuptiActivityProfiler::logGpuVersions() { << "; Runtime: " << cudaRuntimeVersion << "; Driver: " << cudaDriverVersion; - LOGGER_OBSERVER_ADD_METADATA( - "cupti_version", std::to_string(cuptiVersion)); + LOGGER_OBSERVER_ADD_METADATA("cupti_version", std::to_string(cuptiVersion)); LOGGER_OBSERVER_ADD_METADATA( "cuda_runtime_version", std::to_string(cudaRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "cuda_driver_version", std::to_string(cudaDriverVersion)); - addVersionMetadata( - "cupti_version", std::to_string(cuptiVersion)); + addVersionMetadata("cupti_version", std::to_string(cuptiVersion)); addVersionMetadata( "cuda_runtime_version", std::to_string(cudaRuntimeVersion)); - addVersionMetadata( - "cuda_driver_version", std::to_string(cudaDriverVersion)); + addVersionMetadata("cuda_driver_version", std::to_string(cudaDriverVersion)); #elif defined(HAS_ROCTRACER) uint32_t majorVersion = roctracer_version_major(); @@ -283,18 +281,14 @@ void CuptiActivityProfiler::logGpuVersions() { << "; Runtime: " << hipRuntimeVersion << "; Driver: " << hipDriverVersion; - LOGGER_OBSERVER_ADD_METADATA( - "roctracer_version", roctracerVersion); + LOGGER_OBSERVER_ADD_METADATA("roctracer_version", roctracerVersion); LOGGER_OBSERVER_ADD_METADATA( "hip_runtime_version", std::to_string(hipRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "hip_driver_version", std::to_string(hipDriverVersion)); - addVersionMetadata( - "roctracer_version", roctracerVersion); - addVersionMetadata( - "hip_runtime_version", std::to_string(hipRuntimeVersion)); - addVersionMetadata( - "hip_driver_version", std::to_string(hipDriverVersion)); + addVersionMetadata("roctracer_version", roctracerVersion); + addVersionMetadata("hip_runtime_version", std::to_string(hipRuntimeVersion)); + addVersionMetadata("hip_driver_version", std::to_string(hipDriverVersion)); #endif } @@ -348,7 +342,7 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { LOGGER_OBSERVER_ADD_METADATA( "ResourceOverhead", std::to_string(resourceOverheadCount_)); } - if (!gpuActivityPresent()){ + if (!gpuActivityPresent()) { LOG(WARNING) << "GPU trace is empty!"; } } @@ -359,23 +353,36 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { timestamp_t offset = getTimeOffset(); cupti_.setTimeOffset(offset); const int count = cupti_.processActivities( - std::bind(&CuptiActivityProfiler::handleRoctracerActivity, this, std::placeholders::_1, &logger), - std::bind(&CuptiActivityProfiler::handleCorrelationActivity, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3)); + std::bind( + &CuptiActivityProfiler::handleRoctracerActivity, + this, + std::placeholders::_1, + &logger), + std::bind( + 
&CuptiActivityProfiler::handleCorrelationActivity, + this, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3)); LOG(INFO) << "Processed " << count << " GPU records"; LOGGER_OBSERVER_ADD_EVENT_COUNT(count); } #endif // HAS_ROCTRACER if (!traceNonEmpty()) { - LOG(WARNING) << "No Valid Trace Events (CPU/GPU) found. Outputting empty trace."; + LOG(WARNING) + << "No Valid Trace Events (CPU/GPU) found. Outputting empty trace."; } for (const auto& session : sessions_) { LOG(INFO) << "Processing child profiler trace"; // cpuActivity() function here is used to get the linked cpuActivity for - // session's activities. Passing captureWindowStartTime_ and captureWindowEndTime_ - // in order to specify the range of activities that need to be processed. - session->processTrace(logger, - std::bind(&CuptiActivityProfiler::cpuActivity, this, std::placeholders::_1), + // session's activities. Passing captureWindowStartTime_ and + // captureWindowEndTime_ in order to specify the range of activities that + // need to be processed. + session->processTrace( + logger, + std::bind( + &CuptiActivityProfiler::cpuActivity, this, std::placeholders::_1), captureWindowStartTime_, captureWindowEndTime_); } @@ -415,7 +422,8 @@ void CuptiActivityProfiler::processCpuTrace( const std::unique_ptr>::value, "handleActivity is unsafe and relies on the caller to maintain not " "only lifetime but also address stability."); - if (act->type() == ActivityType::USER_ANNOTATION && act->duration()<=0){ + if (act->type() == ActivityType::USER_ANNOTATION && + act->duration() <= 0) { act->endTime = captureWindowEndTime_; } logger.handleActivity(*act); @@ -445,7 +453,9 @@ inline void CuptiActivityProfiler::handleCorrelationActivity( #endif // HAS_CUPTI #ifdef HAS_ROCTRACER inline void CuptiActivityProfiler::handleCorrelationActivity( - uint64_t correlationId, uint64_t externalId, RoctracerLogger::CorrelationDomain externalKind) { + uint64_t correlationId, + uint64_t externalId, + RoctracerLogger::CorrelationDomain externalKind) { if (externalKind == RoctracerLogger::CorrelationDomain::Domain0) { cpuCorrelationMap_[correlationId] = externalId; } else if (externalKind == RoctracerLogger::CorrelationDomain::Domain1) { @@ -530,7 +540,8 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) { if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 || - // Support cudaEventRecord and cudaEventSynchronize, revisit if others are needed + // Support cudaEventRecord and cudaEventSynchronize, revisit if others are + // needed cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) { @@ -614,9 +625,7 @@ void CuptiActivityProfiler::handleOverheadActivity( setGpuActivityPresent(true); } - -std::optional getWaitEventInfo( - uint32_t ctx, uint32_t eventId) { +std::optional getWaitEventInfo(uint32_t ctx, uint32_t eventId) { auto key = CtxEventPair{ctx, eventId}; auto it = waitEventMap().find(key); if (it != waitEventMap().end()) { @@ -635,7 +644,8 @@ void CuptiActivityProfiler::handleCudaEventActivity( // Update the stream, corrID the cudaEvent was last recorded on auto key = CtxEventPair{activity->contextId, activity->eventId}; - waitEventMap()[key] = WaitEventInfo{activity->streamId, activity->correlationId}; + waitEventMap()[key] = + WaitEventInfo{activity->streamId, 
activity->correlationId}; } void CuptiActivityProfiler::handleCudaSyncActivity( @@ -657,8 +667,8 @@ void CuptiActivityProfiler::handleCudaSyncActivity( int32_t src_stream = -1, src_corrid = -1; if (isEventSync(activity->type)) { - auto maybe_wait_event_info = getWaitEventInfo( - activity->contextId, activity->cudaEventId); + auto maybe_wait_event_info = + getWaitEventInfo(activity->contextId, activity->cudaEventId); if (maybe_wait_event_info) { src_stream = maybe_wait_event_info->stream; src_corrid = maybe_wait_event_info->correlationId; @@ -666,7 +676,7 @@ void CuptiActivityProfiler::handleCudaSyncActivity( } // Marshal the logging to a functor so we can defer it if needed. - auto log_event = [=](){ + auto log_event = [=]() { const ITraceActivity* linked = linkedActivity(activity->correlationId, cpuCorrelationMap_); const auto& cuda_sync_activity = traceBuffers_->addActivityWrapper( @@ -682,7 +692,7 @@ void CuptiActivityProfiler::handleCudaSyncActivity( recordDevice(device_id); } VLOG(2) << "Logging sync event device = " << device_id - << " stream = " << activity->streamId + << " stream = " << activity->streamId << " sync type = " << syncTypeString(activity->type); cuda_sync_activity.log(*logger); setGpuActivityPresent(true); @@ -704,8 +714,9 @@ void CuptiActivityProfiler::handleCudaSyncActivity( void CuptiActivityProfiler::logDeferredEvents() { // Stream Wait Events tend to be noisy, only pass these events if - // there was some GPU kernel/memcopy/memset observed on it in the trace window. - for (const auto& entry: logQueue_) { + // there was some GPU kernel/memcopy/memset observed on it in the trace + // window. + for (const auto& entry : logQueue_) { if (seenDeviceStreams_.find({entry.device, entry.stream}) == seenDeviceStreams_.end()) { VLOG(2) << "Skipping Event Sync as no kernels have run yet on stream = " @@ -758,11 +769,12 @@ void CuptiActivityProfiler::checkTimestampOrder(const ITraceActivity* act1) { std::swap(act1, act2); } if (act1->timestamp() > act2->timestamp()) { - LOG_FIRST_N(WARNING, 10) << "GPU op timestamp (" << act2->timestamp() - << ") < runtime timestamp (" << act1->timestamp() << ") by " - << act1->timestamp() - act2->timestamp() << "us" - << " Name: " << act2->name() << " Device: " << act2->deviceId() - << " Stream: " << act2->resourceId(); + LOG_FIRST_N(WARNING, 10) + << "GPU op timestamp (" << act2->timestamp() + << ") < runtime timestamp (" << act1->timestamp() << ") by " + << act1->timestamp() - act2->timestamp() << "us" + << " Name: " << act2->name() << " Device: " << act2->deviceId() + << " Stream: " << act2->resourceId(); ecs_.gpu_and_cpu_op_out_of_order++; } } @@ -780,7 +792,6 @@ const ITraceActivity* CuptiActivityProfiler::linkedActivity( return nullptr; } - inline void CuptiActivityProfiler::handleGpuActivity( const ITraceActivity& act, ActivityLogger* logger) { @@ -852,7 +863,8 @@ void CuptiActivityProfiler::handleCuptiActivity( break; case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION: handleCudaSyncActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), + logger); break; case CUPTI_ACTIVITY_KIND_CUDA_EVENT: handleCudaEventActivity( @@ -911,8 +923,7 @@ void CuptiActivityProfiler::handleRuntimeActivity( inline void CuptiActivityProfiler::handleGpuActivity( const roctracerAsyncRow* act, ActivityLogger* logger) { - const ITraceActivity* linked = - linkedActivity(act->id, cpuCorrelationMap_); + const ITraceActivity* linked = linkedActivity(act->id, cpuCorrelationMap_); const auto& gpu_activity = traceBuffers_->addActivityWrapper(GpuActivity(act, 
linked)); handleGpuActivity(gpu_activity, logger); @@ -951,9 +962,10 @@ void CuptiActivityProfiler::handleRoctracerActivity( } #endif // HAS_ROCTRACER -const ITraceActivity* CuptiActivityProfiler::cpuActivity(int32_t correlationId) { - const auto& it2 = activityMap_.find(correlationId); - return (it2 != activityMap_.end()) ? it2->second : nullptr; +const ITraceActivity* CuptiActivityProfiler::cpuActivity( + int32_t correlationId) { + const auto& it2 = activityMap_.find(correlationId); + return (it2 != activityMap_.end()) ? it2->second : nullptr; } void CuptiActivityProfiler::configureChildProfilers() { @@ -964,7 +976,7 @@ void CuptiActivityProfiler::configureChildProfilers() { .count(); for (auto& profiler : profilers_) { LOG(INFO) << "[Profiler = " << profiler->name() << "] " - << "Evaluating whether to run child profiler." ; + << "Evaluating whether to run child profiler."; auto session = profiler->configure( start_time_ms, derivedConfig_->profileDuration().count(), @@ -1038,7 +1050,8 @@ void CuptiActivityProfiler::configure( // presumably because structures are allocated and initialized, callbacks // are activated etc. After a while the overhead decreases and stabilizes. // It's therefore useful to perform some warmup before starting recording. - LOG(INFO) << "Enabling GPU tracing with max CUPTI buffer size " << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; + LOG(INFO) << "Enabling GPU tracing with max CUPTI buffer size " + << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; cupti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize()); time_point timestamp; if (VLOG_IS_ON(1)) { @@ -1046,20 +1059,17 @@ void CuptiActivityProfiler::configure( } #ifdef HAS_CUPTI #ifdef _WIN32 - CUPTI_CALL( - cuptiActivityRegisterTimestampCallback([]() -> uint64_t { - auto system = std::chrono::time_point_cast( - std::chrono::system_clock::now()); - return system.time_since_epoch().count(); - })); + CUPTI_CALL(cuptiActivityRegisterTimestampCallback([]() -> uint64_t { + auto system = std::chrono::time_point_cast( + std::chrono::system_clock::now()); + return system.time_since_epoch().count(); + })); #else #if CUDA_VERSION >= 11060 use_cupti_tsc() = config_->getTSCTimestampFlag(); - if (use_cupti_tsc()){ - CUPTI_CALL( - cuptiActivityRegisterTimestampCallback([]() -> uint64_t { - return getApproximateTime(); - })); + if (use_cupti_tsc()) { + CUPTI_CALL(cuptiActivityRegisterTimestampCallback( + []() -> uint64_t { return getApproximateTime(); })); } #endif // CUDA_VERSION >= 11060 #endif // _WIN32 @@ -1081,11 +1091,11 @@ void CuptiActivityProfiler::configure( if (libkineto::api().client()) { libkineto::api().client()->prepare( - config_->isReportInputShapesEnabled(), - config_->isProfileMemoryEnabled(), - config_->isWithStackEnabled(), - config_->isWithFlopsEnabled(), - config_->isWithModulesEnabled()); + config_->isReportInputShapesEnabled(), + config_->isProfileMemoryEnabled(), + config_->isWithStackEnabled(), + config_->isWithFlopsEnabled(), + config_->isWithModulesEnabled()); } if (derivedConfig_->isProfilingByIteration()) { @@ -1110,7 +1120,7 @@ void CuptiActivityProfiler::configure( currentRunloopState_ = RunloopState::Warmup; } -void CuptiActivityProfiler::toggleCollectionDynamic(const bool enable){ +void CuptiActivityProfiler::toggleCollectionDynamic(const bool enable) { #ifdef HAS_CUPTI if (enable) { cupti_.enableCuptiActivities(derivedConfig_->profileActivityTypes()); @@ -1211,7 +1221,9 @@ const time_point CuptiActivityProfiler::performRunLoopStep( std::lock_guard 
guard(mutex_); stopTraceInternal(now); resetInternal(); - LOG(ERROR) << "State: Warmup stopped by CUPTI. (Buffer size configured is " << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; + LOG(ERROR) + << "State: Warmup stopped by CUPTI. (Buffer size configured is " + << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; UST_LOGGER_MARK_COMPLETED(kWarmUpStage); VLOG(0) << "Warmup -> WaitForRequest"; break; @@ -1251,10 +1263,11 @@ const time_point CuptiActivityProfiler::performRunLoopStep( #if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || cupti_.stopCollection #endif // HAS_CUPTI || HAS_ROCTRACER - ){ + ) { // Update runloop state first to prevent further updates to shared state LOG(INFO) << "Tracing complete."; - VLOG_IF(1, currentIter > 0) << "This state change was invoked by application's step() call"; + VLOG_IF(1, currentIter > 0) + << "This state change was invoked by application's step() call"; if (libkineto::api().client()) { libkineto::api().client()->stop(); @@ -1263,7 +1276,9 @@ const time_point CuptiActivityProfiler::performRunLoopStep( #if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) if (cupti_.stopCollection) { ecs_.cupti_stopped_early = cupti_.stopCollection; - LOG(ERROR) << "State: CollectTrace stopped by CUPTI. (Buffer size configured is " << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; + LOG(ERROR) + << "State: CollectTrace stopped by CUPTI. (Buffer size configured is " + << config_->activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB)"; } #endif // HAS_CUPTI || HAS_ROCTRACER @@ -1273,8 +1288,9 @@ const time_point CuptiActivityProfiler::performRunLoopStep( UST_LOGGER_MARK_COMPLETED(kCollectionStage); } else if (derivedConfig_->isProfilingByIteration()) { // nothing to do here - } else if (now < derivedConfig_->profileEndTime() && - derivedConfig_->profileEndTime() < nextWakeupTime) { + } else if ( + now < derivedConfig_->profileEndTime() && + derivedConfig_->profileEndTime() < nextWakeupTime) { new_wakeup_time = derivedConfig_->profileEndTime(); } @@ -1302,7 +1318,9 @@ const time_point CuptiActivityProfiler::performRunLoopStep( return new_wakeup_time; } -void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) { +void CuptiActivityProfiler::finalizeTrace( + const Config& config, + ActivityLogger& logger) { LOG(INFO) << "Traces Recorded:"; { for (const auto& it : iterationCountMap_) { @@ -1326,7 +1344,10 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& // of the trace timelines. 
for (int gpu = 0; gpu <= kMaxGpuID; gpu++) { logger.handleDeviceInfo( - {gpu, gpu + kExceedMaxPid, process_name, fmt::format("GPU {}", gpu)}, + {gpu, + gpu + kExceedMaxPid, + process_name, + fmt::format("GPU {}", gpu)}, captureWindowStartTime_); } } @@ -1338,14 +1359,14 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger.handleResourceInfo(resource, captureWindowStartTime_); } - for (auto &session : sessions_){ + for (auto& session : sessions_) { auto device_info = session->getDeviceInfo(); - if (device_info != nullptr){ + if (device_info != nullptr) { logger.handleDeviceInfo(*device_info, captureWindowStartTime_); } auto resource_infos = session->getResourceInfos(); - for (auto resource_info : resource_infos){ + for (auto resource_info : resource_infos) { logger.handleResourceInfo(resource_info, captureWindowStartTime_); } } @@ -1362,14 +1383,14 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& #ifdef HAS_CUPTI // Overhead info overheadInfo_.push_back(ActivityLogger::OverheadInfo("CUPTI Overhead")); - for(const auto& info : overheadInfo_) { + for (const auto& info : overheadInfo_) { logger.handleOverheadInfo(info, captureWindowStartTime_); } #endif // HAS_CUPTI gpuUserEventMap_.logEvents(&logger); - for (auto& session : sessions_){ + for (auto& session : sessions_) { auto trace_buffer = session->getTraceBuffer(); if (trace_buffer) { // Set child start time to profiling start time if not set @@ -1383,8 +1404,8 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& // Logger Metadata contains a map of LOGs collected in Kineto // logger_level -> List of log lines // This will be added into the trace as metadata. - std::unordered_map> - loggerMD = getLoggerMetadata(); + std::unordered_map> loggerMD = + getLoggerMetadata(); logger.finalizeTrace( config, std::move(traceBuffers_), captureWindowEndTime_, loggerMD); } @@ -1405,12 +1426,12 @@ CuptiActivityProfiler::getLoggerMetadata() { void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { #ifdef HAS_CUPTI - CuptiActivityApi::pushCorrelationID(id, - CuptiActivityApi::CorrelationFlowType::Default); + CuptiActivityApi::pushCorrelationID( + id, CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - RoctracerActivityApi::pushCorrelationID(id, - RoctracerActivityApi::CorrelationFlowType::Default); + RoctracerActivityApi::pushCorrelationID( + id, RoctracerActivityApi::CorrelationFlowType::Default); #endif for (auto& session : sessions_) { session->pushCorrelationId(id); @@ -1420,11 +1441,11 @@ void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { void CuptiActivityProfiler::popCorrelationId() { #ifdef HAS_CUPTI CuptiActivityApi::popCorrelationID( - CuptiActivityApi::CorrelationFlowType::Default); + CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER RoctracerActivityApi::popCorrelationID( - RoctracerActivityApi::CorrelationFlowType::Default); + RoctracerActivityApi::CorrelationFlowType::Default); #endif for (auto& session : sessions_) { session->popCorrelationId(); @@ -1433,12 +1454,12 @@ void CuptiActivityProfiler::popCorrelationId() { void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { #ifdef HAS_CUPTI - CuptiActivityApi::pushCorrelationID(id, - CuptiActivityApi::CorrelationFlowType::User); + CuptiActivityApi::pushCorrelationID( + id, CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - 
RoctracerActivityApi::pushCorrelationID(id, - RoctracerActivityApi::CorrelationFlowType::User); + RoctracerActivityApi::pushCorrelationID( + id, RoctracerActivityApi::CorrelationFlowType::User); #endif for (auto& session : sessions_) { session->pushUserCorrelationId(id); @@ -1448,11 +1469,11 @@ void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { void CuptiActivityProfiler::popUserCorrelationId() { #ifdef HAS_CUPTI CuptiActivityApi::popCorrelationID( - CuptiActivityApi::CorrelationFlowType::User); + CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER RoctracerActivityApi::popCorrelationID( - RoctracerActivityApi::CorrelationFlowType::User); + RoctracerActivityApi::CorrelationFlowType::User); #endif for (auto& session : sessions_) { session->popUserCorrelationId(); @@ -1484,5 +1505,4 @@ void CuptiActivityProfiler::resetTraceData() { #endif // !USE_GOOGLE_LOG } - } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 0669be2d9..01ca5cb2e 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -10,8 +10,8 @@ #include #include -#include #include +#include #include #include #include @@ -35,13 +35,13 @@ #include "RoctracerLogger.h" #endif // HAS_ROCTRACER +#include "GenericTraceActivity.h" +#include "IActivityProfiler.h" +#include "LoggerCollector.h" #include "ThreadUtil.h" #include "TraceSpan.h" #include "libkineto.h" #include "output_base.h" -#include "GenericTraceActivity.h" -#include "IActivityProfiler.h" -#include "LoggerCollector.h" namespace KINETO_NAMESPACE { @@ -57,7 +57,7 @@ struct ConfigDerivedState final { // Calculate if starting is valid. bool canStart( - const std::chrono::time_point& now) const; + const std::chrono::time_point& now) const; // TODO: consider using union since only 1 arg is used. 
bool isWarmupDone( @@ -73,24 +73,29 @@ struct ConfigDerivedState final { return profileActivityTypes_; } - const std::chrono::time_point - profileStartTime() const { + const std::chrono::time_point profileStartTime() + const { return profileStartTime_; } - const std::chrono::time_point - profileEndTime() const { + const std::chrono::time_point profileEndTime() + const { return profileEndTime_; } - const std::chrono::milliseconds - profileDuration() const { + const std::chrono::milliseconds profileDuration() const { return profileDuration_; } - int64_t profileStartIteration() const { return profileStartIter_; } - int64_t profileEndIteration() const { return profileEndIter_; } - bool isProfilingByIteration() const { return profilingByIter_; } + int64_t profileStartIteration() const { + return profileStartIter_; + } + int64_t profileEndIteration() const { + return profileEndIter_; + } + bool isProfilingByIteration() const { + return profilingByIter_; + } private: std::set profileActivityTypes_; @@ -105,9 +110,9 @@ struct ConfigDerivedState final { }; namespace detail { - inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); - } +inline size_t hash_combine(size_t seed, size_t value) { + return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); +} } // namespace detail class CuptiActivityProfiler { @@ -136,19 +141,19 @@ class CuptiActivityProfiler { logger_ = logger; } - inline void setCpuActivityPresent(bool val){ + inline void setCpuActivityPresent(bool val) { cpuActivityPresent_ = val; } - inline void setGpuActivityPresent(bool val){ + inline void setGpuActivityPresent(bool val) { gpuActivityPresent_ = val; - } + } - inline bool gpuActivityPresent(){ + inline bool gpuActivityPresent() { return gpuActivityPresent_; } - inline bool traceNonEmpty(){ + inline bool traceNonEmpty() { return cpuActivityPresent_ || gpuActivityPresent_; } @@ -159,7 +164,8 @@ class CuptiActivityProfiler { startTraceInternal(now); } - void stopTrace(const std::chrono::time_point& now) { + void stopTrace( + const std::chrono::time_point& now) { std::lock_guard guard(mutex_); stopTraceInternal(now); } @@ -179,13 +185,12 @@ class CuptiActivityProfiler { void configure( const Config& config, const std::chrono::time_point& now); - + // Toggle GPU tracing during a profile instance void toggleCollectionDynamic(const bool enable); // Registered with client API to pass CPU trace events over - void transferCpuTrace( - std::unique_ptr cpuTrace); + void transferCpuTrace(std::unique_ptr cpuTrace); const Config& config() { return *config_; @@ -201,7 +206,8 @@ class CuptiActivityProfiler { recordThreadInfo(sysTid, tid, pid); } - // T107508020: We can deprecate the recordThreadInfo(void) once we optimized profiler_kineto + // T107508020: We can deprecate the recordThreadInfo(void) once we optimized + // profiler_kineto void recordThreadInfo(int32_t sysTid, int32_t tid, int32_t pid) { if (resourceInfo_.find({pid, tid}) == resourceInfo_.end()) { resourceInfo_.emplace( @@ -224,8 +230,7 @@ class CuptiActivityProfiler { versionMetadata_[key] = value; } - void addChildActivityProfiler( - std::unique_ptr profiler) { + void addChildActivityProfiler(std::unique_ptr profiler) { std::lock_guard guard(mutex_); profilers_.push_back(std::move(profiler)); } @@ -239,7 +244,6 @@ class CuptiActivityProfiler { void popUserCorrelationId(); protected: - using CpuGpuSpanPair = std::pair; static const CpuGpuSpanPair& defaultTraceSpan(); @@ -259,10 +263,11 @@ class 
CuptiActivityProfiler { // Insert a user defined event which maps to the gpu trace activity. // If the user defined event mapping already exists this will update the // gpu side span to include the span of gpuTraceActivity. - void insertOrExtendEvent(const ITraceActivity& cpuTraceActivity, - const ITraceActivity& gpuTraceActivity); + void insertOrExtendEvent( + const ITraceActivity& cpuTraceActivity, + const ITraceActivity& gpuTraceActivity); // Log out the events to the logger - void logEvents(ActivityLogger *logger); + void logEvents(ActivityLogger* logger); void clear() { streamSpanMap_.clear(); @@ -286,8 +291,7 @@ class CuptiActivityProfiler { // external events, e.g.operator activities from PyTorch. std::unordered_map cpuCorrelationMap_; // CUDA runtime <-> GPU Activity - std::unordered_map - correlatedCudaActivities_; + std::unordered_map correlatedCudaActivities_; std::unordered_map userCorrelationMap_; // data structure to collect cuptiActivityFlushAll() latency overhead @@ -325,10 +329,9 @@ class CuptiActivityProfiler { inline void recordStream(int device, int id, const char* postfix) { if (!hasDeviceResource(device, id)) { resourceInfo_.emplace( - std::make_pair(device, id), - ResourceInfo( - device, id, id, fmt::format( - "stream {} {}", id, postfix))); + std::make_pair(device, id), + ResourceInfo( + device, id, id, fmt::format("stream {} {}", id, postfix))); } } @@ -337,9 +340,8 @@ class CuptiActivityProfiler { constexpr int id = -1; if (!hasDeviceResource(device, id)) { resourceInfo_.emplace( - std::make_pair(device, id), - ResourceInfo( - device, id, id, fmt::format("Device {}", device))); + std::make_pair(device, id), + ResourceInfo(device, id, id, fmt::format("Device {}", device))); } } @@ -361,24 +363,29 @@ class CuptiActivityProfiler { const ITraceActivity* cpuActivity(int32_t correlationId); void updateGpuNetSpan(const ITraceActivity& gpuOp); bool outOfRange(const ITraceActivity& act); - void handleGpuActivity(const ITraceActivity& act, - ActivityLogger* logger); + void handleGpuActivity(const ITraceActivity& act, ActivityLogger* logger); #ifdef HAS_CUPTI // Process generic CUPTI activity - void handleCuptiActivity(const CUpti_Activity* record, ActivityLogger* logger); + void handleCuptiActivity( + const CUpti_Activity* record, + ActivityLogger* logger); // Process specific GPU activity types void handleCorrelationActivity( const CUpti_ActivityExternalCorrelation* correlation); void handleRuntimeActivity( - const CUpti_ActivityAPI* activity, ActivityLogger* logger); + const CUpti_ActivityAPI* activity, + ActivityLogger* logger); void handleDriverActivity( - const CUpti_ActivityAPI* activity, ActivityLogger* logger); + const CUpti_ActivityAPI* activity, + ActivityLogger* logger); void handleOverheadActivity( - const CUpti_ActivityOverhead* activity, ActivityLogger* logger); + const CUpti_ActivityOverhead* activity, + ActivityLogger* logger); void handleCudaEventActivity(const CUpti_ActivityCudaEvent* activity); void handleCudaSyncActivity( - const CUpti_ActivitySynchronization* activity, ActivityLogger* logger); + const CUpti_ActivitySynchronization* activity, + ActivityLogger* logger); template void handleGpuActivity(const T* act, ActivityLogger* logger); void logDeferredEvents(); @@ -387,20 +394,18 @@ class CuptiActivityProfiler { #ifdef HAS_ROCTRACER // Process generic RocTracer activity void handleRoctracerActivity( - const roctracerBase* record, - ActivityLogger* logger); + const roctracerBase* record, + ActivityLogger* logger); void handleCorrelationActivity( - 
uint64_t correlationId, - uint64_t externalId, - RoctracerLogger::CorrelationDomain externalKind); + uint64_t correlationId, + uint64_t externalId, + RoctracerLogger::CorrelationDomain externalKind); // Process specific GPU activity types template - void handleRuntimeActivity( - const T* activity, - ActivityLogger* logger); + void handleRuntimeActivity(const T* activity, ActivityLogger* logger); void handleGpuActivity( - const roctracerAsyncRow* record, - ActivityLogger* logger); + const roctracerAsyncRow* record, + ActivityLogger* logger); #endif // HAS_ROCTRACER void resetTraceData(); @@ -419,7 +424,8 @@ class CuptiActivityProfiler { void checkTimestampOrder(const ITraceActivity* act1); // On-demand Request Config (should not be modified) - // TODO: remove this config_, dependency needs to be removed from finalizeTrace. + // TODO: remove this config_, dependency needs to be removed from + // finalizeTrace. std::unique_ptr config_; // Resolved details about the config and states are stored here. @@ -430,7 +436,7 @@ class CuptiActivityProfiler { // Calls to CUPTI is encapsulated behind this interface #ifdef HAS_ROCTRACER - RoctracerActivityApi& cupti_; // Design failure here + RoctracerActivityApi& cupti_; // Design failure here #else CuptiActivityApi& cupti_; #endif @@ -455,9 +461,7 @@ class CuptiActivityProfiler { // Cache thread names and system thread ids for pthread ids, // and stream ids for GPU streams - std::map< - std::pair, - ResourceInfo> resourceInfo_; + std::map, ResourceInfo> resourceInfo_; std::vector overheadInfo_; @@ -483,8 +487,8 @@ class CuptiActivityProfiler { std::atomic currentRunloopState_{RunloopState::WaitForRequest}; // Keep track of the start time and end time for the trace collected. - // External threads using startTrace need to manually stopTrace. Part of the mock tests. - // All CUDA events before this time will be removed + // External threads using startTrace need to manually stopTrace. Part of the + // mock tests. All CUDA events before this time will be removed int64_t captureWindowStartTime_{0}; // Similarly, all CUDA API events after the last net event will be removed int64_t captureWindowEndTime_{0}; @@ -501,12 +505,10 @@ class CuptiActivityProfiler { }; struct DevStreamHash { - std::size_t operator()(const DevStream& c) const { - return detail::hash_combine( - std::hash()(c.ctx), - std::hash()(c.stream) - ); - } + std::size_t operator()(const DevStream& c) const { + return detail::hash_combine( + std::hash()(c.ctx), std::hash()(c.stream)); + } }; struct ErrorCounts { diff --git a/libkineto/src/CuptiCallbackApi.cpp b/libkineto/src/CuptiCallbackApi.cpp index b6d4fc69f..9c8e0f442 100644 --- a/libkineto/src/CuptiCallbackApi.cpp +++ b/libkineto/src/CuptiCallbackApi.cpp @@ -12,14 +12,13 @@ #include "CuptiActivityApi.h" #include -#include #include +#include #include #include "DeviceUtil.h" #include "Logger.h" - namespace KINETO_NAMESPACE { // limit on number of handles per callback type @@ -44,7 +43,6 @@ constexpr uint32_t MAX_CUPTI_CALLBACK_ID_ALL = 0xffffffff; * See type declrartions in header file. */ - /* callback_switchboard : is the global callback handler we register * with CUPTI. The goal is to make it as efficient as possible * to re-direct to the registered callback(s). 
@@ -59,24 +57,22 @@ static void CUPTIAPI callback_switchboard( #else static void callback_switchboard( #endif - void* /* unused */, - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { + void* /* unused */, + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo) { // below statement is likey going to call a mutex // on the singleton access - CuptiCallbackApi::singleton()->__callback_switchboard( - domain, cbid, cbInfo); + CuptiCallbackApi::singleton()->__callback_switchboard(domain, cbid, cbInfo); } - void CuptiCallbackApi::__callback_switchboard( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData* cbInfo) { + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo) { LOG(INFO) << "Callback: domain = " << domain << ", cbid = " << cbid; - CallbackList *cblist = nullptr; + CallbackList* cblist = nullptr; switch (domain) { // add the fastest path for kernel launch callbacks @@ -84,19 +80,22 @@ void CuptiCallbackApi::__callback_switchboard( case CUPTI_CB_DOMAIN_RUNTIME_API: switch (cbid) { case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000: - cblist = &callbacks_.runtime[ - CUDA_LAUNCH_KERNEL - __RUNTIME_CB_DOMAIN_START]; + cblist = + &callbacks_ + .runtime[CUDA_LAUNCH_KERNEL - __RUNTIME_CB_DOMAIN_START]; break; #if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060: - cblist = &callbacks_.runtime[ - CUDA_LAUNCH_KERNEL_EXC - __RUNTIME_CB_DOMAIN_START]; + cblist = + &callbacks_ + .runtime[CUDA_LAUNCH_KERNEL_EXC - __RUNTIME_CB_DOMAIN_START]; break; #endif default: break; } - // This is required to teardown cupti after profiling to prevent QPS slowdown. + // This is required to teardown cupti after profiling to prevent QPS + // slowdown. 
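      // Editorial note, not part of the original patch: this is the receiving
      // end of the teardown handshake started in
      // CuptiActivityApi::teardownContext(); once teardownCupti_ is set there,
      // the next runtime-API exit callback performs the finalize below, which
      // is presumably what releases the finalizeCond_ wait in
      // teardownContext().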
if (CuptiActivityApi::singleton().teardownCupti_) { if (cbInfo->callbackSite == CUPTI_API_EXIT) { LOG(INFO) << " Calling cuptiFinalize in exit callsite"; @@ -115,12 +114,13 @@ void CuptiCallbackApi::__callback_switchboard( case CUPTI_CB_DOMAIN_RESOURCE: switch (cbid) { case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: - cblist = &callbacks_.resource[ - RESOURCE_CONTEXT_CREATED - __RESOURCE_CB_DOMAIN_START]; + cblist = &callbacks_.resource + [RESOURCE_CONTEXT_CREATED - __RESOURCE_CB_DOMAIN_START]; break; case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: - cblist = &callbacks_.resource[ - RESOURCE_CONTEXT_DESTROYED - __RESOURCE_CB_DOMAIN_START]; + cblist = + &callbacks_.resource + [RESOURCE_CONTEXT_DESTROYED - __RESOURCE_CB_DOMAIN_START]; break; default: break; @@ -144,8 +144,8 @@ void CuptiCallbackApi::__callback_switchboard( ReaderLockGuard rl(callbackLock_); int i = 0; for (auto it = cblist->begin(); - it != cblist->end() && i < MAX_CB_FNS_PER_CB; - it++, i++) { + it != cblist->end() && i < MAX_CB_FNS_PER_CB; + it++, i++) { callbacks[i] = *it; } num_cbs = i; @@ -158,22 +158,19 @@ void CuptiCallbackApi::__callback_switchboard( } std::shared_ptr CuptiCallbackApi::singleton() { - static const std::shared_ptr - instance = [] { - std::shared_ptr inst = - std::shared_ptr(new CuptiCallbackApi()); - return inst; - }(); + static const std::shared_ptr instance = [] { + std::shared_ptr inst = + std::shared_ptr(new CuptiCallbackApi()); + return inst; + }(); return instance; } void CuptiCallbackApi::initCallbackApi() { #ifdef HAS_CUPTI lastCuptiStatus_ = CUPTI_ERROR_UNKNOWN; - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiSubscribe(&subscriber_, - (CUpti_CallbackFunc)callback_switchboard, - nullptr)); + lastCuptiStatus_ = CUPTI_CALL_NOWARN(cuptiSubscribe( + &subscriber_, (CUpti_CallbackFunc)callback_switchboard, nullptr)); // TODO: Remove temporarily to work around static initialization order issue // betweent this and GLOG. 
@@ -186,11 +183,11 @@ void CuptiCallbackApi::initCallbackApi() { } CuptiCallbackApi::CallbackList* CuptiCallbackApi::CallbackTable::lookup( - CUpti_CallbackDomain domain, CuptiCallBackID cbid) { + CUpti_CallbackDomain domain, + CuptiCallBackID cbid) { size_t idx; switch (domain) { - case CUPTI_CB_DOMAIN_RESOURCE: assert(cbid >= __RESOURCE_CB_DOMAIN_START); assert(cbid < __RESOURCE_CB_DOMAIN_END); @@ -245,8 +242,8 @@ bool CuptiCallbackApi::deleteCallback( CuptiCallbackFn cbfn) { CallbackList* cblist = callbacks_.lookup(domain, cbid); if (!cblist) { - LOG(WARNING) << "Attempting to remove unsupported callback -- domain = " << domain - << " callback id = " << cbid; + LOG(WARNING) << "Attempting to remove unsupported callback -- domain = " + << domain << " callback id = " << cbid; return false; } @@ -267,11 +264,12 @@ bool CuptiCallbackApi::deleteCallback( } bool CuptiCallbackApi::enableCallback( - CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid) { #ifdef HAS_CUPTI if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableCallback(1, subscriber_, domain, cbid)); + lastCuptiStatus_ = + CUPTI_CALL_NOWARN(cuptiEnableCallback(1, subscriber_, domain, cbid)); enabledCallbacks_.insert({domain, cbid}); return (lastCuptiStatus_ == CUPTI_SUCCESS); } @@ -280,24 +278,24 @@ bool CuptiCallbackApi::enableCallback( } bool CuptiCallbackApi::disableCallback( - CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid) { #ifdef HAS_CUPTI enabledCallbacks_.erase({domain, cbid}); if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableCallback(0, subscriber_, domain, cbid)); + lastCuptiStatus_ = + CUPTI_CALL_NOWARN(cuptiEnableCallback(0, subscriber_, domain, cbid)); return (lastCuptiStatus_ == CUPTI_SUCCESS); } #endif return false; } -bool CuptiCallbackApi::enableCallbackDomain( - CUpti_CallbackDomain domain) { +bool CuptiCallbackApi::enableCallbackDomain(CUpti_CallbackDomain domain) { #ifdef HAS_CUPTI if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableDomain(1, subscriber_, domain)); + lastCuptiStatus_ = + CUPTI_CALL_NOWARN(cuptiEnableDomain(1, subscriber_, domain)); enabledCallbacks_.insert({domain, MAX_CUPTI_CALLBACK_ID_ALL}); return (lastCuptiStatus_ == CUPTI_SUCCESS); } @@ -305,13 +303,12 @@ bool CuptiCallbackApi::enableCallbackDomain( return false; } -bool CuptiCallbackApi::disableCallbackDomain( - CUpti_CallbackDomain domain) { +bool CuptiCallbackApi::disableCallbackDomain(CUpti_CallbackDomain domain) { #ifdef HAS_CUPTI enabledCallbacks_.erase({domain, MAX_CUPTI_CALLBACK_ID_ALL}); if (initSuccess_) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableDomain(0, subscriber_, domain)); + lastCuptiStatus_ = + CUPTI_CALL_NOWARN(cuptiEnableDomain(0, subscriber_, domain)); return (lastCuptiStatus_ == CUPTI_SUCCESS); } #endif @@ -323,8 +320,8 @@ bool CuptiCallbackApi::reenableCallbacks() { if (initSuccess_) { for (auto& cbpair : enabledCallbacks_) { if ((uint32_t)cbpair.second == MAX_CUPTI_CALLBACK_ID_ALL) { - lastCuptiStatus_ = CUPTI_CALL_NOWARN( - cuptiEnableDomain(1, subscriber_, cbpair.first)); + lastCuptiStatus_ = + CUPTI_CALL_NOWARN(cuptiEnableDomain(1, subscriber_, cbpair.first)); } else { lastCuptiStatus_ = CUPTI_CALL_NOWARN( cuptiEnableCallback(1, subscriber_, cbpair.first, cbpair.second)); diff --git a/libkineto/src/CuptiCallbackApi.h b/libkineto/src/CuptiCallbackApi.h index ba7503c76..85a8f114c 100644 --- a/libkineto/src/CuptiCallbackApi.h 
+++ b/libkineto/src/CuptiCallbackApi.h @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude @@ -27,8 +27,6 @@ namespace KINETO_NAMESPACE { using namespace libkineto; - - /* CuptiCallbackApi : Provides an abstraction over CUPTI callback * interface. This enables various callback functions to be registered * with this class. The class registers a global callback handler that @@ -38,24 +36,21 @@ using namespace libkineto; * in order to speed up the implementation for fast path. */ -using CuptiCallbackFn = void(*)( +using CuptiCallbackFn = void (*)( CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* cbInfo); - class CuptiCallbackApi { - public: - /* Global list of supported callback ids * use the class namespace to avoid confusing with CUPTI enums*/ enum CuptiCallBackID { - CUDA_LAUNCH_KERNEL = 0, + CUDA_LAUNCH_KERNEL = 0, // can possibly support more callback ids per domain // __RUNTIME_CB_DOMAIN_START = CUDA_LAUNCH_KERNEL, - CUDA_LAUNCH_KERNEL_EXC, // Used in H100 + CUDA_LAUNCH_KERNEL_EXC, // Used in H100 // Callbacks under Resource CB domain RESOURCE_CONTEXT_CREATED, @@ -90,15 +85,15 @@ class CuptiCallbackApi { #endif bool registerCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn cbfn); + CUpti_CallbackDomain domain, + CuptiCallBackID cbid, + CuptiCallbackFn cbfn); // returns false if callback was not found bool deleteCallback( - CUpti_CallbackDomain domain, - CuptiCallBackID cbid, - CuptiCallbackFn cbfn); + CUpti_CallbackDomain domain, + CuptiCallBackID cbid, + CuptiCallbackFn cbfn); // Cupti Callback may be enable for domain and cbid pairs, or domains alone. bool enableCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid); @@ -109,7 +104,6 @@ class CuptiCallbackApi { // to re-enabled all previously running callback subscriptions. bool reenableCallbacks(); - // Please do not use this method. This has to be exposed as public // so it is accessible from the callback handler void __callback_switchboard( @@ -118,18 +112,17 @@ class CuptiCallbackApi { const CUpti_CallbackData* cbInfo); private: - friend class std::shared_ptr; // For callback table design overview see the .cpp file using CallbackList = std::list; // level 2 tables sizes are known at compile time - constexpr static size_t RUNTIME_CB_DOMAIN_SIZE - = (__RUNTIME_CB_DOMAIN_END - __RUNTIME_CB_DOMAIN_START); + constexpr static size_t RUNTIME_CB_DOMAIN_SIZE = + (__RUNTIME_CB_DOMAIN_END - __RUNTIME_CB_DOMAIN_START); - constexpr static size_t RESOURCE_CB_DOMAIN_SIZE - = (__RESOURCE_CB_DOMAIN_END - __RESOURCE_CB_DOMAIN_START); + constexpr static size_t RESOURCE_CB_DOMAIN_SIZE = + (__RESOURCE_CB_DOMAIN_END - __RESOURCE_CB_DOMAIN_START); // level 1 table is a struct struct CallbackTable { @@ -141,12 +134,11 @@ class CuptiCallbackApi { CallbackTable callbacks_; bool initSuccess_ = false; - // Record a list of enabled callbacks, so that after teardown, we can re-enable - // the callbacks that were turned off to clean cupti context. - // As an implementation detail, cbid == 0xffffffff means enable the domain. + // Record a list of enabled callbacks, so that after teardown, we can + // re-enable the callbacks that were turned off to clean cupti context. As an + // implementation detail, cbid == 0xffffffff means enable the domain. 
std::set> enabledCallbacks_; - // Reader Writer lock types using ReaderWriterLock = std::shared_timed_mutex; using ReaderLockGuard = std::shared_lock; @@ -154,7 +146,7 @@ class CuptiCallbackApi { ReaderWriterLock callbackLock_; #ifdef HAS_CUPTI CUptiResult lastCuptiStatus_; - CUpti_SubscriberHandle subscriber_ {nullptr}; + CUpti_SubscriberHandle subscriber_{nullptr}; #endif }; diff --git a/libkineto/src/CuptiEventApi.cpp b/libkineto/src/CuptiEventApi.cpp index f7d84f754..43da0441b 100644 --- a/libkineto/src/CuptiEventApi.cpp +++ b/libkineto/src/CuptiEventApi.cpp @@ -17,8 +17,7 @@ using std::vector; namespace KINETO_NAMESPACE { -CuptiEventApi::CuptiEventApi(CUcontext context) - : context_(context) { +CuptiEventApi::CuptiEventApi(CUcontext context) : context_(context) { CUPTI_CALL(cuptiGetDeviceId(context_, (uint32_t*)&device_)); } diff --git a/libkineto/src/CuptiNvPerfMetric.cpp b/libkineto/src/CuptiNvPerfMetric.cpp index 9a2900a06..b620b5ad7 100644 --- a/libkineto/src/CuptiNvPerfMetric.cpp +++ b/libkineto/src/CuptiNvPerfMetric.cpp @@ -8,7 +8,8 @@ #ifdef HAS_CUPTI #include -#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && CUDART_VERSION > 10000 +#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && \ + CUDART_VERSION > 10000 #include #include #include @@ -17,9 +18,9 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ScopeExit.h" #include "CuptiNvPerfMetric.h" #include "Logger.h" +#include "ScopeExit.h" namespace KINETO_NAMESPACE { @@ -28,7 +29,6 @@ namespace KINETO_NAMESPACE { // to NVIDIA PerfWorks APIs. namespace nvperf { - // Largely based on NVIDIA sample code provided with CUDA release // files Metric.cpp and Eval.cpp @@ -36,7 +36,6 @@ namespace nvperf { // Metric and Counter Data Configuration // ------------------------------------------------- - // Note: Be carful before modifying the code below. There is a specific // sequence one needs to follow to program the metrics else things may // stop working. We tried to keep the flow consistent with the example @@ -46,7 +45,8 @@ namespace nvperf { // Only supported on CUDA RT Version between 10.0 and 11.04. // After CUDA RT 11.04, the structure has changed. 
// TODO update the structure NVPA_RawMetricsConfig to support 11.04 -#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && CUDART_VERSION > 10000 +#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && \ + CUDART_VERSION > 10000 bool getRawMetricRequests( NVPA_MetricsContext* metricsContext, @@ -59,15 +59,21 @@ bool getRawMetricRequests( bool keepInstances = true; for (const auto& metricName : metricNames) { - NVPW_MetricsContext_GetMetricProperties_Begin_Params getMetricPropertiesBeginParams = { - NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0}; + NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + 0, + 0}; getMetricPropertiesBeginParams.pMetricsContext = metricsContext; getMetricPropertiesBeginParams.pMetricName = metricName.c_str(); - if (!NVPW_CALL( - NVPW_MetricsContext_GetMetricProperties_Begin( + if (!NVPW_CALL(NVPW_MetricsContext_GetMetricProperties_Begin( &getMetricPropertiesBeginParams))) { return false; } @@ -81,7 +87,9 @@ bool getRawMetricRequests( NVPW_MetricsContext_GetMetricProperties_End_Params getMetricPropertiesEndParams = { - NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE, nullptr, nullptr}; + NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE, + nullptr, + nullptr}; getMetricPropertiesEndParams.pMetricsContext = metricsContext; if (!NVPW_CALL(NVPW_MetricsContext_GetMetricProperties_End( @@ -91,13 +99,14 @@ bool getRawMetricRequests( } for (const auto& rawMetricName : rawMetricsDeps) { - NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE, nullptr, nullptr, false, false}; + NVPA_RawMetricRequest metricRequest = { + NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE, nullptr, nullptr, false, false}; metricRequest.pMetricName = rawMetricName.c_str(); metricRequest.isolated = isolated; metricRequest.keepInstances = keepInstances; rawMetricRequests.push_back(metricRequest); VLOG(1) << "Adding raw metric struct : raw metric = " << rawMetricName - << " isolated = " << isolated << " keepinst = " << keepInstances; + << " isolated = " << isolated << " keepinst = " << keepInstances; } if (rawMetricRequests.size() == 0) { @@ -113,13 +122,15 @@ bool getProfilerConfigImage( const std::vector& metricNames, std::vector& configImage, const uint8_t* counterAvailabilityImage) { - NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { - NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr, nullptr, nullptr}; + NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, + nullptr, + nullptr, + nullptr}; metricsContextCreateParams.pChipName = chipName.c_str(); if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { return false; } @@ -141,36 +152,39 @@ bool getProfilerConfigImage( std::vector rawMetricDeps; if (!getRawMetricRequests( - metricsContextCreateParams.pMetricsContext, - metricNames, - rawMetricDeps, - rawMetricRequests)) { + metricsContextCreateParams.pMetricsContext, + metricNames, + rawMetricDeps, + rawMetricRequests)) { return false; } // Starting CUDA 11.4 the metric config create call and struct has changed #if CUDART_VERSION < 11040 - NVPA_RawMetricsConfigOptions metricsConfigOptions = { - NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE, nullptr}; + NVPA_RawMetricsConfigOptions metricsConfigOptions = { + NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE, 
nullptr}; #else - NVPW_CUDA_RawMetricsConfig_Create_Params metricsConfigOptions = { - NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr, NVPA_ACTIVITY_KIND_INVALID, nullptr, nullptr}; + NVPW_CUDA_RawMetricsConfig_Create_Params metricsConfigOptions = { + NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, + nullptr, + NVPA_ACTIVITY_KIND_INVALID, + nullptr, + nullptr}; #endif // CUDART_VERSION < 11040 - metricsConfigOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER; - metricsConfigOptions.pChipName = chipName.c_str(); + metricsConfigOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER; + metricsConfigOptions.pChipName = chipName.c_str(); - NVPA_RawMetricsConfig* rawMetricsConfig; + NVPA_RawMetricsConfig* rawMetricsConfig; #if CUDART_VERSION < 11040 - if (!NVPW_CALL( - NVPA_RawMetricsConfig_Create( - &metricsConfigOptions, &rawMetricsConfig))) { - return false; - } + if (!NVPW_CALL(NVPA_RawMetricsConfig_Create( + &metricsConfigOptions, &rawMetricsConfig))) { + return false; + } #else - if (!NVPW_CALL(NVPW_CUDA_RawMetricsConfig_Create(&metricsConfigOptions))) { - return false; - } + if (!NVPW_CALL(NVPW_CUDA_RawMetricsConfig_Create(&metricsConfigOptions))) { + return false; + } rawMetricsConfig = metricsConfigOptions.pRawMetricsConfig; #endif // CUDART_VERSION < 11040 @@ -178,12 +192,14 @@ bool getProfilerConfigImage( if (counterAvailabilityImage) { NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = { - NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE, nullptr, nullptr, nullptr}; + NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE, + nullptr, + nullptr, + nullptr}; setCounterAvailabilityParams.pRawMetricsConfig = rawMetricsConfig; setCounterAvailabilityParams.pCounterAvailabilityImage = counterAvailabilityImage; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_SetCounterAvailability( + if (!NVPW_CALL(NVPW_RawMetricsConfig_SetCounterAvailability( &setCounterAvailabilityParams))) { return false; } @@ -199,21 +215,26 @@ bool getProfilerConfigImage( // Start a Raw Metric Pass group NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = { - NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE, nullptr, nullptr, 0}; + NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE, + nullptr, + nullptr, + 0}; beginPassGroupParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_BeginPassGroup(&beginPassGroupParams))) { + if (!NVPW_CALL(NVPW_RawMetricsConfig_BeginPassGroup(&beginPassGroupParams))) { return false; } // Add all raw metrics NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = { - NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, nullptr, nullptr, nullptr, 0}; + NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, + nullptr, + nullptr, + nullptr, + 0}; addMetricsParams.pRawMetricsConfig = rawMetricsConfig; addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); addMetricsParams.numMetricRequests = rawMetricRequests.size(); - if (!NVPW_CALL( - NVPW_RawMetricsConfig_AddMetrics(&addMetricsParams))) { + if (!NVPW_CALL(NVPW_RawMetricsConfig_AddMetrics(&addMetricsParams))) { return false; } @@ -221,28 +242,31 @@ bool getProfilerConfigImage( NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = { NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE, nullptr, nullptr}; endPassGroupParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_EndPassGroup(&endPassGroupParams))) { + if 
(!NVPW_CALL(NVPW_RawMetricsConfig_EndPassGroup(&endPassGroupParams))) { return false; } // Setup Config Image generation NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = { - NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, nullptr, nullptr, false}; + NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, + nullptr, + nullptr, + false}; generateConfigImageParams.pRawMetricsConfig = rawMetricsConfig; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GenerateConfigImage(&generateConfigImageParams))) { + if (!NVPW_CALL(NVPW_RawMetricsConfig_GenerateConfigImage( + &generateConfigImageParams))) { return false; } // Get the Config Image size... nearly there NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = { - NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, nullptr, nullptr}; + NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, + nullptr, + nullptr}; getConfigImageParams.pRawMetricsConfig = rawMetricsConfig; getConfigImageParams.bytesAllocated = 0; getConfigImageParams.pBuffer = nullptr; - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { + if (!NVPW_CALL(NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { return false; } @@ -251,8 +275,7 @@ bool getProfilerConfigImage( // Write the Config image binary getConfigImageParams.bytesAllocated = configImage.size(); getConfigImageParams.pBuffer = configImage.data(); - if (!NVPW_CALL( - NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { + if (!NVPW_CALL(NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { return false; } @@ -263,13 +286,12 @@ bool getCounterDataPrefixImage( const std::string& chipName, const std::vector& metricNames, std::vector& counterDataImagePrefix) { - NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr, nullptr}; metricsContextCreateParams.pChipName = chipName.c_str(); if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { return false; } @@ -278,7 +300,6 @@ bool getCounterDataPrefixImage( metricsContextDestroyParams.pMetricsContext = metricsContextCreateParams.pMetricsContext; - SCOPE_EXIT([&]() { NVPW_MetricsContext_Destroy( (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); @@ -292,19 +313,22 @@ bool getCounterDataPrefixImage( std::vector rawMetricDeps; if (!getRawMetricRequests( - metricsContextCreateParams.pMetricsContext, - metricNames, - rawMetricDeps, - rawMetricRequests)) { + metricsContextCreateParams.pMetricsContext, + metricNames, + rawMetricDeps, + rawMetricRequests)) { return false; } // Setup Counter Data builder NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { - NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, nullptr, nullptr, nullptr}; + NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, + nullptr, + nullptr, + nullptr}; counterDataBuilderCreateParams.pChipName = chipName.c_str(); if (!NVPW_CALL( - NVPW_CounterDataBuilder_Create(&counterDataBuilderCreateParams))) { + NVPW_CounterDataBuilder_Create(&counterDataBuilderCreateParams))) { return false; } @@ -319,26 +343,33 @@ bool getCounterDataPrefixImage( // Add metrics to counter data image prefix NVPW_CounterDataBuilder_AddMetrics_Params addMetricsParams = { - NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE, nullptr, nullptr, nullptr, 0}; + NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE, + nullptr, + 
nullptr, + nullptr, + 0}; addMetricsParams.pCounterDataBuilder = counterDataBuilderCreateParams.pCounterDataBuilder; addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); addMetricsParams.numMetricRequests = rawMetricRequests.size(); - if (!NVPW_CALL( - NVPW_CounterDataBuilder_AddMetrics(&addMetricsParams))) { + if (!NVPW_CALL(NVPW_CounterDataBuilder_AddMetrics(&addMetricsParams))) { return false; } // Get image prefix size NVPW_CounterDataBuilder_GetCounterDataPrefix_Params getCounterDataPrefixParams = { - NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE, nullptr, nullptr, 0, nullptr, 0}; + NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE, + nullptr, + nullptr, + 0, + nullptr, + 0}; getCounterDataPrefixParams.pCounterDataBuilder = counterDataBuilderCreateParams.pCounterDataBuilder; getCounterDataPrefixParams.bytesAllocated = 0; getCounterDataPrefixParams.pBuffer = nullptr; - if (!NVPW_CALL( - NVPW_CounterDataBuilder_GetCounterDataPrefix( + if (!NVPW_CALL(NVPW_CounterDataBuilder_GetCounterDataPrefix( &getCounterDataPrefixParams))) { return false; } @@ -348,8 +379,7 @@ bool getCounterDataPrefixImage( // Now write counter data image prefix getCounterDataPrefixParams.bytesAllocated = counterDataImagePrefix.size(); getCounterDataPrefixParams.pBuffer = counterDataImagePrefix.data(); - if (!NVPW_CALL( - NVPW_CounterDataBuilder_GetCounterDataPrefix( + if (!NVPW_CALL(NVPW_CounterDataBuilder_GetCounterDataPrefix( &getCounterDataPrefixParams))) { return false; } @@ -367,20 +397,21 @@ std::string getRangeDescription( std::vector descriptionPtrs; NVPW_Profiler_CounterData_GetRangeDescriptions_Params getRangeDescParams = { - NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE, nullptr}; + NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE, + nullptr}; getRangeDescParams.pCounterDataImage = counterDataImage.data(); getRangeDescParams.rangeIndex = rangeIndex; - if (!NVPW_CALL( - NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { + if (!NVPW_CALL(NVPW_Profiler_CounterData_GetRangeDescriptions( + &getRangeDescParams))) { return ""; } descriptionPtrs.resize(getRangeDescParams.numDescriptions); getRangeDescParams.ppDescriptions = descriptionPtrs.data(); - if (!NVPW_CALL( - NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { + if (!NVPW_CALL(NVPW_Profiler_CounterData_GetRangeDescriptions( + &getRangeDescParams))) { return ""; } @@ -400,7 +431,6 @@ CuptiProfilerResult evalMetricValues( const std::vector& counterDataImage, const std::vector& metricNames, bool verbose) { - if (!counterDataImage.size()) { LOG(ERROR) << "Counter Data Image is empty!"; return {}; @@ -410,7 +440,7 @@ CuptiProfilerResult evalMetricValues( NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; metricsContextCreateParams.pChipName = chipName.c_str(); if (!NVPW_CALL( - NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { return {}; } @@ -426,8 +456,7 @@ CuptiProfilerResult evalMetricValues( NVPW_CounterData_GetNumRanges_Params getNumRangesParams = { NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE, nullptr}; getNumRangesParams.pCounterDataImage = counterDataImage.data(); - if (!NVPW_CALL( - NVPW_CounterData_GetNumRanges(&getNumRangesParams))) { + if (!NVPW_CALL(NVPW_CounterData_GetNumRanges(&getNumRangesParams))) { return {}; } @@ -442,14 +471,12 @@ CuptiProfilerResult evalMetricValues( metricNamePtrs.push_back(metric.c_str()); } - 
CuptiProfilerResult result{ - .metricNames = metricNames}; + CuptiProfilerResult result{.metricNames = metricNames}; for (size_t rangeIndex = 0; rangeIndex < getNumRangesParams.numRanges; ++rangeIndex) { - - CuptiRangeMeasurement rangeData { - .rangeName = getRangeDescription(counterDataImage, rangeIndex)}; + CuptiRangeMeasurement rangeData{ + .rangeName = getRangeDescription(counterDataImage, rangeIndex)}; rangeData.values.resize(metricNames.size()); // First set Counter data image with current range @@ -464,7 +491,6 @@ CuptiProfilerResult evalMetricValues( NVPW_CALL(NVPW_MetricsContext_SetCounterData(&setCounterDataParams)); - // Now we can evaluate GPU metrics NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = { NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE, nullptr}; diff --git a/libkineto/src/CuptiNvPerfMetric.h b/libkineto/src/CuptiNvPerfMetric.h index 5f07e6a0a..333acb8c4 100644 --- a/libkineto/src/CuptiNvPerfMetric.h +++ b/libkineto/src/CuptiNvPerfMetric.h @@ -8,9 +8,9 @@ #pragma once +#include #include #include -#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude @@ -32,17 +32,15 @@ struct CuptiProfilerResult { /* Utilities for CUPTI and NVIDIA PerfWorks Metric API */ -#define NVPW_CALL(call) \ - [&]() -> bool { \ - NVPA_Status _status_ = call; \ - if (_status_ != NVPA_STATUS_SUCCESS) { \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error ({})", \ - #call, \ - (int)_status_); \ - return false; \ - } \ - return true; \ +#define NVPW_CALL(call) \ + [&]() -> bool { \ + NVPA_Status _status_ = call; \ + if (_status_ != NVPA_STATUS_SUCCESS) { \ + LOG(WARNING) << fmt::format( \ + "function {} failed with error ({})", #call, (int)_status_); \ + return false; \ + } \ + return true; \ }() // fixme - add a results string @@ -72,6 +70,5 @@ CuptiProfilerResult evalMetricValues( const std::vector& metricNames, bool verbose = false); - } // namespace nvperf } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiRangeProfiler.cpp b/libkineto/src/CuptiRangeProfiler.cpp index 4844c1275..782acfeb6 100644 --- a/libkineto/src/CuptiRangeProfiler.cpp +++ b/libkineto/src/CuptiRangeProfiler.cpp @@ -16,10 +16,10 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "output_base.h" #include "CuptiRangeProfiler.h" #include "CuptiRangeProfilerConfig.h" #include "Demangle.h" +#include "output_base.h" namespace KINETO_NAMESPACE { @@ -43,31 +43,28 @@ namespace { CuptiProfilerPrePostCallback cuptiProfilerPreRunCb; CuptiProfilerPrePostCallback cuptiProfilerPostRunCb; - /* Following are aliases to a set of CUPTI metrics that can be * used to derived measures like FLOPs etc. 
*/ std::unordered_map> kDerivedMetrics = { - {"kineto__cuda_core_flops", { - "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", - "smsp__sass_thread_inst_executed_op_dfma_pred_on.sum", - "smsp__sass_thread_inst_executed_op_dmul_pred_on.sum", - "smsp__sass_thread_inst_executed_op_hadd_pred_on.sum", - "smsp__sass_thread_inst_executed_op_hfma_pred_on.sum", - "smsp__sass_thread_inst_executed_op_hmul_pred_on.sum", - "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", - "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", - "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"}}, - {"kineto__tensor_core_insts", { - "sm__inst_executed_pipe_tensor.sum"}}, + {"kineto__cuda_core_flops", + {"smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", + "smsp__sass_thread_inst_executed_op_dfma_pred_on.sum", + "smsp__sass_thread_inst_executed_op_dmul_pred_on.sum", + "smsp__sass_thread_inst_executed_op_hadd_pred_on.sum", + "smsp__sass_thread_inst_executed_op_hfma_pred_on.sum", + "smsp__sass_thread_inst_executed_op_hmul_pred_on.sum", + "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", + "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", + "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"}}, + {"kineto__tensor_core_insts", {"sm__inst_executed_pipe_tensor.sum"}}, }; -} // namespace; - +} // namespace CuptiRangeProfilerSession::CuptiRangeProfilerSession( - const Config& config, ICuptiRBProfilerSessionFactory& factory) { - + const Config& config, + ICuptiRBProfilerSessionFactory& factory) { // CUPTI APIs can conflict with other monitoring systems like DCGM // or NSight / NVProf. The pre and post run hooks enable users to // potentially pause other tools like DCGM. @@ -78,7 +75,7 @@ CuptiRangeProfilerSession::CuptiRangeProfilerSession( } const CuptiRangeProfilerConfig& cupti_config = - CuptiRangeProfilerConfig::get(config); + CuptiRangeProfilerConfig::get(config); std::vector cupti_metrics; const auto& requested_metrics = cupti_config.activitiesCuptiMetrics(); @@ -101,8 +98,7 @@ CuptiRangeProfilerSession::CuptiRangeProfilerSession( replayType_ = CUPTI_KernelReplay; } - LOG(INFO) << "Configuring " << cupti_metrics.size() - << " CUPTI metrics"; + LOG(INFO) << "Configuring " << cupti_metrics.size() << " CUPTI metrics"; int max_ranges = cupti_config.cuptiProfilerMaxRanges(); for (const auto& m : cupti_metrics) { @@ -126,14 +122,14 @@ CuptiRangeProfilerSession::CuptiRangeProfilerSession( } void CuptiRangeProfilerSession::start() { - for (auto& profiler: profilers_) { + for (auto& profiler : profilers_) { // user range or auto range profiler->asyncStartAndEnable(rangeType_, replayType_); } } void CuptiRangeProfilerSession::stop() { - for (auto& profiler: profilers_) { + for (auto& profiler : profilers_) { profiler->disableAndStop(); } } @@ -169,10 +165,8 @@ void CuptiRangeProfilerSession::addRangeEvents( traceBuffer_.emplace_activity( traceBuffer_.span, kProfActivityType, - use_kernel_as_range ? - demangle(profiler->getKernelNames()[ridx]) : - measurement.rangeName - ); + use_kernel_as_range ? 
demangle(profiler->getKernelNames()[ridx]) + : measurement.rangeName); auto& event = activities.back(); event->startTime = startTime + interval * ridx; event->endTime = startTime + interval * (ridx + 1); @@ -193,7 +187,7 @@ void CuptiRangeProfilerSession::processTrace(ActivityLogger& logger) { } traceBuffer_.span = profilers_[0]->getProfilerTraceSpan(); - for (auto& profiler: profilers_) { + for (auto& profiler : profilers_) { bool verbose = VLOG_IS_ON(1); auto result = profiler->evaluateMetrics(verbose); @@ -240,19 +234,16 @@ std::vector CuptiRangeProfilerSession::getResourceInfos() { * Implement CuptiRangeProfiler * ---------------------------------------- */ -CuptiRangeProfiler::CuptiRangeProfiler() - : CuptiRangeProfiler(getFactory()) {} +CuptiRangeProfiler::CuptiRangeProfiler() : CuptiRangeProfiler(getFactory()) {} CuptiRangeProfiler::CuptiRangeProfiler(ICuptiRBProfilerSessionFactory& factory) - : factory_(factory) {} + : factory_(factory) {} -void CuptiRangeProfiler::setPreRunCallback( - CuptiProfilerPrePostCallback fn) { +void CuptiRangeProfiler::setPreRunCallback(CuptiProfilerPrePostCallback fn) { cuptiProfilerPreRunCb = fn; } -void CuptiRangeProfiler::setPostRunCallback( - CuptiProfilerPrePostCallback fn) { +void CuptiRangeProfiler::setPostRunCallback(CuptiProfilerPrePostCallback fn) { cuptiProfilerPostRunCb = fn; } @@ -260,8 +251,7 @@ const std::string& CuptiRangeProfiler::name() const { return kProfilerName; } -const std::set& CuptiRangeProfiler::availableActivities() - const { +const std::set& CuptiRangeProfiler::availableActivities() const { return kSupportedActivities; } @@ -273,24 +263,23 @@ std::unique_ptr CuptiRangeProfiler::configure( if (activity_types_.find(kProfActivityType) == activity_types_.end()) { return nullptr; } - bool has_gpu_event_types = ( - activity_types_.count(ActivityType::GPU_MEMCPY) + - activity_types_.count(ActivityType::GPU_MEMSET) + - activity_types_.count(ActivityType::CONCURRENT_KERNEL) - ) > 0; + bool has_gpu_event_types = + (activity_types_.count(ActivityType::GPU_MEMCPY) + + activity_types_.count(ActivityType::GPU_MEMSET) + + activity_types_.count(ActivityType::CONCURRENT_KERNEL)) > 0; if (has_gpu_event_types) { - LOG(WARNING) << kProfilerName << " cannot run in combination with" - << " other cuda activity profilers, please configure" - << " with cuda_profiler_range and optionally cpu_op/user_annotations"; + LOG(WARNING) + << kProfilerName << " cannot run in combination with" + << " other cuda activity profilers, please configure" + << " with cuda_profiler_range and optionally cpu_op/user_annotations"; return nullptr; } return std::make_unique(config, factory_); } -std::unique_ptr -CuptiRangeProfiler::configure( +std::unique_ptr CuptiRangeProfiler::configure( int64_t /*ts_ms*/, int64_t /*duration_ms*/, const std::set& activity_types, @@ -317,9 +306,8 @@ CuptiRangeProfilerInit::CuptiRangeProfilerInit() { } // Register the activity profiler instance with libkineto api - api().registerProfilerFactory([&]() { - return std::make_unique(); - }); + api().registerProfilerFactory( + [&]() { return std::make_unique(); }); } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiRangeProfiler.h b/libkineto/src/CuptiRangeProfiler.h index a31bea57e..ae23baf58 100644 --- a/libkineto/src/CuptiRangeProfiler.h +++ b/libkineto/src/CuptiRangeProfiler.h @@ -10,8 +10,8 @@ #include -#include #include +#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude @@ -68,11 +68,9 @@ class CuptiRangeProfilerSession : public 
IActivityProfilerSession { CUpti_ProfilerReplayMode replayType_ = CUPTI_UserReplay; CpuTraceBuffer traceBuffer_; - std::vector< - std::unique_ptr> profilers_; + std::vector> profilers_; }; - /* This is a wrapper class that refers to the underlying * CuptiRangeProfiler. Using a wrapper libkineto can manage the ownership * of this object independent of the CuptiRangeProfiler itself. @@ -108,6 +106,7 @@ class CuptiRangeProfiler : public libkineto::IActivityProfiler { // profiling sesssion. static void setPreRunCallback(CuptiProfilerPrePostCallback fn); static void setPostRunCallback(CuptiProfilerPrePostCallback fn); + private: ICuptiRBProfilerSessionFactory& factory_; }; diff --git a/libkineto/src/CuptiRangeProfilerApi.cpp b/libkineto/src/CuptiRangeProfilerApi.cpp index a7bf3817d..6f28ac02c 100644 --- a/libkineto/src/CuptiRangeProfilerApi.cpp +++ b/libkineto/src/CuptiRangeProfilerApi.cpp @@ -220,13 +220,13 @@ void enableKernelCallbacks() { // cudaLaunchKernelExC() used from H100 onwards. #if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) status &= cbapi->enableCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); #endif if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to " - << "enable cuda kernel launch callback."; + << "enable cuda kernel launch callback."; } LOG(INFO) << "CUPTI Profiler kernel callbacks enabled"; @@ -240,13 +240,13 @@ void disableKernelCallbacks() { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); #if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) status &= cbapi->disableCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); #endif if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to " - << "disable cuda kernel launch callback."; + << "disable cuda kernel launch callback."; return; } diff --git a/libkineto/src/CuptiRangeProfilerApi.h b/libkineto/src/CuptiRangeProfilerApi.h index ee552e794..2227a27eb 100644 --- a/libkineto/src/CuptiRangeProfilerApi.h +++ b/libkineto/src/CuptiRangeProfilerApi.h @@ -11,8 +11,10 @@ #ifdef HAS_CUPTI #include #include -// Using CUDA 11 and above due to usage of API: cuptiProfilerGetCounterAvailability. -#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && CUDART_VERSION >= 10000 && CUDA_VERSION >= 11000 +// Using CUDA 11 and above due to usage of API: +// cuptiProfilerGetCounterAvailability. 
+#if defined(USE_CUPTI_RANGE_PROFILER) && defined(CUDART_VERSION) && \ + CUDART_VERSION >= 10000 && CUDA_VERSION >= 11000 #define HAS_CUPTI_RANGE_PROFILER 1 #endif // CUDART_VERSION > 10.00 and CUDA_VERSION >= 11.00 #endif // HAS_CUPTI @@ -22,14 +24,12 @@ #include #include #else -enum CUpti_ProfilerRange -{ +enum CUpti_ProfilerRange { CUPTI_AutoRange, CUPTI_UserRange, }; -enum CUpti_ProfilerReplayMode -{ +enum CUpti_ProfilerReplayMode { CUPTI_KernelReplay, CUPTI_UserReplay, }; @@ -37,15 +37,15 @@ enum CUpti_ProfilerReplayMode #include #include +#include #include #include -#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "TraceSpan.h" #include "CuptiCallbackApi.h" #include "CuptiNvPerfMetric.h" +#include "TraceSpan.h" /* Cupti Range based profiler session * See : https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler @@ -68,7 +68,6 @@ struct CuptiRangeProfilerOptions { class CuptiRBProfilerSession { public: - explicit CuptiRBProfilerSession(const CuptiRangeProfilerOptions& opts); virtual ~CuptiRBProfilerSession(); @@ -127,7 +126,7 @@ class CuptiRBProfilerSession { evaluateMetrics(true); } - TraceSpan getProfilerTraceSpan(); + TraceSpan getProfilerTraceSpan(); virtual CuptiProfilerResult evaluateMetrics(bool verbose = false); @@ -169,11 +168,10 @@ class CuptiRBProfilerSession { CUpti_ProfilerRange curRange_ = CUPTI_AutoRange; CUpti_ProfilerReplayMode curReplay_ = CUPTI_KernelReplay; - std::chrono::time_point - profilerStartTs_, profilerStopTs_, profilerInitDoneTs_; + std::chrono::time_point profilerStartTs_, + profilerStopTs_, profilerInitDoneTs_; private: - bool createCounterDataImage(); // log kernel name that used with callbacks @@ -190,7 +188,6 @@ class CuptiRBProfilerSession { int numNestingLevels_; CUcontext cuContext_; - // data buffers for configuration and counter data collection std::vector counterDataImagePrefix; std::vector configImage; @@ -228,7 +225,6 @@ struct CuptiRBProfilerSessionFactory : ICuptiRBProfilerSessionFactory { const CuptiRangeProfilerOptions& opts) override; }; - // called directly only in unit tests namespace testing { diff --git a/libkineto/src/CuptiRangeProfilerConfig.cpp b/libkineto/src/CuptiRangeProfilerConfig.cpp index be897e5b3..0ff6f3417 100644 --- a/libkineto/src/CuptiRangeProfilerConfig.cpp +++ b/libkineto/src/CuptiRangeProfilerConfig.cpp @@ -15,16 +15,16 @@ #include #include - namespace KINETO_NAMESPACE { // number of ranges affect the size of counter data binary used by // the CUPTI Profiler. 
these defaults can be tuned constexpr int KMaxAutoRanges = 1500; // supports 1500 kernels -constexpr int KMaxUserRanges = 10; // enable upto 10 sub regions marked by user +constexpr int KMaxUserRanges = 10; // enable upto 10 sub regions marked by user constexpr char kCuptiProfilerMetricsKey[] = "CUPTI_PROFILER_METRICS"; -constexpr char kCuptiProfilerPerKernelKey[] = "CUPTI_PROFILER_ENABLE_PER_KERNEL"; +constexpr char kCuptiProfilerPerKernelKey[] = + "CUPTI_PROFILER_ENABLE_PER_KERNEL"; constexpr char kCuptiProfilerMaxRangesKey[] = "CUPTI_PROFILER_MAX_RANGES"; CuptiRangeProfilerConfig::CuptiRangeProfilerConfig(Config& cfg) @@ -32,7 +32,9 @@ CuptiRangeProfilerConfig::CuptiRangeProfilerConfig(Config& cfg) cuptiProfilerPerKernel_(false), cuptiProfilerMaxRanges_(0) {} -bool CuptiRangeProfilerConfig::handleOption(const std::string& name, std::string& val) { +bool CuptiRangeProfilerConfig::handleOption( + const std::string& name, + std::string& val) { VLOG(0) << " handling : " << name << " = " << val; // Cupti Range based Profiler configuration if (!name.compare(kCuptiProfilerMetricsKey)) { @@ -50,26 +52,28 @@ bool CuptiRangeProfilerConfig::handleOption(const std::string& name, std::string void CuptiRangeProfilerConfig::setDefaults() { if (activitiesCuptiMetrics_.size() > 0 && cuptiProfilerMaxRanges_ == 0) { cuptiProfilerMaxRanges_ = - cuptiProfilerPerKernel_ ? KMaxAutoRanges : KMaxUserRanges; + cuptiProfilerPerKernel_ ? KMaxAutoRanges : KMaxUserRanges; } } -void CuptiRangeProfilerConfig::printActivityProfilerConfig(std::ostream& s) const { +void CuptiRangeProfilerConfig::printActivityProfilerConfig( + std::ostream& s) const { if (activitiesCuptiMetrics_.size() > 0) { s << "Cupti Profiler metrics : " - << fmt::format("{}", fmt::join(activitiesCuptiMetrics_, ", ")) << std::endl; - s << "Cupti Profiler measure per kernel : " - << cuptiProfilerPerKernel_ << std::endl; + << fmt::format("{}", fmt::join(activitiesCuptiMetrics_, ", ")) + << std::endl; + s << "Cupti Profiler measure per kernel : " << cuptiProfilerPerKernel_ + << std::endl; s << "Cupti Profiler max ranges : " << cuptiProfilerMaxRanges_ << std::endl; } } -void CuptiRangeProfilerConfig::setActivityDependentConfig(){} +void CuptiRangeProfilerConfig::setActivityDependentConfig() {} void CuptiRangeProfilerConfig::registerFactory() { - Config::addConfigFactory( - kCuptiProfilerConfigName, - [](Config& cfg) { return new CuptiRangeProfilerConfig(cfg); }); + Config::addConfigFactory(kCuptiProfilerConfigName, [](Config& cfg) { + return new CuptiRangeProfilerConfig(cfg); + }); } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiRangeProfilerConfig.h b/libkineto/src/CuptiRangeProfilerConfig.h index b863895ae..690057f07 100644 --- a/libkineto/src/CuptiRangeProfilerConfig.h +++ b/libkineto/src/CuptiRangeProfilerConfig.h @@ -23,13 +23,12 @@ class CuptiRangeProfilerConfig : public AbstractConfig { public: bool handleOption(const std::string& name, std::string& val) override; - void validate( - const std::chrono::time_point& - fallbackProfileStartTime) override {} + void validate(const std::chrono::time_point& + fallbackProfileStartTime) override {} static CuptiRangeProfilerConfig& get(const Config& cfg) { - return dynamic_cast(cfg.feature( - kCuptiProfilerConfigName)); + return dynamic_cast( + cfg.feature(kCuptiProfilerConfigName)); } Config& parent() const { @@ -59,6 +58,7 @@ class CuptiRangeProfilerConfig : public AbstractConfig { void printActivityProfilerConfig(std::ostream& s) const override; void setActivityDependentConfig() override; 
static void registerFactory(); + protected: AbstractConfig* cloneDerived(AbstractConfig& parent) const override { CuptiRangeProfilerConfig* clone = new CuptiRangeProfilerConfig(*this); @@ -67,10 +67,10 @@ class CuptiRangeProfilerConfig : public AbstractConfig { } private: - CuptiRangeProfilerConfig() = delete; + CuptiRangeProfilerConfig() = delete; explicit CuptiRangeProfilerConfig(Config& parent); - explicit CuptiRangeProfilerConfig( - const CuptiRangeProfilerConfig& other) = default; + explicit CuptiRangeProfilerConfig(const CuptiRangeProfilerConfig& other) = + default; // some defaults will depend on other configuration void setDefaults(); diff --git a/libkineto/src/DaemonConfigLoader.cpp b/libkineto/src/DaemonConfigLoader.cpp index 726d561ee..6147e2db2 100644 --- a/libkineto/src/DaemonConfigLoader.cpp +++ b/libkineto/src/DaemonConfigLoader.cpp @@ -10,15 +10,15 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "Logger.h" -#include "ConfigLoader.h" #include "DaemonConfigLoader.h" +#include "ConfigLoader.h" +#include "Logger.h" namespace KINETO_NAMESPACE { // TODO : implications of this singleton being thread safe on forks? IpcFabricConfigClient* DaemonConfigLoader::getConfigClient() { - if (!configClient){ + if (!configClient) { configClient = std::make_unique(); } return configClient.get(); @@ -34,7 +34,9 @@ std::string DaemonConfigLoader::readBaseConfig() { return configClient->getLibkinetoBaseConfig(); } -std::string DaemonConfigLoader::readOnDemandConfig(bool events, bool activities) { +std::string DaemonConfigLoader::readOnDemandConfig( + bool events, + bool activities) { auto configClient = getConfigClient(); if (!configClient) { LOG_EVERY_N(WARNING, 10) << "Failed to read config: No dyno config client"; @@ -67,18 +69,17 @@ void DaemonConfigLoader::setCommunicationFabric(bool enabled) { if (!configClient) { LOG(WARNING) << "Failed to read config: No dyno config client"; // This is probably a temporary problem - return -1 to indicate error. 
- return ; + return; } return configClient->setIpcFabricEnabled(enabled); } void DaemonConfigLoader::registerFactory() { - ConfigLoader::setDaemonConfigLoaderFactory( - []() { - auto loader = std::make_unique(); - loader->setCommunicationFabric(true); - return loader; - }); + ConfigLoader::setDaemonConfigLoaderFactory([]() { + auto loader = std::make_unique(); + loader->setCommunicationFabric(true); + return loader; + }); } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/DaemonConfigLoader.h b/libkineto/src/DaemonConfigLoader.h index e30833007..e283dce00 100644 --- a/libkineto/src/DaemonConfigLoader.h +++ b/libkineto/src/DaemonConfigLoader.h @@ -59,10 +59,10 @@ class DaemonConfigLoader : public IDaemonConfigLoader { IpcFabricConfigClient* getConfigClient(); static void registerFactory(); -private: + + private: std::unique_ptr configClient; }; #endif // __linux__ - } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/DeviceProperties.cpp b/libkineto/src/DeviceProperties.cpp index 902bd8da7..2e2876dcb 100644 --- a/libkineto/src/DeviceProperties.cpp +++ b/libkineto/src/DeviceProperties.cpp @@ -13,8 +13,8 @@ #include #if defined(HAS_CUPTI) -#include #include +#include #elif defined(HAS_ROCTRACER) #include #endif @@ -68,19 +68,25 @@ static const std::vector& deviceProps() { } static const std::string createDevicePropertiesJson( - size_t id, const gpuDeviceProp& props) { + size_t id, + const gpuDeviceProp& props) { std::string gpuSpecific = ""; #if defined(HAS_CUPTI) - gpuSpecific = fmt::format(R"JSON( + gpuSpecific = fmt::format( + R"JSON( , "regsPerMultiprocessor": {}, "sharedMemPerBlockOptin": {}, "sharedMemPerMultiprocessor": {})JSON", - props.regsPerMultiprocessor, props.sharedMemPerBlockOptin, props.sharedMemPerMultiprocessor); + props.regsPerMultiprocessor, + props.sharedMemPerBlockOptin, + props.sharedMemPerMultiprocessor); #elif defined(HAS_ROCTRACER) - gpuSpecific = fmt::format(R"JSON( + gpuSpecific = fmt::format( + R"JSON( , "maxSharedMemoryPerMultiProcessor": {})JSON", - props.maxSharedMemoryPerMultiProcessor); + props.maxSharedMemoryPerMultiProcessor); #endif - return fmt::format(R"JSON( + return fmt::format( + R"JSON( {{ "id": {}, "name": "{}", "totalGlobalMem": {}, "computeMajor": {}, "computeMinor": {}, @@ -88,11 +94,17 @@ static const std::string createDevicePropertiesJson( "regsPerBlock": {}, "warpSize": {}, "sharedMemPerBlock": {}, "numSms": {}{} }})JSON", - id, props.name, props.totalGlobalMem, - props.major, props.minor, - props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor, - props.regsPerBlock, props.warpSize, - props.sharedMemPerBlock, props.multiProcessorCount, + id, + props.name, + props.totalGlobalMem, + props.major, + props.minor, + props.maxThreadsPerBlock, + props.maxThreadsPerMultiProcessor, + props.regsPerBlock, + props.warpSize, + props.sharedMemPerBlock, + props.multiProcessorCount, gpuSpecific); } @@ -111,9 +123,8 @@ const std::string& devicePropertiesJson() { } int smCount(uint32_t deviceId) { - const std::vector &props = deviceProps(); - return deviceId >= props.size() ? 0 : - props[deviceId].multiProcessorCount; + const std::vector& props = deviceProps(); + return deviceId >= props.size() ? 
0 : props[deviceId].multiProcessorCount; } #else const std::string& devicePropertiesJson() { @@ -129,13 +140,12 @@ int smCount(uint32_t deviceId) { #ifdef HAS_CUPTI float blocksPerSm(const CUpti_ActivityKernel4& kernel) { return (kernel.gridX * kernel.gridY * kernel.gridZ) / - (float) smCount(kernel.deviceId); + (float)smCount(kernel.deviceId); } float warpsPerSm(const CUpti_ActivityKernel4& kernel) { constexpr int threads_per_warp = 32; - return blocksPerSm(kernel) * - (kernel.blockX * kernel.blockY * kernel.blockZ) / + return blocksPerSm(kernel) * (kernel.blockX * kernel.blockY * kernel.blockZ) / threads_per_warp; } @@ -144,7 +154,7 @@ float kernelOccupancy(const CUpti_ActivityKernel4& kernel) { int sm_count = smCount(kernel.deviceId); if (sm_count) { blocks_per_sm = - (kernel.gridX * kernel.gridY * kernel.gridZ) / (float) sm_count; + (kernel.gridX * kernel.gridY * kernel.gridZ) / (float)sm_count; } return kernelOccupancy( kernel.deviceId, @@ -168,7 +178,7 @@ float kernelOccupancy( float blocksPerSm) { // Calculate occupancy float occupancy = -1.0; - const std::vector &props = deviceProps(); + const std::vector& props = deviceProps(); if (deviceId < props.size()) { cudaOccFuncAttributes occFuncAttr; occFuncAttr.maxThreadsPerBlock = INT_MAX; @@ -183,17 +193,21 @@ float kernelOccupancy( cudaOccResult occ_result; cudaOccDeviceProp prop(props[deviceId]); cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( - &occ_result, &prop, &occFuncAttr, &occDeviceState, - blockSize, dynamicSmemSize); + &occ_result, + &prop, + &occFuncAttr, + &occDeviceState, + blockSize, + dynamicSmemSize); if (status == CUDA_OCC_SUCCESS) { if (occ_result.activeBlocksPerMultiprocessor < blocksPerSm) { blocksPerSm = occ_result.activeBlocksPerMultiprocessor; } occupancy = blocksPerSm * blockSize / - (float) props[deviceId].maxThreadsPerMultiProcessor; + (float)props[deviceId].maxThreadsPerMultiProcessor; } else { - LOG_EVERY_N(ERROR, 1000) << "Failed to calculate occupancy, status = " - << status; + LOG_EVERY_N(ERROR, 1000) + << "Failed to calculate occupancy, status = " << status; } } return occupancy; diff --git a/libkineto/src/DeviceUtil.h b/libkineto/src/DeviceUtil.h index ffa03e6c9..c7ad2cda7 100644 --- a/libkineto/src/DeviceUtil.h +++ b/libkineto/src/DeviceUtil.h @@ -15,18 +15,18 @@ #include #include -#define CUDA_CALL(call) \ - [&]() -> cudaError_t { \ - cudaError_t _status_ = call; \ - if (_status_ != cudaSuccess) { \ - const char* _errstr_ = cudaGetErrorString(_status_); \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error {} ({})", \ - #call, \ - _errstr_, \ - (int)_status_); \ - } \ - return _status_; \ +#define CUDA_CALL(call) \ + [&]() -> cudaError_t { \ + cudaError_t _status_ = call; \ + if (_status_ != cudaSuccess) { \ + const char* _errstr_ = cudaGetErrorString(_status_); \ + LOG(WARNING) << fmt::format( \ + "function {} failed with error {} ({})", \ + #call, \ + _errstr_, \ + (int)_status_); \ + } \ + return _status_; \ }() #define CUPTI_CALL(call) \ @@ -48,17 +48,17 @@ #include #include -#define CUDA_CALL(call) \ - { \ - hipError_t _status_ = call; \ - if (_status_ != hipSuccess) { \ - const char* _errstr_ = hipGetErrorString(_status_); \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error {} ({})", \ - #call, \ - _errstr_, \ - (int)_status_); \ - } \ +#define CUDA_CALL(call) \ + { \ + hipError_t _status_ = call; \ + if (_status_ != hipSuccess) { \ + const char* _errstr_ = hipGetErrorString(_status_); \ + LOG(WARNING) << fmt::format( \ + "function {} failed with 
error {} ({})", \ + #call, \ + _errstr_, \ + (int)_status_); \ + } \ } #define CUPTI_CALL(call) call diff --git a/libkineto/src/EventProfiler.cpp b/libkineto/src/EventProfiler.cpp index f55ace6d6..30650dff1 100644 --- a/libkineto/src/EventProfiler.cpp +++ b/libkineto/src/EventProfiler.cpp @@ -242,9 +242,9 @@ void EventGroupSet::printDescription(ostream& s) const { // Find nearest factor of a number by linear search, // starting at hi and lo - hi searches up and lo searches down static int nearestFactor(int hi, int lo, int number) { - return number % hi == 0 - ? hi - : number % lo == 0 ? lo : nearestFactor(hi + 1, lo - 1, number); + return number % hi == 0 ? hi + : number % lo == 0 ? lo + : nearestFactor(hi + 1, lo - 1, number); } static int nearestFactor(int count, int max) { @@ -324,9 +324,8 @@ static unique_ptr alignAndValidateConfigs( Config* onDemand) { auto now = system_clock::now(); if (!onDemand || - now > - (onDemand->eventProfilerOnDemandStartTime() + - onDemand->eventProfilerOnDemandDuration())) { + now > (onDemand->eventProfilerOnDemandStartTime() + + onDemand->eventProfilerOnDemandDuration())) { base.validate(now); return base.clone(); } diff --git a/libkineto/src/EventProfilerController.cpp b/libkineto/src/EventProfilerController.cpp index 55bf83033..8d5dcb3eb 100644 --- a/libkineto/src/EventProfilerController.cpp +++ b/libkineto/src/EventProfilerController.cpp @@ -54,8 +54,7 @@ vector> makeLoggers(const Config& config) { return loggers; } -vector> makeOnDemandLoggers( - const Config& config) { +vector> makeOnDemandLoggers(const Config& config) { vector> loggers; for (const auto& factory : onDemandLoggerFactories()) { loggers.push_back(factory(config)); @@ -69,20 +68,18 @@ vector>& loggers(const Config& config) { return res; } -vector>& onDemandLoggers( - const Config& config) { +vector>& onDemandLoggers(const Config& config) { static auto res = makeOnDemandLoggers(config); return res; } -} // anon namespace +} // namespace // Keep an eye on profiling threads. // We've observed deadlocks in Cuda11 in libcuda / libcupti.. namespace detail { class HeartbeatMonitor { - public: ~HeartbeatMonitor() { stopMonitoring(); @@ -119,7 +116,7 @@ class HeartbeatMonitor { void monitorLoop() { std::unique_lock lock(mutex_); - while(!stopMonitor_) { + while (!stopMonitor_) { auto cv_status = condVar_.wait_for(lock, seconds(period_)); // Don't perform check on spurious wakeup or on notify if (cv_status == std::cv_status::timeout) { @@ -139,8 +136,8 @@ class HeartbeatMonitor { if (!monitorThread_) { VLOG(0) << "Starting monitoring thread"; stopMonitor_ = false; - monitorThread_ = std::make_unique( - &HeartbeatMonitor::monitorLoop, this); + monitorThread_ = + std::make_unique(&HeartbeatMonitor::monitorLoop, this); } } @@ -183,16 +180,17 @@ void reportLateSample( } void configureHeartbeatMonitor( - detail::HeartbeatMonitor& monitor, const Config& base, const Config* onDemand) { - seconds base_period = - base.eventProfilerHeartbeatMonitorPeriod(); - seconds on_demand_period = !onDemand ? seconds(0) : - onDemand->eventProfilerHeartbeatMonitorPeriod(); + detail::HeartbeatMonitor& monitor, + const Config& base, + const Config* onDemand) { + seconds base_period = base.eventProfilerHeartbeatMonitorPeriod(); + seconds on_demand_period = + !onDemand ? seconds(0) : onDemand->eventProfilerHeartbeatMonitorPeriod(); monitor.setPeriod( on_demand_period > seconds(0) ? 
on_demand_period : base_period); } -} // anon namespace +} // namespace void EventProfilerController::addLoggerFactory( std::function(const Config&)> factory) { @@ -210,10 +208,8 @@ EventProfilerController::EventProfilerController( detail::HeartbeatMonitor& heartbeatMonitor) : configLoader_(configLoader), heartbeatMonitor_(heartbeatMonitor) { auto cupti_events = std::make_unique(context); - auto cupti_metrics = - std::make_unique(cupti_events->device()); - configLoader_.addHandler( - ConfigLoader::ConfigKind::EventProfiler, this); + auto cupti_metrics = std::make_unique(cupti_events->device()); + configLoader_.addHandler(ConfigLoader::ConfigKind::EventProfiler, this); auto config = configLoader.getConfigCopy(); profiler_ = std::make_unique( std::move(cupti_events), @@ -230,8 +226,7 @@ EventProfilerController::~EventProfilerController() { stopRunloop_ = true; profilerThread_->join(); } - configLoader_.removeHandler( - ConfigLoader::ConfigKind::EventProfiler, this); + configLoader_.removeHandler(ConfigLoader::ConfigKind::EventProfiler, this); VLOG(0) << "Stopped event profiler"; } @@ -247,8 +242,8 @@ void EventProfilerController::start(CUcontext ctx, ConfigLoader& configLoader) { // before everything the controller accesses gets destroyed. // Hence access the profilerMap after initialization of the controller. started() = true; - auto controller = unique_ptr( - new EventProfilerController( + auto controller = + unique_ptr(new EventProfilerController( ctx, configLoader, detail::HeartbeatMonitor::instance())); profilerMap()[ctx] = std::move(controller); } @@ -303,8 +298,8 @@ void EventProfilerController::profilerLoop() { } if (!profiler_->setContinuousMode()) { - VLOG(0) << "Continuous mode not supported for GPU " - << profiler_->device() << ". Not starting Event Profiler."; + VLOG(0) << "Continuous mode not supported for GPU " << profiler_->device() + << ". Not starting Event Profiler."; return; } @@ -353,7 +348,7 @@ void EventProfilerController::profilerLoop() { profiler_->configure(*config, on_demand_config.get()); } catch (const std::exception& ex) { LOG(ERROR) << "Encountered error while configuring event profiler: " - << ex.what(); + << ex.what(); // Exit profiling entirely when encountering an error here // as it indicates a serious problem or bug. 
break; @@ -387,7 +382,7 @@ void EventProfilerController::profilerLoop() { } int sleep_time = duration_cast(now - start_sleep).count(); - if(stopRunloop_) + if (stopRunloop_) break; auto start_sample = now; diff --git a/libkineto/src/GenericTraceActivity.cpp b/libkineto/src/GenericTraceActivity.cpp index 6d3cc4a75..ede315351 100644 --- a/libkineto/src/GenericTraceActivity.cpp +++ b/libkineto/src/GenericTraceActivity.cpp @@ -10,7 +10,7 @@ #include "output_base.h" namespace libkineto { - void GenericTraceActivity::log(ActivityLogger& logger) const { - logger.handleGenericActivity(*this); - } +void GenericTraceActivity::log(ActivityLogger& logger) const { + logger.handleGenericActivity(*this); +} } // namespace libkineto diff --git a/libkineto/src/ILoggerObserver.cpp b/libkineto/src/ILoggerObserver.cpp index 0667b3b7a..a943367a5 100644 --- a/libkineto/src/ILoggerObserver.cpp +++ b/libkineto/src/ILoggerObserver.cpp @@ -12,34 +12,34 @@ #if !USE_GOOGLE_LOG -#include #include +#include namespace libkineto { struct LoggerTypeName { - constexpr LoggerTypeName(const char* n, LoggerOutputType t) : name(n), type(t) {} + constexpr LoggerTypeName(const char* n, LoggerOutputType t) + : name(n), type(t) {} const char* name; LoggerOutputType type; }; -static constexpr std::array LoggerMap{{ - {"VERBOSE", LoggerOutputType::VERBOSE}, - {"INFO", LoggerOutputType::INFO}, - {"WARNING", LoggerOutputType::WARNING}, - {"STAGE", LoggerOutputType::STAGE}, - {"ERROR", LoggerOutputType::ERROR}, - {"???", LoggerOutputType::ENUM_COUNT} -}}; +static constexpr std::array LoggerMap{ + {{"VERBOSE", LoggerOutputType::VERBOSE}, + {"INFO", LoggerOutputType::INFO}, + {"WARNING", LoggerOutputType::WARNING}, + {"STAGE", LoggerOutputType::STAGE}, + {"ERROR", LoggerOutputType::ERROR}, + {"???", LoggerOutputType::ENUM_COUNT}}}; static constexpr bool matchingOrder(int idx = 0) { return LoggerMap[idx].type == LoggerOutputType::ENUM_COUNT || - ((idx == (int) LoggerMap[idx].type) && matchingOrder(idx + 1)); + ((idx == (int)LoggerMap[idx].type) && matchingOrder(idx + 1)); } static_assert(matchingOrder(), "LoggerTypeName map is out of order"); const char* toString(LoggerOutputType t) { - if(t < VERBOSE || t >= ENUM_COUNT) { + if (t < VERBOSE || t >= ENUM_COUNT) { return LoggerMap[ENUM_COUNT].name; } return LoggerMap[(int)t].name; @@ -56,5 +56,4 @@ LoggerOutputType toLoggerOutputType(const std::string& str) { } // namespace libkineto - #endif // !USE_GOOGLE_LOG diff --git a/libkineto/src/InvariantViolations.h b/libkineto/src/InvariantViolations.h index 7ff897601..4b51cf1f7 100644 --- a/libkineto/src/InvariantViolations.h +++ b/libkineto/src/InvariantViolations.h @@ -13,14 +13,14 @@ namespace KINETO_NAMESPACE { class InvariantViolationsLogger { - public: - virtual ~InvariantViolationsLogger() = default; - virtual void logInvariantViolation( + public: + virtual ~InvariantViolationsLogger() = default; + virtual void logInvariantViolation( const std::string& profile_id, const std::string& assertion, const std::string& error, const std::string& group_profile_id) = 0; - static void registerFactory(); + static void registerFactory(); }; -} +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/IpcFabricConfigClient.cpp b/libkineto/src/IpcFabricConfigClient.cpp index 7e70d9514..2e1c18429 100644 --- a/libkineto/src/IpcFabricConfigClient.cpp +++ b/libkineto/src/IpcFabricConfigClient.cpp @@ -10,9 +10,9 @@ #include "IpcFabricConfigClient.h" +#include #include #include -#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY 
facebook-hte-RelativeInclude @@ -31,28 +31,28 @@ std::string generate_uuid_v4() { int i; ss << std::hex; for (i = 0; i < 8; i++) { - ss << dis(gen); + ss << dis(gen); } ss << "-"; for (i = 0; i < 4; i++) { - ss << dis(gen); + ss << dis(gen); } ss << "-4"; for (i = 0; i < 3; i++) { - ss << dis(gen); + ss << dis(gen); } ss << "-"; ss << dis2(gen); for (i = 0; i < 3; i++) { - ss << dis(gen); + ss << dis(gen); } ss << "-"; for (i = 0; i < 12; i++) { - ss << dis(gen); + ss << dis(gen); } return ss.str(); } -} +} // namespace uuid static std::vector getPids() { const auto& pids = pidCommandPairsOfAncestors(); @@ -78,15 +78,15 @@ static int64_t getJobId() { return strtoll(id, nullptr, 10); } -IpcFabricConfigClient::IpcFabricConfigClient() : jobId_(getJobId()), pids_(getPids()), ipcFabricEnabled_(true) { - +IpcFabricConfigClient::IpcFabricConfigClient() + : jobId_(getJobId()), pids_(getPids()), ipcFabricEnabled_(true) { // setup IPC Fabric std::string ep_name = "dynoconfigclient" + uuid::generate_uuid_v4(); fabricManager_ = ::dynolog::ipcfabric::FabricManager::factory(ep_name); #ifdef ENABLE_IPC_FABRIC - LOG(INFO) << "Setting up IPC Fabric at endpoint: " << ep_name - << " status = " << (fabricManager_ ? "initialized" : "failed (null)"); + LOG(INFO) << "Setting up IPC Fabric at endpoint: " << ep_name << " status = " + << (fabricManager_ ? "initialized" : "failed (null)"); #endif } @@ -98,7 +98,6 @@ constexpr const char* kDynoIpcName = "dynolog"; constexpr int maxIpcRetries = 5; constexpr int kSleepUs = 10000; - int32_t IpcFabricConfigClient::registerInstance(int32_t gpu) { if (!ipcFabricEnabled_) { return -1; @@ -111,27 +110,27 @@ int32_t IpcFabricConfigClient::registerInstance(int32_t gpu) { // Setup message ::dynolog::ipcfabric::LibkinetoContext ctxt{ - .gpu=gpu, - .pid=getpid(), - .jobid=jobId_ - }; + .gpu = gpu, .pid = getpid(), .jobid = jobId_}; std::unique_ptr<::dynolog::ipcfabric::Message> msg = - ::dynolog::ipcfabric::Message::constructMessage( - ctxt, "ctxt"); + ::dynolog::ipcfabric::Message::constructMessage( + ctxt, "ctxt"); try { if (!fabricManager_->sync_send(*msg, std::string(kDynoIpcName))) { - LOG(ERROR) << "Failed to register pid " << ctxt.pid << " with dyno: IPC sync_send fail"; + LOG(ERROR) << "Failed to register pid " << ctxt.pid + << " with dyno: IPC sync_send fail"; return -1; } msg = fabricManager_->poll_recv(maxIpcRetries, kSleepUs); if (!msg) { - LOG(ERROR) << "Failed to register pid " << ctxt.pid << " with dyno: IPC recv fail"; + LOG(ERROR) << "Failed to register pid " << ctxt.pid + << " with dyno: IPC recv fail"; return -1; } } catch (const std::runtime_error& ex) { - LOG(ERROR) << "Failed to send/recv registering pic over fabric: " << ex.what(); + LOG(ERROR) << "Failed to send/recv registering pic over fabric: " + << ex.what(); return -1; } @@ -144,7 +143,8 @@ std::string IpcFabricConfigClient::getLibkinetoBaseConfig() { return ""; } - LOG(WARNING) << "Missing IPC Fabric implementation for getLibkinetoBaseConfig"; + LOG(WARNING) + << "Missing IPC Fabric implementation for getLibkinetoBaseConfig"; return ""; } @@ -159,7 +159,10 @@ std::string IpcFabricConfigClient::getLibkinetoOndemandConfig(int32_t type) { } int size = pids_.size(); - ::dynolog::ipcfabric::LibkinetoRequest* req = (::dynolog::ipcfabric::LibkinetoRequest*)malloc(sizeof(::dynolog::ipcfabric::LibkinetoRequest) + sizeof(int32_t) * size); + ::dynolog::ipcfabric::LibkinetoRequest* req = + (::dynolog::ipcfabric::LibkinetoRequest*)malloc( + sizeof(::dynolog::ipcfabric::LibkinetoRequest) + + sizeof(int32_t) * 
size); req->type = type; req->n = size; req->jobid = jobId_; @@ -167,12 +170,14 @@ std::string IpcFabricConfigClient::getLibkinetoOndemandConfig(int32_t type) { req->pids[i] = pids_[i]; } std::unique_ptr<::dynolog::ipcfabric::Message> msg = - ::dynolog::ipcfabric::Message::constructMessage<::dynolog::ipcfabric::LibkinetoRequest, int32_t>( - *req, "req", size); + ::dynolog::ipcfabric::Message:: + constructMessage<::dynolog::ipcfabric::LibkinetoRequest, int32_t>( + *req, "req", size); try { if (!fabricManager_->sync_send(*msg, std::string(kDynoIpcName))) { - LOG(ERROR) << "Failed to send config type=" << type << " to dyno: IPC sync_send fail"; + LOG(ERROR) << "Failed to send config type=" << type + << " to dyno: IPC sync_send fail"; free(req); req = nullptr; return ""; @@ -180,16 +185,17 @@ std::string IpcFabricConfigClient::getLibkinetoOndemandConfig(int32_t type) { free(req); msg = fabricManager_->poll_recv(maxIpcRetries, kSleepUs); if (!msg) { - LOG(ERROR) << "Failed to receive ondemand config type=" << type << " from dyno: IPC recv fail"; + LOG(ERROR) << "Failed to receive ondemand config type=" << type + << " from dyno: IPC recv fail"; return ""; } } catch (const std::runtime_error& ex) { - LOG(ERROR) << "Failed to recv ondemand config over ipc fabric: " << ex.what(); + LOG(ERROR) << "Failed to recv ondemand config over ipc fabric: " + << ex.what(); free(req); return ""; } - return std::string((char*)msg->buf.get(), msg->metadata.size); } @@ -202,7 +208,8 @@ int32_t IpcFabricConfigClient::registerInstance(int32_t /*gpu*/) { std::string IpcFabricConfigClient::getLibkinetoBaseConfig() { return ""; } -std::string IpcFabricConfigClient::getLibkinetoOndemandConfig(int32_t /*type*/) { +std::string IpcFabricConfigClient::getLibkinetoOndemandConfig( + int32_t /*type*/) { return ""; } diff --git a/libkineto/src/IpcFabricConfigClient.h b/libkineto/src/IpcFabricConfigClient.h index 8be1c9a85..6341476b7 100644 --- a/libkineto/src/IpcFabricConfigClient.h +++ b/libkineto/src/IpcFabricConfigClient.h @@ -54,7 +54,6 @@ enum LibkinetoConfigType { ACTIVITIES = 0x2, }; - // IpcFabricConfigClient : connects to a daemon using the IPC Fabric // this can be used as a base class for other Daemon Config clients as well. class IpcFabricConfigClient { @@ -77,8 +76,8 @@ class IpcFabricConfigClient { } protected: - // Temporarily keep both int and string job id until IPC related code is updated to handle - // string job id. + // Temporarily keep both int and string job id until IPC related code is + // updated to handle string job id. 
int64_t jobId_; std::string jobIdStr_; std::vector pids_; diff --git a/libkineto/src/Logger.cpp b/libkineto/src/Logger.cpp index 7a4b771d9..7c5858ff6 100644 --- a/libkineto/src/Logger.cpp +++ b/libkineto/src/Logger.cpp @@ -13,11 +13,11 @@ #ifndef USE_GOOGLE_LOG +#include #include #include #include #include -#include #include #include @@ -30,10 +30,12 @@ std::atomic_int Logger::severityLevel_{VERBOSE}; std::atomic_int Logger::verboseLogLevel_{-1}; std::atomic Logger::verboseLogModules_{~0ull}; - Logger::Logger(int severity, int line, const char* filePath, int errnum) - : buf_(), out_(LIBKINETO_DBG_STREAM), errnum_(errnum), messageSeverity_(severity) { - buf_ << toString((LoggerOutputType) severity) << ":"; + : buf_(), + out_(LIBKINETO_DBG_STREAM), + errnum_(errnum), + messageSeverity_(severity) { + buf_ << toString((LoggerOutputType)severity) << ":"; const auto tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); @@ -54,9 +56,10 @@ Logger::~Logger() { { std::lock_guard guard(loggerObserversMutex()); for (auto* observer : loggerObservers()) { - // Output to observers. Current Severity helps keep track of which bucket the output goes. + // Output to observers. Current Severity helps keep track of which bucket + // the output goes. if (observer) { - observer->write(buf_.str(), (LoggerOutputType) messageSeverity_); + observer->write(buf_.str(), (LoggerOutputType)messageSeverity_); } } } @@ -139,7 +142,9 @@ void Logger::setLoggerObserverOnDemand() { } } -void Logger::addLoggerObserverAddMetadata(const std::string& key, const std::string& value) { +void Logger::addLoggerObserverAddMetadata( + const std::string& key, + const std::string& value) { std::lock_guard guard(loggerObserversMutex()); for (auto observer : loggerObservers()) { observer->addMetadata(key, value); diff --git a/libkineto/src/Logger.h b/libkineto/src/Logger.h index 6d7fa6c06..3b9caafdf 100644 --- a/libkineto/src/Logger.h +++ b/libkineto/src/Logger.h @@ -35,8 +35,8 @@ #include #include #include -#include #include +#include #include // TODO(T90238193) @@ -93,9 +93,9 @@ class Logger { return (!s[off] ? 57ull : (hash_rec(s, off + 1) * 293) ^ s[off]); } static constexpr const char* basename(const char* s, int off = 0) { - return !s[off] - ? s - : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); + return !s[off] ? s + : s[off] == '/' ? basename(&s[off + 1]) + : basename(s, off + 1); } static void setVerboseLogModules(const std::vector& modules); @@ -127,7 +127,9 @@ class Logger { static void setLoggerObserverOnDemand(); - static void addLoggerObserverAddMetadata(const std::string& key, const std::string& value); + static void addLoggerObserverAddMetadata( + const std::string& key, + const std::string& value); private: std::stringstream buf_; @@ -177,12 +179,12 @@ class VoidLogger { #undef LOG_OCCURRENCES #endif -#define LOG_IS_ON(severity) \ - (severity >= libkineto::Logger::severityLevel()) +#define LOG_IS_ON(severity) (severity >= libkineto::Logger::severityLevel()) -#define LOG_IF(severity, condition) \ - !(LOG_IS_ON(severity) && (condition)) ? (void)0 : libkineto::VoidLogger() & \ - libkineto::Logger(severity, __LINE__, __FILE__).stream() +#define LOG_IF(severity, condition) \ + !(LOG_IS_ON(severity) && (condition)) ? 
(void)0 \ + : libkineto::VoidLogger() & \ + libkineto::Logger(severity, __LINE__, __FILE__).stream() #define LOG(severity) LOG_IF(severity, true) @@ -206,7 +208,7 @@ template struct __to_constant__ { static const uint64_t val = n; }; -#define FILENAME_HASH \ +#define FILENAME_HASH \ __to_constant__::val #define VLOG_IS_ON(verbosity) \ @@ -226,10 +228,9 @@ struct __to_constant__ { #define PLOG(severity) \ libkineto::Logger(severity, __LINE__, __FILE__, errno).stream() -#define SET_LOG_SEVERITY_LEVEL(level) \ - libkineto::Logger::setSeverityLevel(level) +#define SET_LOG_SEVERITY_LEVEL(level) libkineto::Logger::setSeverityLevel(level) -#define SET_LOG_VERBOSITY_LEVEL(level, modules) \ +#define SET_LOG_VERBOSITY_LEVEL(level, modules) \ libkineto::Logger::setVerboseLogLevel(level); \ libkineto::Logger::setVerboseLogModules(modules) @@ -265,7 +266,6 @@ struct __to_constant__ { #define LOGGER_OBSERVER_ADD_METADATA(key, value) \ libkineto::Logger::addLoggerObserverAddMetadata(key, value) - // UST Logger Semantics to describe when a stage is complete. #define UST_LOGGER_MARK_COMPLETED(stage) \ LOG(libkineto::LoggerOutputType::STAGE) << "Completed Stage: " << stage diff --git a/libkineto/src/LoggerCollector.h b/libkineto/src/LoggerCollector.h index d75ed49ae..eb62c1ce8 100644 --- a/libkineto/src/LoggerCollector.h +++ b/libkineto/src/LoggerCollector.h @@ -36,7 +36,8 @@ class LoggerCollector : public ILoggerObserver { buckets_[ot].push_back(message); } - const std::map> extractCollectorMetadata() override { + const std::map> + extractCollectorMetadata() override { return buckets_; } @@ -69,12 +70,12 @@ class LoggerCollector : public ILoggerObserver { protected: std::map> buckets_; - // These are useful metadata to collect from CUPTIActivityProfiler for internal tracking. + // These are useful metadata to collect from CUPTIActivityProfiler for + // internal tracking. std::set devices; int64_t trace_duration_ms{0}; std::atomic event_count{0}; std::set destinations; - }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h index 677e3507f..8dcff4995 100644 --- a/libkineto/src/RoctracerActivity.h +++ b/libkineto/src/RoctracerActivity.h @@ -9,19 +9,19 @@ #pragma once #include -#include #include +#include #include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "ITraceActivity.h" #include "GenericTraceActivity.h" -#include "ThreadUtil.h" +#include "ITraceActivity.h" #include "RoctracerLogger.h" +#include "ThreadUtil.h" namespace libkineto { - class ActivityLogger; +class ActivityLogger; } namespace KINETO_NAMESPACE { @@ -34,25 +34,39 @@ struct TraceSpan; // using the ITraceActivity interface and logged via ActivityLogger. // Abstract base class, templated on Roctracer activity type -template +template struct RoctracerActivity : public ITraceActivity { explicit RoctracerActivity(const T* activity, const ITraceActivity* linked) : activity_(*activity), linked_(linked) {} - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC domain (in ns). - // Convert the timestamps. + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC + // domain (in ns). Convert the timestamps. 
int64_t timestamp() const override { return activity_.begin; } int64_t duration() const override { return activity_.end - activity_.begin; } - int64_t correlationId() const override {return 0;} - int32_t getThreadId() const override {return 0;} - const ITraceActivity* linkedActivity() const override {return linked_;} - int flowType() const override {return kLinkAsyncCpuGpu;} - int flowId() const override {return correlationId();} - const T& raw() const {return activity_;} - const TraceSpan* traceSpan() const override {return nullptr;} + int64_t correlationId() const override { + return 0; + } + int32_t getThreadId() const override { + return 0; + } + const ITraceActivity* linkedActivity() const override { + return linked_; + } + int flowType() const override { + return kLinkAsyncCpuGpu; + } + int flowId() const override { + return correlationId(); + } + const T& raw() const { + return activity_; + } + const TraceSpan* traceSpan() const override { + return nullptr; + } const std::string getMetadataValue(const std::string& key) const override { auto it = metadata_.find(key); if (it != metadata_.end()) { @@ -92,17 +106,28 @@ struct GpuActivity : public RoctracerActivity { break; } } - int64_t correlationId() const override {return activity_.id;} - int64_t deviceId() const override {return activity_.device;} - int64_t resourceId() const override {return activity_.queue;} - ActivityType type() const override {return type_;}; - bool flowStart() const override {return false;} + int64_t correlationId() const override { + return activity_.id; + } + int64_t deviceId() const override { + return activity_.device; + } + int64_t resourceId() const override { + return activity_.queue; + } + ActivityType type() const override { + return type_; + }; + bool flowStart() const override { + return false; + } const std::string name() const override; void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; - // Add small buffer to fix visual error created by https://github.com/ROCm/roctracer/issues/105 - // Once this is resolved we can use ifdef to handle having this buffer or not based on version + // Add small buffer to fix visual error created by + // https://github.com/ROCm/roctracer/issues/105 Once this is resolved we can + // use ifdef to handle having this buffer or not based on version int64_t timestamp() const override { return activity_.begin + 1; } @@ -111,25 +136,37 @@ struct GpuActivity : public RoctracerActivity { } private: - ActivityType type_; + ActivityType type_; }; -// roctracerRow, roctracerKernelRow, roctracerCopyRow, roctracerMallocRow - Roctracer runtime activities +// roctracerRow, roctracerKernelRow, roctracerCopyRow, roctracerMallocRow - +// Roctracer runtime activities template struct RuntimeActivity : public RoctracerActivity { - explicit RuntimeActivity( - const T* activity, - const ITraceActivity* linked) + explicit RuntimeActivity(const T* activity, const ITraceActivity* linked) : RoctracerActivity(activity, linked) {} - int64_t correlationId() const override {return raw().id;} - int64_t deviceId() const override {return raw().pid;} - int64_t resourceId() const override {return raw().tid;} - ActivityType type() const override {return ActivityType::CUDA_RUNTIME;} + int64_t correlationId() const override { + return raw().id; + } + int64_t deviceId() const override { + return raw().pid; + } + int64_t resourceId() const override { + return raw().tid; + } + ActivityType type() const override { + return ActivityType::CUDA_RUNTIME; + } bool 
flowStart() const override; - const std::string name() const override {return std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, raw().cid, 0));} + const std::string name() const override { + return std::string( + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, raw().cid, 0)); + } void log(ActivityLogger& logger) const override; const std::string metadataJson() const override; - const T& raw() const {return RoctracerActivity::raw();} + const T& raw() const { + return RoctracerActivity::raw(); + } }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp index ec4928962..34fa97aa0 100644 --- a/libkineto/src/RoctracerActivityApi.cpp +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -8,15 +8,15 @@ #include "RoctracerActivityApi.h" -#include "ApproximateClock.h" -#include +#include #include +#include #include -#include -#include "Logger.h" +#include "ApproximateClock.h" #include "Demangle.h" -#include "output_base.h" +#include "Logger.h" #include "ThreadUtil.h" +#include "output_base.h" using namespace std::chrono; @@ -28,8 +28,7 @@ RoctracerActivityApi& RoctracerActivityApi::singleton() { } RoctracerActivityApi::RoctracerActivityApi() -: d(&RoctracerLogger::singleton()) { -} + : d(&RoctracerLogger::singleton()) {} RoctracerActivityApi::~RoctracerActivityApi() { disableActivities(std::set()); @@ -40,7 +39,8 @@ void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { if (!singleton().d->externalCorrelationEnabled_) { return; } - singleton().d->pushCorrelationID(id, static_cast(type)); + singleton().d->pushCorrelationID( + id, static_cast(type)); #endif } @@ -49,13 +49,14 @@ void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { if (!singleton().d->externalCorrelationEnabled_) { return; } - singleton().d->popCorrelationID(static_cast(type)); + singleton().d->popCorrelationID( + static_cast(type)); #endif } void RoctracerActivityApi::setMaxBufferSize(int size) { // FIXME: implement? - //maxGpuBufferCount_ = 1 + size / kBufSize; + // maxGpuBufferCount_ = 1 + size / kBufSize; } inline bool inRange(int64_t start, int64_t end, int64_t stamp) { @@ -72,29 +73,35 @@ void RoctracerActivityApi::setTimeOffset(timestamp_t toffset) { int RoctracerActivityApi::processActivities( std::function handler, - std::function correlationHandler) { + std::function + correlationHandler) { // Find offset to map from monotonic clock to system clock. // This will break time-ordering of events but is status quo. 
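// [Editorial aside, not part of the patch] The toffset_ applied further down
// maps CLOCK_MONOTONIC timestamps onto the system clock. A minimal sketch of
// how such an offset could be derived is shown below; the helper name
// monotonicToSystemOffsetNs is hypothetical and only illustrates the idea.
//
//   #include <chrono>
//   #include <cstdint>
//   #include <ctime>
//
//   int64_t monotonicToSystemOffsetNs() {
//     timespec mono;
//     clock_gettime(CLOCK_MONOTONIC, &mono);
//     int64_t monoNs = int64_t(mono.tv_sec) * 1000000000LL + mono.tv_nsec;
//     int64_t sysNs = std::chrono::duration_cast<std::chrono::nanoseconds>(
//                         std::chrono::system_clock::now().time_since_epoch())
//                         .count();
//     // Adding this offset to a monotonic stamp approximates system time;
//     // the two clocks are sampled back-to-back, so a small skew remains.
//     return sysNs - monoNs;
//   }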
int count = 0; // Process all external correlations pairs - for (int it = RoctracerLogger::CorrelationDomain::begin; it < RoctracerLogger::CorrelationDomain::end; ++it) { - auto &externalCorrelations = d->externalCorrelations_[it]; - for (auto &item : externalCorrelations) { - correlationHandler(item.first, item.second, static_cast(it)); + for (int it = RoctracerLogger::CorrelationDomain::begin; + it < RoctracerLogger::CorrelationDomain::end; + ++it) { + auto& externalCorrelations = d->externalCorrelations_[it]; + for (auto& item : externalCorrelations) { + correlationHandler( + item.first, + item.second, + static_cast(it)); } std::lock_guard lock(d->externalCorrelationsMutex_); externalCorrelations.clear(); } // All Runtime API Calls - for (auto &item : d->rows_) { + for (auto& item : d->rows_) { bool filtered = false; - if (item->type != ROCTRACER_ACTIVITY_ASYNC && !isLogged(ActivityType::CUDA_RUNTIME)) { + if (item->type != ROCTRACER_ACTIVITY_ASYNC && + !isLogged(ActivityType::CUDA_RUNTIME)) { filtered = true; - } - else { + } else { switch (reinterpret_cast(item)->kind) { case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: @@ -115,13 +122,15 @@ int RoctracerActivityApi::processActivities( if (!isLogged(ActivityType::CONCURRENT_KERNEL)) filtered = true; // Don't record barriers/markers - if (reinterpret_cast(item)->op == HIP_OP_ID_BARRIER) + if (reinterpret_cast(item)->op == + HIP_OP_ID_BARRIER) filtered = true; break; } } if (!filtered) { - // Convert the begin and end timestamps from monotonic clock to system clock. + // Convert the begin and end timestamps from monotonic clock to system + // clock. item->begin = item->begin + toffset_; item->end = item->end + toffset_; handler(item); @@ -135,7 +144,6 @@ void RoctracerActivityApi::clearActivities() { d->clearLogs(); } - void RoctracerActivityApi::enableActivities( const std::set& selected_activities) { #ifdef HAS_ROCTRACER @@ -144,7 +152,7 @@ void RoctracerActivityApi::enableActivities( for (const auto& activity : selected_activities) { activityMask_ |= (1 << static_cast(activity)); if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = true; + d->externalCorrelationEnabled_ = true; } } #endif @@ -160,7 +168,7 @@ void RoctracerActivityApi::disableActivities( for (const auto& activity : selected_activities) { activityMask_ &= ~(1 << static_cast(activity)); if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = false; + d->externalCorrelationEnabled_ = false; } } #endif diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h index e66c9a1e7..491f13f7e 100644 --- a/libkineto/src/RoctracerActivityApi.h +++ b/libkineto/src/RoctracerActivityApi.h @@ -8,11 +8,11 @@ #pragma once -#include -#include -#include #include #include +#include +#include +#include #ifdef HAS_ROCTRACER #include @@ -30,10 +30,7 @@ using namespace libkineto; class RoctracerActivityApi { public: - enum CorrelationFlowType { - Default, - User - }; + enum CorrelationFlowType { Default, User }; RoctracerActivityApi(); RoctracerActivityApi(const RoctracerActivityApi&) = delete; @@ -46,17 +43,17 @@ class RoctracerActivityApi { static void pushCorrelationID(int id, CorrelationFlowType type); static void popCorrelationID(CorrelationFlowType type); - void enableActivities( - const std::set& selected_activities); - void disableActivities( - const std::set& selected_activities); + void enableActivities(const std::set& selected_activities); + void 
disableActivities(const std::set& selected_activities); void clearActivities(); void teardownContext() {} void setTimeOffset(timestamp_t toffset); virtual int processActivities( - std::function handler, - std::function correlationHandler); + std::function handler, + std::function< + void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> + correlationHandler); void setMaxBufferSize(int size); @@ -71,7 +68,7 @@ class RoctracerActivityApi { uint32_t activityMaskSnapshot_{0}; bool isLogged(libkineto::ActivityType atype); - RoctracerLogger *d; + RoctracerLogger* d; }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index c76f6fb12..f2ee41167 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -74,23 +74,18 @@ void getMemcpySrcDstString(uint32_t kind, std::string& src, std::string& dst) { inline const std::string GpuActivity::name() const { if (type_ == ActivityType::CONCURRENT_KERNEL) { - const char *name = roctracer_op_string(raw().domain, raw().op, raw().kind); - return demangle(raw().kernelName.length() > 0 ? raw().kernelName : std::string(name)); - } - else if (type_ == ActivityType::GPU_MEMSET) { - return fmt::format( - "Memset ({})", - getGpuActivityKindString(raw().kind)); - } - else if (type_ == ActivityType::GPU_MEMCPY) { + const char* name = roctracer_op_string(raw().domain, raw().op, raw().kind); + return demangle( + raw().kernelName.length() > 0 ? raw().kernelName : std::string(name)); + } else if (type_ == ActivityType::GPU_MEMSET) { + return fmt::format("Memset ({})", getGpuActivityKindString(raw().kind)); + } else if (type_ == ActivityType::GPU_MEMCPY) { std::string src = ""; std::string dst = ""; getMemcpySrcDstString(raw().kind, src, dst); return fmt::format( - "Memcpy {} ({} -> {})", - getGpuActivityKindString(raw().kind), src, dst); - } - else { + "Memcpy {} ({} -> {})", getGpuActivityKindString(raw().kind), src, dst); + } else { return ""; } } @@ -125,85 +120,112 @@ inline const std::string GpuActivity::metadataJson() const { template inline bool RuntimeActivity::flowStart() const { - bool should_correlate = - raw().cid == HIP_API_ID_hipLaunchKernel || + bool should_correlate = raw().cid == HIP_API_ID_hipLaunchKernel || raw().cid == HIP_API_ID_hipExtLaunchKernel || raw().cid == HIP_API_ID_hipLaunchCooperativeKernel || raw().cid == HIP_API_ID_hipHccModuleLaunchKernel || raw().cid == HIP_API_ID_hipModuleLaunchKernel || raw().cid == HIP_API_ID_hipExtModuleLaunchKernel || - raw().cid == HIP_API_ID_hipMalloc || - raw().cid == HIP_API_ID_hipFree || + raw().cid == HIP_API_ID_hipMalloc || raw().cid == HIP_API_ID_hipFree || raw().cid == HIP_API_ID_hipMemcpy || raw().cid == HIP_API_ID_hipMemcpyAsync || raw().cid == HIP_API_ID_hipMemcpyWithStream; return should_correlate; } -template +template inline void RuntimeActivity::log(ActivityLogger& logger) const { logger.handleActivity(*this); } -template<> -inline const std::string RuntimeActivity::metadataJson() const { +template <> +inline const std::string RuntimeActivity::metadataJson() + const { std::string kernel = ""; if ((raw().functionAddr != nullptr)) { - kernel = fmt::format(R"JSON( + kernel = fmt::format( + R"JSON( "kernel": "{}", )JSON", - demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); - } - else if ((raw().function != nullptr)) { - kernel = fmt::format(R"JSON( + demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); + } else if ((raw().function != nullptr)) { + kernel 
= fmt::format( + R"JSON( "kernel": "{}", )JSON", - demangle(hipKernelNameRef(raw().function))); + demangle(hipKernelNameRef(raw().function))); } - //cache grid and block so we can pass it into async activity (GPU track) - correlationToGrid[raw().id] = fmt::format(R"JSON( + // cache grid and block so we can pass it into async activity (GPU track) + correlationToGrid[raw().id] = fmt::format( + R"JSON( [{}, {}, {}])JSON", - raw().gridX, raw().gridY, raw().gridZ); - - correlationToBlock[raw().id] = fmt::format(R"JSON( + raw().gridX, + raw().gridY, + raw().gridZ); + + correlationToBlock[raw().id] = fmt::format( + R"JSON( [{}, {}, {}])JSON", - raw().workgroupX, raw().workgroupY, raw().workgroupZ); - + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ); - return fmt::format(R"JSON( + return fmt::format( + R"JSON( {}"cid": {}, "correlation": {}, "grid": [{}, {}, {}], "block": [{}, {}, {}], "shared memory": {})JSON", - kernel, raw().cid, raw().id, - raw().gridX, raw().gridY, raw().gridZ, - raw().workgroupX, raw().workgroupY, raw().workgroupZ, + kernel, + raw().cid, + raw().id, + raw().gridX, + raw().gridY, + raw().gridZ, + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ, raw().groupSegmentSize); } -template<> -inline const std::string RuntimeActivity::metadataJson() const { - return fmt::format(R"JSON( +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + return fmt::format( + R"JSON( "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "size": "{}", "kind": "{}")JSON", - raw().cid, raw().id, raw().src, raw().dst, raw().size, fmt::underlying(raw().kind)); + raw().cid, + raw().id, + raw().src, + raw().dst, + raw().size, + fmt::underlying(raw().kind)); } -template<> -inline const std::string RuntimeActivity::metadataJson() const { +template <> +inline const std::string RuntimeActivity::metadataJson() + const { std::string size = ""; if (raw().cid == HIP_API_ID_hipMalloc) { - size = fmt::format(R"JSON( + size = fmt::format( + R"JSON( "size": {}, )JSON", - raw().size); + raw().size); } - return fmt::format(R"JSON( + return fmt::format( + R"JSON( {}"cid": {}, "correlation": {}, "ptr": "{}")JSON", - size, raw().cid, raw().id, raw().ptr); + size, + raw().cid, + raw().id, + raw().ptr); } -template +template inline const std::string RuntimeActivity::metadataJson() const { - return fmt::format(R"JSON( + return fmt::format( + R"JSON( "cid": {}, "correlation": {})JSON", - raw().cid, raw().id); + raw().cid, + raw().id); } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp index e0cff8337..b0484849c 100644 --- a/libkineto/src/RoctracerLogger.cpp +++ b/libkineto/src/RoctracerLogger.cpp @@ -8,29 +8,28 @@ #include "RoctracerLogger.h" -#include -#include #include -#include #include +#include +#include +#include +#include "Demangle.h" #include "Logger.h" #include "ThreadUtil.h" -#include "Demangle.h" using namespace libkineto; using namespace std::chrono; -class Flush -{ -public: +class Flush { + public: std::mutex mutex_; std::atomic maxCorrelationId_; - uint64_t maxCompletedCorrelationId_ {0}; - void reportCorrelation(const uint64_t &cid) { + uint64_t maxCompletedCorrelationId_{0}; + void reportCorrelation(const uint64_t& cid) { uint64_t prev = maxCorrelationId_; - while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid)) - {} + while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid)) { + } } }; static Flush s_flush; @@ -40,8 +39,7 @@ RoctracerLogger& 
RoctracerLogger::singleton() { return instance; } -RoctracerLogger::RoctracerLogger() { -} +RoctracerLogger::RoctracerLogger() {} RoctracerLogger::~RoctracerLogger() { stopLogging(); @@ -49,7 +47,8 @@ RoctracerLogger::~RoctracerLogger() { } namespace { - thread_local std::deque t_externalIds[RoctracerLogger::CorrelationDomain::size]; +thread_local std::deque + t_externalIds[RoctracerLogger::CorrelationDomain::size]; } void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { @@ -74,49 +73,51 @@ void RoctracerLogger::clearLogs() { } void RoctracerLogger::insert_row_to_buffer(roctracerBase* row) { - RoctracerLogger *dis = &singleton(); + RoctracerLogger* dis = &singleton(); std::lock_guard lock(dis->rowsMutex_); if (dis->rows_.size() >= dis->maxBufferSize_) { - LOG_FIRST_N(WARNING, 10) << "Exceeded max GPU buffer count (" - << dis->rows_.size() - << " > " << dis->maxBufferSize_ - << ") - terminating tracing"; + LOG_FIRST_N(WARNING, 10) + << "Exceeded max GPU buffer count (" << dis->rows_.size() << " > " + << dis->maxBufferSize_ << ") - terminating tracing"; return; } dis->rows_.push_back(row); } -void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) -{ - RoctracerLogger *dis = &singleton(); +void RoctracerLogger::api_callback( + uint32_t domain, + uint32_t cid, + const void* callback_data, + void* arg) { + RoctracerLogger* dis = &singleton(); if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); // Pack callbacks into row structures - thread_local std::unordered_map timestamps; + thread_local std::unordered_map + timestamps; if (data->phase == ACTIVITY_API_PHASE_ENTER) { timespec timestamp; - clock_gettime(CLOCK_MONOTONIC, ×tamp); // record proper clock + clock_gettime(CLOCK_MONOTONIC, ×tamp); // record proper clock timestamps[data->correlation_id] = timestamp; - } - else { // (data->phase == ACTIVITY_API_PHASE_EXIT) + } else { // (data->phase == ACTIVITY_API_PHASE_EXIT) timespec startTime; startTime = timestamps[data->correlation_id]; timestamps.erase(data->correlation_id); timespec endTime; - clock_gettime(CLOCK_MONOTONIC, &endTime); // record proper clock + clock_gettime(CLOCK_MONOTONIC, &endTime); // record proper clock switch (cid) { case HIP_API_ID_hipLaunchKernel: case HIP_API_ID_hipExtLaunchKernel: - case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here - { - s_flush.reportCorrelation(data->correlation_id); - auto &args = data->args.hipLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here + { + s_flush.reportCorrelation(data->correlation_id); + auto& args = data->args.hipLaunchKernel; + roctracerKernelRow* row = new roctracerKernelRow( data->correlation_id, domain, cid, @@ -133,18 +134,15 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca args.dimBlocks.y, args.dimBlocks.z, args.sharedMemBytes, - args.stream - ); - insert_row_to_buffer(row); - } - break; + args.stream); + insert_row_to_buffer(row); + } break; case HIP_API_ID_hipHccModuleLaunchKernel: case HIP_API_ID_hipModuleLaunchKernel: - case HIP_API_ID_hipExtModuleLaunchKernel: - { - s_flush.reportCorrelation(data->correlation_id); - auto &args = data->args.hipModuleLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + case HIP_API_ID_hipExtModuleLaunchKernel: { + s_flush.reportCorrelation(data->correlation_id); + auto& args = 
data->args.hipModuleLaunchKernel; + roctracerKernelRow* row = new roctracerKernelRow( data->correlation_id, domain, cid, @@ -161,11 +159,9 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca args.blockDimY, args.blockDimZ, args.sharedMemBytes, - args.stream - ); - insert_row_to_buffer(row); - } - break; + args.stream); + insert_row_to_buffer(row); + } break; case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: #if 0 @@ -194,9 +190,8 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca } #endif break; - case HIP_API_ID_hipMalloc: - { - roctracerMallocRow* row = new roctracerMallocRow( + case HIP_API_ID_hipMalloc: { + roctracerMallocRow* row = new roctracerMallocRow( data->correlation_id, domain, cid, @@ -205,14 +200,11 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca timespec_to_ns(startTime), timespec_to_ns(endTime), data->args.hipMalloc.ptr__val, - data->args.hipMalloc.size - ); - insert_row_to_buffer(row); - } - break; - case HIP_API_ID_hipFree: - { - roctracerMallocRow* row = new roctracerMallocRow( + data->args.hipMalloc.size); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipFree: { + roctracerMallocRow* row = new roctracerMallocRow( data->correlation_id, domain, cid, @@ -221,15 +213,12 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca timespec_to_ns(startTime), timespec_to_ns(endTime), data->args.hipFree.ptr, - 0 - ); - insert_row_to_buffer(row); - } - break; - case HIP_API_ID_hipMemcpy: - { - auto &args = data->args.hipMemcpy; - roctracerCopyRow* row = new roctracerCopyRow( + 0); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipMemcpy: { + auto& args = data->args.hipMemcpy; + roctracerCopyRow* row = new roctracerCopyRow( data->correlation_id, domain, cid, @@ -241,16 +230,14 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca args.dst, args.sizeBytes, args.kind, - static_cast(0) // use placeholder? - ); - insert_row_to_buffer(row); - } - break; + static_cast(0) // use placeholder? 
+ ); + insert_row_to_buffer(row); + } break; case HIP_API_ID_hipMemcpyAsync: - case HIP_API_ID_hipMemcpyWithStream: - { - auto &args = data->args.hipMemcpyAsync; - roctracerCopyRow* row = new roctracerCopyRow( + case HIP_API_ID_hipMemcpyWithStream: { + auto& args = data->args.hipMemcpyAsync; + roctracerCopyRow* row = new roctracerCopyRow( data->correlation_id, domain, cid, @@ -262,64 +249,63 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca args.dst, args.sizeBytes, args.kind, - args.stream - ); - insert_row_to_buffer(row); - } - break; - default: - { - roctracerRow* row = new roctracerRow( + args.stream); + insert_row_to_buffer(row); + } break; + default: { + roctracerRow* row = new roctracerRow( data->correlation_id, domain, cid, processId(), systemThreadId(), timespec_to_ns(startTime), - timespec_to_ns(endTime) - ); - insert_row_to_buffer(row); - } - break; - } // switch + timespec_to_ns(endTime)); + insert_row_to_buffer(row); + } break; + } // switch // External correlation - for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; ++it) { + for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; + ++it) { if (t_externalIds[it].size() > 0) { std::lock_guard lock(dis->externalCorrelationsMutex_); - dis->externalCorrelations_[it].emplace_back(data->correlation_id, t_externalIds[it].back()); + dis->externalCorrelations_[it].emplace_back( + data->correlation_id, t_externalIds[it].back()); } } - } // phase exit + } // phase exit } } -void RoctracerLogger::activity_callback(const char* begin, const char* end, void* arg) -{ - RoctracerLogger *dis = &singleton(); +void RoctracerLogger::activity_callback( + const char* begin, + const char* end, + void* arg) { + RoctracerLogger* dis = &singleton(); - // Log latest completed correlation id. Used to ensure we have flushed all data on stop + // Log latest completed correlation id. Used to ensure we have flushed all + // data on stop std::unique_lock lock(s_flush.mutex_); const roctracer_record_t* record = (const roctracer_record_t*)(begin); const roctracer_record_t* end_record = (const roctracer_record_t*)(end); while (record < end_record) { if (record->correlation_id > s_flush.maxCompletedCorrelationId_) { - s_flush.maxCompletedCorrelationId_ = record->correlation_id; + s_flush.maxCompletedCorrelationId_ = record->correlation_id; } roctracerAsyncRow* row = new roctracerAsyncRow( - record->correlation_id, - record->domain, - record->kind, - record->op, - record->device_id, - record->queue_id, - record->begin_ns, - record->end_ns, - ((record->kind == HIP_OP_DISPATCH_KIND_KERNEL_) - || (record->kind == HIP_OP_DISPATCH_KIND_TASK_)) - ? demangle(record->kernel_name) - : std::string() - ); + record->correlation_id, + record->domain, + record->kind, + record->op, + record->device_id, + record->queue_id, + record->begin_ns, + record->end_ns, + ((record->kind == HIP_OP_DISPATCH_KIND_KERNEL_) || + (record->kind == HIP_OP_DISPATCH_KIND_TASK_)) + ? 
demangle(record->kernel_name) + : std::string()); insert_row_to_buffer(row); roctracer_next_record(record, &record); } @@ -327,10 +313,11 @@ void RoctracerLogger::activity_callback(const char* begin, const char* end, void void RoctracerLogger::startLogging() { if (!registered_) { - roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation + roctracer_set_properties( + ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation // Set some api calls to ignore - loggedIds_.setInvertMode(true); // Omit the specified api + loggedIds_.setInvertMode(true); // Omit the specified api loggedIds_.add("hipGetDevice"); loggedIds_.add("hipSetDevice"); loggedIds_.add("hipGetLastError"); @@ -348,22 +335,26 @@ void RoctracerLogger::startLogging() { // Enable API callbacks if (loggedIds_.invertMode() == true) { - // exclusion list - enable entire domain and turn off things in list - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); - const std::unordered_map &filter = loggedIds_.filterList(); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); - } - } - else { - // inclusion list - only enable things in the list - const std::unordered_map &filter = loggedIds_.filterList(); - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); - } + // exclusion list - enable entire domain and turn off things in list + roctracer_enable_domain_callback( + ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); + const std::unordered_map& filter = + loggedIds_.filterList(); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); + } + } else { + // inclusion list - only enable things in the list + const std::unordered_map& filter = + loggedIds_.filterList(); + roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_enable_op_callback( + ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); + } } - //roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, nullptr); + // roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, + // nullptr); // Allocate default tracing pool roctracer_properties_t properties; @@ -401,7 +392,8 @@ void RoctracerLogger::stopLogging() { // If we are stopping the tracer, implement reliable flushing std::unique_lock lock(s_flush.mutex_); - auto correlationId = s_flush.maxCorrelationId_.load(); // load ending id from the running max + auto correlationId = + s_flush.maxCorrelationId_.load(); // load ending id from the running max // Poll on the worker finding the final correlation id int timeout = 50; @@ -418,7 +410,7 @@ void RoctracerLogger::stopLogging() { void RoctracerLogger::endTracing() { if (registered_ == true) { roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - //roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); + // roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); roctracer_close_pool_expl(hccPool_); @@ -426,33 +418,29 @@ void RoctracerLogger::endTracing() { } } +ApiIdList::ApiIdList() : invert_(true) {} -ApiIdList::ApiIdList() -: invert_(true) -{ -} - -void ApiIdList::add(const std::string &apiName) -{ +void ApiIdList::add(const std::string& 
apiName) { uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { + if (roctracer_op_code( + ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS) { filter_[cid] = 1; } } -void ApiIdList::remove(const std::string &apiName) -{ +void ApiIdList::remove(const std::string& apiName) { uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { + if (roctracer_op_code( + ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS) { filter_.erase(cid); } } -bool ApiIdList::loadUserPrefs() -{ +bool ApiIdList::loadUserPrefs() { // placeholder return false; } -bool ApiIdList::contains(uint32_t apiId) -{ - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR +bool ApiIdList::contains(uint32_t apiId) { + return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR } diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h index b0ae58f66..e602764fd 100644 --- a/libkineto/src/RoctracerLogger.h +++ b/libkineto/src/RoctracerLogger.h @@ -8,23 +8,24 @@ #pragma once +#include +#include #include #include +#include #include +#include #include -#include -#include #include -#include -#include -#include +#include #include -#include #include +#include #include -// Local copy of hip op types. These are public (and stable) in later rocm releases +// Local copy of hip op types. These are public (and stable) in later rocm +// releases typedef enum { HIP_OP_COPY_KIND_UNKNOWN_ = 0, HIP_OP_COPY_KIND_DEVICE_TO_HOST_ = 0x11F3, @@ -42,16 +43,14 @@ typedef enum { HIP_OP_DISPATCH_KIND_TASK_ = 0x11F1 } hip_op_dispatch_kind_t_; -typedef enum { - HIP_OP_BARRIER_KIND_UNKNOWN_ = 0 -} hip_op_barrier_kind_t_; +typedef enum { HIP_OP_BARRIER_KIND_UNKNOWN_ = 0 } hip_op_barrier_kind_t_; // end hip op defines -namespace onnxruntime{ +namespace onnxruntime { namespace profiling { class RocmProfiler; } -} +} // namespace onnxruntime namespace libkineto { class RoctracerActivityApi; @@ -66,13 +65,19 @@ static timestamp_t timespec_to_ns(const timespec& time) { class ApiIdList { public: ApiIdList(); - bool invertMode() { return invert_; } - void setInvertMode(bool invert) { invert_ = invert; } - void add(const std::string &apiName); - void remove(const std::string &apiName); + bool invertMode() { + return invert_; + } + void setInvertMode(bool invert) { + invert_ = invert; + } + void add(const std::string& apiName); + void remove(const std::string& apiName); bool loadUserPrefs(); bool contains(uint32_t apiId); - const std::unordered_map &filterList() { return filter_; } + const std::unordered_map& filterList() { + return filter_; + } private: std::unordered_map filter_; @@ -90,10 +95,13 @@ typedef enum { struct roctracerBase { roctracerBase( - uint64_t id, uint32_t domain, uint64_t begin, uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) - : id(id), begin(begin), end(end), domain(domain), type(type) {} - uint64_t id; // correlation_id + uint64_t id, + uint32_t domain, + uint64_t begin, + uint64_t end, + roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) + : id(id), begin(begin), end(end), domain(domain), type(type) {} + uint64_t id; // correlation_id uint64_t begin; uint64_t end; uint32_t domain; @@ -102,10 +110,18 @@ struct roctracerBase { struct roctracerRow : public roctracerBase { roctracerRow( - uint64_t id, uint32_t domain, uint32_t cid, uint32_t 
pid, - uint32_t tid, uint64_t begin, uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) - : roctracerBase(id, domain, begin, end, type), cid(cid), pid(pid), tid(tid) {} + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) + : roctracerBase(id, domain, begin, end, type), + cid(cid), + pid(pid), + tid(tid) {} uint32_t cid; uint32_t pid; uint32_t tid; @@ -113,17 +129,35 @@ struct roctracerRow : public roctracerBase { struct roctracerKernelRow : public roctracerRow { roctracerKernelRow( - uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid, - uint32_t tid, uint64_t begin, uint64_t end, - const void *faddr, hipFunction_t function, - unsigned int gx, unsigned int gy, unsigned int gz, - unsigned int wx, unsigned int wy, unsigned int wz, - size_t gss, hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), functionAddr(faddr), - function(function), gridX(gx), gridY(gy), gridZ(gz), - workgroupX(wx), workgroupY(wy), workgroupZ(wz), groupSegmentSize(gss), - stream(stream) {} + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* faddr, + hipFunction_t function, + unsigned int gx, + unsigned int gy, + unsigned int gz, + unsigned int wx, + unsigned int wy, + unsigned int wz, + size_t gss, + hipStream_t stream, + roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + functionAddr(faddr), + function(function), + gridX(gx), + gridY(gy), + gridZ(gz), + workgroupX(wx), + workgroupY(wy), + workgroupZ(wz), + groupSegmentSize(gss), + stream(stream) {} const void* functionAddr; hipFunction_t function; unsigned int gridX; @@ -138,15 +172,27 @@ struct roctracerKernelRow : public roctracerRow { struct roctracerCopyRow : public roctracerRow { roctracerCopyRow( - uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid, - uint32_t tid, uint64_t begin, uint64_t end, - const void* src, const void *dst, size_t size, hipMemcpyKind kind, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - src(src), dst(dst), size(size), kind(kind), stream(stream) {} - const void *src; - const void *dst; + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* src, + const void* dst, + size_t size, + hipMemcpyKind kind, + hipStream_t stream, + roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + src(src), + dst(dst), + size(size), + kind(kind), + stream(stream) {} + const void* src; + const void* dst; size_t size; hipMemcpyKind kind; hipStream_t stream; @@ -154,24 +200,41 @@ struct roctracerCopyRow : public roctracerRow { struct roctracerMallocRow : public roctracerRow { roctracerMallocRow( - uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid, - uint32_t tid, uint64_t begin, uint64_t end, - const void* ptr, size_t size, - roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type) - , ptr(ptr), size(size) {} - const void *ptr; + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* ptr, + 
size_t size, + roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + ptr(ptr), + size(size) {} + const void* ptr; size_t size; }; struct roctracerAsyncRow : public roctracerBase { roctracerAsyncRow( - uint64_t id, uint32_t domain, uint32_t kind, uint32_t op, - int device, uint64_t queue, uint64_t begin, - uint64_t end, const std::string &kernelName, - roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) - : roctracerBase(id, domain, begin, end, type), kind(kind), op(op), device(device), - queue(queue), kernelName(kernelName) {} + uint64_t id, + uint32_t domain, + uint32_t kind, + uint32_t op, + int device, + uint64_t queue, + uint64_t begin, + uint64_t end, + const std::string& kernelName, + roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) + : roctracerBase(id, domain, begin, end, type), + kind(kind), + op(op), + device(device), + queue(queue), + kernelName(kernelName) {} uint32_t kind; uint32_t op; int device; @@ -209,9 +272,13 @@ class RoctracerLogger { bool registered_{false}; void endTracing(); - roctracer_pool_t *hccPool_{NULL}; + roctracer_pool_t* hccPool_{NULL}; static void insert_row_to_buffer(roctracerBase* row); - static void api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); + static void api_callback( + uint32_t domain, + uint32_t cid, + const void* callback_data, + void* arg); static void activity_callback(const char* begin, const char* end, void* arg); ApiIdList loggedIds_; @@ -224,7 +291,8 @@ class RoctracerLogger { // This vector collects pairs of correlationId and their respective // externalCorrelationId for each CorrelationDomain. This will be used // to populate the Correlation maps during post processing. - std::vector> externalCorrelations_[CorrelationDomain::size]; + std::vector> + externalCorrelations_[CorrelationDomain::size]; std::mutex externalCorrelationsMutex_; bool externalCorrelationEnabled_{true}; diff --git a/libkineto/src/SampleListener.h b/libkineto/src/SampleListener.h index f156e220b..5224d78f1 100644 --- a/libkineto/src/SampleListener.h +++ b/libkineto/src/SampleListener.h @@ -139,10 +139,11 @@ class SampleListener { SampleListener(const SampleListener&) = delete; SampleListener& operator=(const SampleListener&) = delete; - virtual ~SampleListener(){} + virtual ~SampleListener() {} // Report bucketed & aggregated values for event - virtual void handleSample(int device, const Sample& sample, bool from_new_version) = 0; + virtual void + handleSample(int device, const Sample& sample, bool from_new_version) = 0; virtual void update(const Config& config) = 0; diff --git a/libkineto/src/ScopeExit.h b/libkineto/src/ScopeExit.h index 67e52e8ce..5b7e39524 100644 --- a/libkineto/src/ScopeExit.h +++ b/libkineto/src/ScopeExit.h @@ -30,6 +30,5 @@ ScopeExit makeScopeExit(T t) { #define __kINETO_CONCAT(name, line) name##line #define ANON_VAR(name, line) __kINETO_CONCAT(name, line) -#define SCOPE_EXIT(func) \ - const auto ANON_VAR(SCOPE_BLOCK, __LINE__) = \ - makeScopeExit([=]() { func; }) +#define SCOPE_EXIT(func) \ + const auto ANON_VAR(SCOPE_BLOCK, __LINE__) = makeScopeExit([=]() { func; }) diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index 56da5d0c8..f9ec041e2 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -10,16 +10,16 @@ #ifndef _WIN32 #include -#include #include #include +#include #else // _WIN32 -#include #include +#include #define WIN32_LEAN_AND_MEAN #define NOGDI -#include #include 
+#include #undef ERROR #endif // _WIN32 @@ -37,7 +37,7 @@ namespace { thread_local int32_t _pid = 0; thread_local int32_t _tid = 0; thread_local int32_t _sysTid = 0; -} +} // namespace int32_t processId(bool cache) { int32_t pid = 0; @@ -82,11 +82,11 @@ int32_t threadId() { pthread_threadid_np(nullptr, &tid); _tid = tid; #elif defined _WIN32 - _tid = (int32_t)GetCurrentThreadId(); + _tid = (int32_t)GetCurrentThreadId(); #else - pthread_t pth = pthread_self(); - int32_t* ptr = reinterpret_cast(&pth); - _tid = *ptr; + pthread_t pth = pthread_self(); + int32_t* ptr = reinterpret_cast(&pth); + _tid = *ptr; #endif } return _tid; @@ -96,24 +96,28 @@ namespace { static constexpr size_t kMaxThreadNameLength = 16; static constexpr const char* basename(const char* s, int off = 0) { - return !s[off] - ? s - : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); + return !s[off] ? s + : s[off] == '/' ? basename(&s[off + 1]) + : basename(s, off + 1); } #if defined(_WIN32) -void *getKernel32Func(const char* procName) { - return reinterpret_cast(GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName)); +void* getKernel32Func(const char* procName) { + return reinterpret_cast( + GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName)); } #endif -} +} // namespace bool setThreadName(const std::string& name) { #ifdef __APPLE__ return 0 == pthread_setname_np(name.c_str()); #elif defined _WIN32 - // Per https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription + // Per + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription // Use runtime linking to set thread description - static auto _SetThreadDescription = reinterpret_cast(getKernel32Func("SetThreadDescription")); + static auto _SetThreadDescription = + reinterpret_cast( + getKernel32Func("SetThreadDescription")); if (!_SetThreadDescription) { return false; } @@ -131,16 +135,18 @@ std::string getThreadName() { char buf[kMaxThreadNameLength] = ""; if ( #ifndef __ANDROID__ - pthread_getname_np(pthread_self(), buf, kMaxThreadNameLength) != 0 + pthread_getname_np(pthread_self(), buf, kMaxThreadNameLength) != 0 #else - prctl(PR_GET_NAME, buf, kMaxThreadNameLength) != 0 + prctl(PR_GET_NAME, buf, kMaxThreadNameLength) != 0 #endif ) { return "Unknown"; } return buf; #else // _WIN32 - static auto _GetThreadDescription = reinterpret_cast(getKernel32Func("GetThreadDescription")); + static auto _GetThreadDescription = + reinterpret_cast( + getKernel32Func("GetThreadDescription")); if (!_GetThreadDescription) { return "Unknown"; } @@ -215,7 +221,8 @@ std::vector> pidCommandPairsOfAncestors() { // Usually we want to skip the root process (PID 1), but when running // inside a container the process itself has PID 1, so we need to include it for (int i = 0; i <= kMaxParentPids && (i == 0 || curr_pid > 1); i++) { - std::pair ppid_and_comm = parentPidAndCommand(curr_pid); + std::pair ppid_and_comm = + parentPidAndCommand(curr_pid); pairs.push_back(std::make_pair(curr_pid, ppid_and_comm.second)); curr_pid = ppid_and_comm.first; } diff --git a/libkineto/src/WeakSymbols.cpp b/libkineto/src/WeakSymbols.cpp index f30bc4488..28f743d34 100644 --- a/libkineto/src/WeakSymbols.cpp +++ b/libkineto/src/WeakSymbols.cpp @@ -10,10 +10,12 @@ #ifndef _MSC_VER extern "C" { -// This function is needed to avoid superfluous dependency on GNU OpenMP library when cuPTI is linked statically -// For more details see https://github.com/pytorch/pytorch/issues/51026 +// This 
function is needed to avoid superfluous dependency on GNU OpenMP library +// when cuPTI is linked statically For more details see +// https://github.com/pytorch/pytorch/issues/51026 __attribute__((weak)) int acc_get_device_type() { - throw std::runtime_error("Dummy implementation of acc_get_device_type is not supposed to be called!"); + throw std::runtime_error( + "Dummy implementation of acc_get_device_type is not supposed to be called!"); } } // extern "C" diff --git a/libkineto/src/cupti_strings.cpp b/libkineto/src/cupti_strings.cpp index 2feb799ed..b4a4756bc 100644 --- a/libkineto/src/cupti_strings.cpp +++ b/libkineto/src/cupti_strings.cpp @@ -10,8 +10,7 @@ namespace libkineto { -const char* memcpyKindString( - CUpti_ActivityMemcpyKind kind) { +const char* memcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: return "HtoD"; @@ -39,8 +38,7 @@ const char* memcpyKindString( return ""; } -const char* memoryKindString( - CUpti_ActivityMemoryKind kind) { +const char* memoryKindString(CUpti_ActivityMemoryKind kind) { switch (kind) { case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: return "Unknown"; @@ -65,8 +63,7 @@ const char* memoryKindString( } } -const char* overheadKindString( - CUpti_ActivityOverheadKind kind) { +const char* overheadKindString(CUpti_ActivityOverheadKind kind) { switch (kind) { case CUPTI_ACTIVITY_OVERHEAD_UNKNOWN: return "Unknown"; @@ -85,8 +82,6 @@ const char* overheadKindString( } } - - static const char* runtimeCbidNames[] = { "INVALID", "cudaDriverGetVersion", @@ -533,8 +528,7 @@ static const char* runtimeCbidNames[] = { "cudaStreamGetId_ptsz", "cudaGraphInstantiate", "cuda444", - "SIZE" -}; + "SIZE"}; const char* runtimeCbidName(CUpti_CallbackId cbid) { constexpr int names_size = @@ -545,10 +539,10 @@ const char* runtimeCbidName(CUpti_CallbackId cbid) { return runtimeCbidNames[cbid]; } -// From https://docs.nvidia.com/cupti/modules.html#group__CUPTI__ACTIVITY__API_1g80e1eb47615e31021f574df8ebbe5d9a +// From +// https://docs.nvidia.com/cupti/modules.html#group__CUPTI__ACTIVITY__API_1g80e1eb47615e31021f574df8ebbe5d9a // enum CUpti_ActivitySynchronizationType -const char* syncTypeString( - CUpti_ActivitySynchronizationType kind) { +const char* syncTypeString(CUpti_ActivitySynchronizationType kind) { switch (kind) { case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE: return "Event Sync"; diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index 5d9f9c3cd..6246db8e3 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -17,8 +17,8 @@ #include "DaemonConfigLoader.h" #include "DeviceUtil.h" #ifdef HAS_CUPTI -#include "CuptiCallbackApi.h" #include "CuptiActivityApi.h" +#include "CuptiCallbackApi.h" #include "CuptiRangeProfiler.h" #include "EventProfilerController.h" #endif @@ -125,15 +125,14 @@ void libkineto_init(bool cpuOnly, bool logOnError) { const char* logLevelEnv = getenv("KINETO_LOG_LEVEL"); if (logLevelEnv) { // atoi returns 0 on error, so that's what we want - default to VERBOSE - static_assert (static_cast(VERBOSE) == 0, ""); + static_assert(static_cast(VERBOSE) == 0, ""); SET_LOG_SEVERITY_LEVEL(atoi(logLevelEnv)); } // Factory to connect to open source daemon if present #if __linux__ if (libkineto::isDaemonEnvVarSet()) { - LOG(INFO) << "Registering daemon config loader, cpuOnly = " - << cpuOnly; + LOG(INFO) << "Registering daemon config loader, cpuOnly = " << cpuOnly; DaemonConfigLoader::registerFactory(); } #endif @@ -148,7 +147,7 @@ void libkineto_init(bool cpuOnly, bool logOnError) { 
bool status = false; bool initRangeProfiler = true; - if (cbapi->initSuccess()){ + if (cbapi->initSuccess()) { const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE; status = cbapi->registerCallback( domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers); @@ -162,7 +161,9 @@ void libkineto_init(bool cpuOnly, bool logOnError) { if (enableEventProfiler()) { if (status) { status = cbapi->registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler); + domain, + CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, + stopProfiler); } if (status) { status = cbapi->enableCallback( @@ -178,8 +179,9 @@ void libkineto_init(bool cpuOnly, bool logOnError) { CUPTI_CALL(cbapi->getCuptiStatus()); LOG(WARNING) << "CUPTI initialization failed - " << "CUDA profiler activities will be missing"; - LOG(INFO) << "If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to " - << "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti"; + LOG(INFO) + << "If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to " + << "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti"; } } @@ -200,28 +202,31 @@ void libkineto_init(bool cpuOnly, bool logOnError) { #ifdef HAS_XPUPTI // register xpu pti profiler - libkineto::api().registerProfilerFactory([]() -> std::unique_ptr { - auto returnCode = ptiViewGPULocalAvailable(); - if (returnCode != PTI_SUCCESS) { - std::string errPrefixMsg( - "Fail to enable Kineto Profiler on XPU due to error code: "); - errPrefixMsg = errPrefixMsg + std::to_string(returnCode); + libkineto::api().registerProfilerFactory( + []() -> std::unique_ptr { + auto returnCode = ptiViewGPULocalAvailable(); + if (returnCode != PTI_SUCCESS) { + std::string errPrefixMsg( + "Fail to enable Kineto Profiler on XPU due to error code: "); + errPrefixMsg = errPrefixMsg + std::to_string(returnCode); #if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 9 - std::string errMsg(ptiResultTypeToString(returnCode)); - throw std::runtime_error(errPrefixMsg + std::string(". The detailed error message is: ") + errMsg); + std::string errMsg(ptiResultTypeToString(returnCode)); + throw std::runtime_error( + errPrefixMsg + std::string(". The detailed error message is: ") + + errMsg); #else - throw std::runtime_error(errPrefixMsg); + throw std::runtime_error(errPrefixMsg); #endif - } - return std::make_unique(); - }); + } + return std::make_unique(); + }); #endif // HAS_XPUPTI #if __linux__ // When CUDA/GPU is used the profiler initialization happens on the // creation of the first CUDA stream (see initProfilers()). - // This section bootstraps the profiler and its connection to a profiling daemon - // in the CPU only case. + // This section bootstraps the profiler and its connection to a profiling + // daemon in the CPU only case. 
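// [Editorial aside, not part of the patch] A minimal usage sketch of the
// CPU-only daemon bootstrap guarded below: the application exports the daemon
// env var before libkineto initializes. The literal name "KINETO_USE_DAEMON"
// is assumed from the open-source documentation; the authoritative constant
// is kUseDaemonEnvVar as checked in the code below.
//
//   #include <cstdlib>
//
//   void enableKinetoDaemon() {
//     // Must be set before libkineto initialization runs (e.g. at process start).
//     setenv("KINETO_USE_DAEMON", "1", /*overwrite=*/1);
//   }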
if (cpuOnly && getenv(kUseDaemonEnvVar) != nullptr) { initProfilersCPU(); libkineto::api().configLoader().initBaseConfig(); @@ -238,8 +243,8 @@ int InitializeInjection(void) { } bool hasTestEnvVar() { - return getenv("GTEST_OUTPUT") != nullptr || getenv("FB_TEST") != nullptr - || getenv("PYTORCH_TEST") != nullptr || getenv("TEST_PILOT") != nullptr; + return getenv("GTEST_OUTPUT") != nullptr || getenv("FB_TEST") != nullptr || + getenv("PYTORCH_TEST") != nullptr || getenv("TEST_PILOT") != nullptr; } void suppressLibkinetoLogMessages() { diff --git a/libkineto/src/output_csv.cpp b/libkineto/src/output_csv.cpp index a0d9e2c86..e0930b1a6 100644 --- a/libkineto/src/output_csv.cpp +++ b/libkineto/src/output_csv.cpp @@ -42,7 +42,10 @@ void EventCSVLogger::update(const Config& config) { } } -void EventCSVLogger::handleSample(int device, const Sample& sample, bool from_new_version) { +void EventCSVLogger::handleSample( + int device, + const Sample& sample, + bool from_new_version) { using namespace std::chrono; if (out_) { auto now = system_clock::now(); diff --git a/libkineto/src/output_csv.h b/libkineto/src/output_csv.h index b4981d1ad..72f422120 100644 --- a/libkineto/src/output_csv.h +++ b/libkineto/src/output_csv.h @@ -18,7 +18,8 @@ namespace KINETO_NAMESPACE { class EventCSVLogger : public SampleListener { public: void update(const Config& config) override; - void handleSample(int device, const Sample& sample, bool from_new_version) override; + void handleSample(int device, const Sample& sample, bool from_new_version) + override; protected: EventCSVLogger() : out_(nullptr) {} diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index 0ce8da8f2..ab70ab4b2 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -9,8 +9,8 @@ #include "output_json.h" #include -#include #include +#include #include #include "Config.h" #include "DeviceProperties.h" @@ -18,7 +18,6 @@ #include "Logger.h" - namespace KINETO_NAMESPACE { static constexpr int kSchemaVersion = 1; @@ -60,8 +59,9 @@ ChromeTraceBaseTime& ChromeTraceBaseTime::singleton() { // other applications can accurately read the 'ts' field as a double. // Use the program loading time as the baseline time. inline int64_t transToRelativeTime(int64_t time) { - // Sometimes after converting to relative time, it can be a few nanoseconds negative. - // Since Chrome trace and json processing will throw a parser error, guard this. + // Sometimes after converting to relative time, it can be a few nanoseconds + // negative. Since Chrome trace and json processing will throw a parser error, + // guard this. int64_t res = time - ChromeTraceBaseTime::singleton().get(); if (res < 0) { return 0; @@ -70,7 +70,8 @@ inline int64_t transToRelativeTime(int64_t time) { } void ChromeTraceLogger::sanitizeStrForJSON(std::string& value) { - // Replace all backslashes with forward slash because Windows paths causing JSONDecodeError. + // Replace all backslashes with forward slash because Windows paths causing + // JSONDecodeError. 
std::replace(value.begin(), value.end(), '\\', '/'); // Remove all new line characters value.erase(std::remove(value.begin(), value.end(), '\n'), value.end()); @@ -78,7 +79,7 @@ void ChromeTraceLogger::sanitizeStrForJSON(std::string& value) { void ChromeTraceLogger::metadataToJSON( const std::unordered_map& metadata) { - for (auto [k, v]: metadata) { + for (auto [k, v] : metadata) { std::string sanitizedValue = v; // There is a seperate mechanism for recording distributedInfo in on-demand // so add a guard to prevent "double counting" in auto-trace. @@ -86,20 +87,27 @@ void ChromeTraceLogger::metadataToJSON( distInfo_.distInfo_present_ = true; } sanitizeStrForJSON(sanitizedValue); - traceOf_ << fmt::format(R"JSON( - "{}": {},)JSON", k, sanitizedValue); + traceOf_ << fmt::format( + R"JSON( + "{}": {},)JSON", + k, + sanitizedValue); } } void ChromeTraceLogger::handleTraceStart( const std::unordered_map& metadata) { - traceOf_ << fmt::format(R"JSON( + traceOf_ << fmt::format( + R"JSON( {{ - "schemaVersion": {},)JSON", kSchemaVersion); + "schemaVersion": {},)JSON", + kSchemaVersion); - traceOf_ << fmt::format(R"JSON( + traceOf_ << fmt::format( + R"JSON( "deviceProperties": [{} - ],)JSON", devicePropertiesJson()); + ],)JSON", + devicePropertiesJson()); metadataToJSON(metadata); traceOf_ << R"JSON( @@ -203,8 +211,8 @@ void ChromeTraceLogger::handleOverheadInfo( return; } - // TOOD: reserve pid = -1 for overhead but we need to rethink how to scale this for - // other metadata + // TOOD: reserve pid = -1 for overhead but we need to rethink how to scale + // this for other metadata // clang-format off time = transToRelativeTime(time); traceOf_ << fmt::format(R"JSON( @@ -233,7 +241,7 @@ void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { } uint64_t start = transToRelativeTime(span.startTime); - + // If endTime is 0 and start time is non-zero, dur can overflow. Add // a guard to prevent this. uint64_t dur = (span.endTime == 0) ? 0 : span.endTime - span.startTime; @@ -291,7 +299,8 @@ void ChromeTraceLogger::handleGenericInstantEvent( } uint64_t ts = transToRelativeTime(op.timestamp()); - traceOf_ << fmt::format(R"JSON( + traceOf_ << fmt::format( + R"JSON( {{ "ph": "i", "cat": "{}", "s": "t", "name": "{}", "pid": {}, "tid": {}, @@ -300,12 +309,16 @@ void ChromeTraceLogger::handleGenericInstantEvent( {} }} }},)JSON", - toString(op.type()), op.name(), op.deviceId(), op.resourceId(), - ts/1000, ts%1000, op.metadataJson()); + toString(op.type()), + op.name(), + op.deviceId(), + op.resourceId(), + ts / 1000, + ts % 1000, + op.metadataJson()); } -void ChromeTraceLogger::handleActivity( - const libkineto::ITraceActivity& op) { +void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) { if (!traceOf_) { return; } @@ -319,24 +332,26 @@ void ChromeTraceLogger::handleActivity( int64_t duration = op.duration(); if (duration < 0) { - // This should never happen but can occasionally suffer from regression in handling incomplete events. - // Having negative duration in Chrome trace can yield in very poor experience so add an extra guard - // before we generate trace events. + // This should never happen but can occasionally suffer from regression in + // handling incomplete events. Having negative duration in Chrome trace can + // yield in very poor experience so add an extra guard before we generate + // trace events. 
duration = 0; } - if (op.type() == ActivityType::GPU_USER_ANNOTATION) { + if (op.type() == ActivityType::GPU_USER_ANNOTATION) { // The GPU user annotations start at the same time as the // first associated GPU op. Since they appear later // in the trace file, this causes a visualization issue in Chrome. // Make it start one ns earlier and end 2 ns later. - ts-=1; - duration+=2; // Still need it to end at the original point rounded up. + ts -= 1; + duration += 2; // Still need it to end at the original point rounded up. } std::string arg_values = ""; if (op.linkedActivity() && op.linkedActivity()->correlationId() != 0) { - arg_values.append(fmt::format("\"External id\": {}", op.linkedActivity()->correlationId())); + arg_values.append(fmt::format( + "\"External id\": {}", op.linkedActivity()->correlationId())); } else if (op.correlationId() != 0) { arg_values.append(fmt::format("\"External id\": {}", op.correlationId())); } @@ -426,14 +441,14 @@ void ChromeTraceLogger::handleActivity( arg_values.append(fmt::format(", \"{}\": {}", kP2pSrc, srcRank)); } - - if (distInfo_.backend=="" && processGroupDesc=="\"default_pg\"") { + if (distInfo_.backend == "" && processGroupDesc == "\"default_pg\"") { distInfo_.backend = "nccl"; distInfo_.rank = collectiveRecord->getMetadataValue(kRank); distInfo_.world_size = groupSize; - // Not sure if we want to have output.json depend on nccl at compilation so - // set nccl_version to "unknown" for now until we can determine if we can pass - // it at runtime or use ifdefs. Should not be necessary to enable HTA + // Not sure if we want to have output.json depend on nccl at compilation + // so set nccl_version to "unknown" for now until we can determine if we + // can pass it at runtime or use ifdefs. Should not be necessary to enable + // HTA distInfo_.nccl_version = "unknown"; } auto pg_config = pgConfig(); @@ -443,15 +458,16 @@ void ChromeTraceLogger::handleActivity( pg_config.pg_size = groupSize; pg_config.ranks = groupRanks; pgMap.insert({processGroupName, pg_config}); - } std::string args = ""; if (!arg_values.empty()) { - args = fmt::format(R"JSON(, + args = fmt::format( + R"JSON(, "args": {{ {} - }})JSON", arg_values); + }})JSON", + arg_values); } int device = op.deviceId(); @@ -477,17 +493,14 @@ void ChromeTraceLogger::handleActivity( void ChromeTraceLogger::handleGenericActivity( const libkineto::GenericTraceActivity& op) { - handleActivity(op); + handleActivity(op); } void ChromeTraceLogger::handleGenericLink(const ITraceActivity& act) { static struct { int type; char name[16]; - } flow_names[] = { - {kLinkFwdBwd, "fwdbwd"}, - {kLinkAsyncCpuGpu, "ac2g"} - }; + } flow_names[] = {{kLinkFwdBwd, "fwdbwd"}, {kLinkAsyncCpuGpu, "ac2g"}}; for (auto& flow : flow_names) { if (act.flowType() == flow.type) { // Link the activities via flow ID in source and destination. 
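(Context for the addOnDemandDistMetadata hunk that follows: it writes a "distributedInfo" block into the trace's top-level metadata, built from the process-group records collected in handleActivity above. The sketch below is only an illustration of the emitted fragment, assuming a single default process group on a hypothetical 8-rank job; the real pg_name/pg_desc/ranks values come from the recorded pg metadata, backend_config here is a made-up placeholder, and nccl_version is hard-coded to "unknown" as noted in the comment above.)

  "distributedInfo": {"backend": "nccl", "rank": 0, "world_size": 8, "pg_count": 1,
    "pg_config": [{"pg_name": "0", "pg_desc": "default_pg", "backend_config": "cuda:nccl",
                   "pg_size": 8, "ranks": [0, 1, 2, 3, 4, 5, 6, 7]}],
    "nccl_version": "unknown"},

(The trailing comma is intentional: the fragment is emitted as one entry inside the larger metadata object, and the seekp(-1) call in the hunk drops the comma left after the last pg_config entry.)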
@@ -540,17 +553,27 @@ void ChromeTraceLogger::addOnDemandDistMetadata() { if (distInfo_.backend == "") { return; } - traceOf_ << fmt::format(R"JSON( + traceOf_ << fmt::format( + R"JSON( "distributedInfo": {{"backend": "{}", "rank": {}, "world_size": {}, "pg_count": {}, "pg_config": [)JSON", - distInfo_.backend, distInfo_.rank, distInfo_.world_size, std::to_string(pgMap.size())); - - for (const auto& element : pgMap) { - traceOf_ << fmt::format(R"JSON({{"pg_name": {}, "pg_desc": {}, "backend_config": "{}", "pg_size": {}, "ranks": {}}},)JSON", - element.second.pg_name, element.second.pg_desc, element.second.backend_config, element.second.pg_size, element.second.ranks); - } - traceOf_.seekp(-1, std::ios_base::end); - traceOf_ << fmt::format(R"JSON(], "nccl_version": "{}"}},)JSON", distInfo_.nccl_version); - distInfo_.distInfo_present_ = true; + distInfo_.backend, + distInfo_.rank, + distInfo_.world_size, + std::to_string(pgMap.size())); + + for (const auto& element : pgMap) { + traceOf_ << fmt::format( + R"JSON({{"pg_name": {}, "pg_desc": {}, "backend_config": "{}", "pg_size": {}, "ranks": {}}},)JSON", + element.second.pg_name, + element.second.pg_desc, + element.second.backend_config, + element.second.pg_size, + element.second.ranks); + } + traceOf_.seekp(-1, std::ios_base::end); + traceOf_ << fmt::format( + R"JSON(], "nccl_version": "{}"}},)JSON", distInfo_.nccl_version); + distInfo_.distInfo_present_ = true; } void ChromeTraceLogger::finalizeTrace( diff --git a/libkineto/src/output_json.h b/libkineto/src/output_json.h index 58f7939d3..8fb595398 100644 --- a/libkineto/src/output_json.h +++ b/libkineto/src/output_json.h @@ -19,21 +19,20 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ActivityBuffers.h" #include "GenericTraceActivity.h" #include "output_base.h" -#include "ActivityBuffers.h" #include "time_since_epoch.h" namespace KINETO_NAMESPACE { - // Previous declaration of TraceSpan is struct. Must match the same here. - struct TraceSpan; -} +// Previous declaration of TraceSpan is struct. Must match the same here. +struct TraceSpan; +} // namespace KINETO_NAMESPACE namespace KINETO_NAMESPACE { class Config; - struct pgConfig { pgConfig() = default; std::string pg_name{""}; @@ -41,7 +40,6 @@ struct pgConfig { std::string backend_config{""}; std::string pg_size{""}; std::string ranks{""}; - }; struct DistributedInfo { @@ -61,9 +59,7 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { // Note: the caller of these functions should handle concurrency // i.e., we these functions are not thread-safe - void handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) override; + void handleDeviceInfo(const DeviceInfo& info, uint64_t time) override; void handleOverheadInfo(const OverheadInfo& info, int64_t time) override; @@ -81,7 +77,8 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { const Config& config, std::unique_ptr buffers, int64_t endTime, - std::unordered_map>& metadata) override; + std::unordered_map>& metadata) + override; std::string traceFileName() const { return fileName_; @@ -93,7 +90,6 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { std::unordered_map>& metadata); private: - // Create a flow event (arrow) void handleLink( char type, @@ -120,19 +116,19 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { std::string tempFileName_; std::ofstream traceOf_; DistributedInfo distInfo_ = DistributedInfo(); - // Map of all observed process groups to their configs in trace. 
Key is pg_name, - // value is pgConfig that will be used to populate pg_config in + // Map of all observed process groups to their configs in trace. Key is + // pg_name, value is pgConfig that will be used to populate pg_config in // distributedInfo of trace std::unordered_map pgMap = {}; }; -//std::chrono header start +// std::chrono header start #ifdef _GLIBCXX_USE_C99_STDINT_TR1 -# define _KINETO_GLIBCXX_CHRONO_INT64_T int64_t +#define _KINETO_GLIBCXX_CHRONO_INT64_T int64_t #elif defined __INT64_TYPE__ -# define _KINETO_GLIBCXX_CHRONO_INT64_T __INT64_TYPE__ +#define _KINETO_GLIBCXX_CHRONO_INT64_T __INT64_TYPE__ #else -# define _KINETO_GLIBCXX_CHRONO_INT64_T long long +#define _KINETO_GLIBCXX_CHRONO_INT64_T long long #endif // std::chrono header end @@ -143,8 +139,8 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { // 3 months intervals, so we can still collect traces across ranks relative // to each other. // A month is 2629746, so 3 months is 7889238. -using _trimonths = std::chrono::duration< - _KINETO_GLIBCXX_CHRONO_INT64_T, std::ratio<7889238>>; +using _trimonths = + std::chrono::duration<_KINETO_GLIBCXX_CHRONO_INT64_T, std::ratio<7889238>>; #undef _GLIBCXX_CHRONO_INT64_T class ChromeTraceBaseTime { diff --git a/libkineto/src/output_membuf.h b/libkineto/src/output_membuf.h index 206167093..15fc38c58 100644 --- a/libkineto/src/output_membuf.h +++ b/libkineto/src/output_membuf.h @@ -15,10 +15,10 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ActivityBuffers.h" #include "Config.h" #include "GenericTraceActivity.h" #include "output_base.h" -#include "ActivityBuffers.h" namespace KINETO_NAMESPACE { @@ -32,9 +32,7 @@ class MemoryTraceLogger : public ActivityLogger { // Note: the caller of these functions should handle concurrency // i.e., these functions are not thread-safe - void handleDeviceInfo( - const DeviceInfo& info, - uint64_t time) override { + void handleDeviceInfo(const DeviceInfo& info, uint64_t time) override { deviceInfoList_.emplace_back(info, time); } @@ -48,7 +46,7 @@ class MemoryTraceLogger : public ActivityLogger { // Handled separately } - template + template void addActivityWrapper(const T& act) { wrappers_.push_back(std::make_unique(act)); activities_.push_back(wrappers_.back().get()); @@ -72,7 +70,8 @@ class MemoryTraceLogger : public ActivityLogger { const Config& config, std::unique_ptr buffers, int64_t endTime, - std::unordered_map>& metadata) override { + std::unordered_map>& metadata) + override { buffers_ = std::move(buffers); endTime_ = endTime; } @@ -111,8 +110,8 @@ class MemoryTraceLogger : public ActivityLogger { std::shared_ptr getChromeLogger() { return chrome_logger_; } - private: + private: std::unique_ptr config_; // Optimization: Remove unique_ptr by keeping separate vector per type std::vector activities_; diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp index 63b320ad1..cdb5f676b 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp @@ -50,10 +50,8 @@ void XpuptiActivityApi::popCorrelationID(CorrelationFlowType type) { #endif } -static bool nextActivityRecord( - uint8_t* buffer, - size_t valid_size, - Pti_Activity*& record) { +static bool +nextActivityRecord(uint8_t* buffer, size_t valid_size, Pti_Activity*& record) { #ifdef HAS_XPUPTI pti_result status = ptiViewGetNextRecord(buffer, valid_size, &record); if (status != pti_result::PTI_SUCCESS) { @@ -67,7 +65,9 
@@ void XpuptiActivityApi::setMaxBufferSize(int size) { maxGpuBufferCount_ = 1 + size / kBufSize; } -void XpuptiActivityApi::bufferRequestedTrampoline(uint8_t** buffer, size_t* size) { +void XpuptiActivityApi::bufferRequestedTrampoline( + uint8_t** buffer, + size_t* size) { singleton().bufferRequested(buffer, size); } diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityApi.h b/libkineto/src/plugin/xpupti/XpuptiActivityApi.h index 676fbd89e..b16f8cac5 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityApi.h +++ b/libkineto/src/plugin/xpupti/XpuptiActivityApi.h @@ -27,7 +27,8 @@ class XpuptiActivityApi { static void pushCorrelationID(int id, CorrelationFlowType type); static void popCorrelationID(CorrelationFlowType type); - void enableXpuptiActivities(const std::set& selected_activities); + void enableXpuptiActivities( + const std::set& selected_activities); void disablePtiActivities(const std::set& selected_activities); void clearActivities(); @@ -57,10 +58,8 @@ class XpuptiActivityApi { size_t validSize, std::function handler); static void bufferRequestedTrampoline(uint8_t** buffer, size_t* size); - static void bufferCompletedTrampoline( - uint8_t* buffer, - size_t size, - size_t validSize); + static void + bufferCompletedTrampoline(uint8_t* buffer, size_t size, size_t validSize); protected: void bufferRequested(uint8_t** buffer, size_t* size); diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp index 3003f4d3a..10dfb73a9 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp @@ -15,7 +15,8 @@ void XpuptiActivityProfilerSession::removeCorrelatedPtiActivities( return; } -void XpuptiActivityProfilerSession::checkTimestampOrder(const ITraceActivity* act1) { +void XpuptiActivityProfilerSession::checkTimestampOrder( + const ITraceActivity* act1) { const auto& it = correlatedPtiActivities_.find(act1->correlationId()); if (it == correlatedPtiActivities_.end()) { correlatedPtiActivities_.insert({act1->correlationId(), act1}); @@ -38,7 +39,8 @@ void XpuptiActivityProfilerSession::checkTimestampOrder(const ITraceActivity* ac } } -inline bool XpuptiActivityProfilerSession::outOfRange(const ITraceActivity& act) { +inline bool XpuptiActivityProfilerSession::outOfRange( + const ITraceActivity& act) { bool out_of_range = act.timestamp() < captureWindowStartTime_ || (act.timestamp() + act.duration()) > captureWindowEndTime_; if (out_of_range) { @@ -103,7 +105,9 @@ void XpuptiActivityProfilerSession::handleRuntimeActivity( const ITraceActivity* linked = linkedActivity(activity->_correlation_id, cpuCorrelationMap_); traceBuffer_.emplace_activity( - traceBuffer_.span, ActivityType::XPU_RUNTIME, std::string(activity->_name)); + traceBuffer_.span, + ActivityType::XPU_RUNTIME, + std::string(activity->_name)); auto& runtime_activity = traceBuffer_.activities.back(); runtime_activity->startTime = activity->_start_timestamp; runtime_activity->endTime = activity->_end_timestamp; @@ -113,9 +117,11 @@ void XpuptiActivityProfilerSession::handleRuntimeActivity( runtime_activity->threadId = activity->_thread_id; runtime_activity->flow.id = activity->_correlation_id; runtime_activity->flow.type = libkineto::kLinkAsyncCpuGpu; - runtime_activity->flow.start = bool(std::find(correlateRuntimeOps_.begin(), - correlateRuntimeOps_.end(), - runtime_activity->name()) != correlateRuntimeOps_.end()); + runtime_activity->flow.start = bool( + std::find( + 
correlateRuntimeOps_.begin(), + correlateRuntimeOps_.end(), + runtime_activity->name()) != correlateRuntimeOps_.end()); runtime_activity->linked = linked; runtime_activity->addMetadata("correlation", activity->_correlation_id); diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp index c8a0ea33a..e722e49ae 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.cpp @@ -1,5 +1,5 @@ -#include "XpuptiActivityApi.h" #include "XpuptiActivityProfiler.h" +#include "XpuptiActivityApi.h" #include @@ -8,16 +8,16 @@ namespace KINETO_NAMESPACE { uint32_t XpuptiActivityProfilerSession::iterationCount_ = 0; std::vector> XpuptiActivityProfilerSession::deviceUUIDs_ = {}; -std::vector - XpuptiActivityProfilerSession::correlateRuntimeOps_ = {"piextUSMEnqueueFill", - "piextUSMEnqueueFill2D", - "piextUSMEnqueueMemcpy", - "piextUSMEnqueueMemset", - "piextUSMEnqueueMemcpy2D", - "piextUSMEnqueueMemset2D", - "piEnqueueKernelLaunch", - "piextEnqueueKernelLaunchCustom", - "piextEnqueueCooperativeKernelLaunch"}; +std::vector XpuptiActivityProfilerSession::correlateRuntimeOps_ = { + "piextUSMEnqueueFill", + "piextUSMEnqueueFill2D", + "piextUSMEnqueueMemcpy", + "piextUSMEnqueueMemset", + "piextUSMEnqueueMemcpy2D", + "piextUSMEnqueueMemset2D", + "piEnqueueKernelLaunch", + "piextEnqueueKernelLaunchCustom", + "piextEnqueueCooperativeKernelLaunch"}; // =========== Session Constructor ============= // XpuptiActivityProfilerSession::XpuptiActivityProfilerSession( @@ -73,18 +73,18 @@ void XpuptiActivityProfilerSession::processTrace( processTrace(logger); } -std::unique_ptr XpuptiActivityProfilerSession:: - getDeviceInfo() { +std::unique_ptr +XpuptiActivityProfilerSession::getDeviceInfo() { return {}; } -std::vector XpuptiActivityProfilerSession:: - getResourceInfos() { +std::vector +XpuptiActivityProfilerSession::getResourceInfos() { return {}; } -std::unique_ptr XpuptiActivityProfilerSession:: - getTraceBuffer() { +std::unique_ptr +XpuptiActivityProfilerSession::getTraceBuffer() { return std::make_unique(std::move(traceBuffer_)); } @@ -165,20 +165,20 @@ const std::set& XPUActivityProfiler::availableActivities() const { return kXpuTypes; } -std::unique_ptr XPUActivityProfiler:: - configure( - const std::set& activity_types, - const libkineto::Config& config) { +std::unique_ptr +XPUActivityProfiler::configure( + const std::set& activity_types, + const libkineto::Config& config) { return std::make_unique( XpuptiActivityApi::singleton(), config, activity_types); } -std::unique_ptr XPUActivityProfiler:: - configure( - int64_t ts_ms, - int64_t duration_ms, - const std::set& activity_types, - const libkineto::Config& config) { +std::unique_ptr +XPUActivityProfiler::configure( + int64_t ts_ms, + int64_t duration_ms, + const std::set& activity_types, + const libkineto::Config& config) { AsyncProfileStartTime_ = ts_ms; AsyncProfileEndTime_ = ts_ms + duration_ms; return configure(activity_types, config); diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h index 86a75e7a5..4a939931b 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h +++ b/libkineto/src/plugin/xpupti/XpuptiActivityProfiler.h @@ -7,7 +7,8 @@ namespace KINETO_NAMESPACE { -class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession { +class XpuptiActivityProfilerSession + : public libkineto::IActivityProfilerSession { public: 
XpuptiActivityProfilerSession() = delete; XpuptiActivityProfilerSession( @@ -15,8 +16,8 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession const libkineto::Config& config, const std::set& activity_types); XpuptiActivityProfilerSession(const XpuptiActivityProfilerSession&) = delete; - XpuptiActivityProfilerSession& operator=(const XpuptiActivityProfilerSession&) = - delete; + XpuptiActivityProfilerSession& operator=( + const XpuptiActivityProfilerSession&) = delete; ~XpuptiActivityProfilerSession(); @@ -65,7 +66,9 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession void handleOverheadActivity( const pti_view_record_overhead* activity, ActivityLogger* logger); - void handlePtiActivity(const pti_view_record_base* record, ActivityLogger* logger); + void handlePtiActivity( + const pti_view_record_base* record, + ActivityLogger* logger); // enumerate XPU Device UUIDs from runtime for once void enumDeviceUUIDs(); diff --git a/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h b/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h index ab4cac151..7cdedcca6 100644 --- a/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h +++ b/libkineto/src/plugin/xpupti/XpuptiProfilerMacros.h @@ -12,17 +12,17 @@ namespace KINETO_NAMESPACE { using namespace libkineto; #if PTI_VERSION_MAJOR > 0 || PTI_VERSION_MINOR > 9 -#define XPUPTI_CALL(returnCode) \ - { \ - if (returnCode != PTI_SUCCESS) { \ - std::string funcMsg(__func__); \ - std::string codeMsg = std::to_string(returnCode); \ - std::string HeadMsg("Kineto Profiler on XPU got error from function "); \ - std::string Msg(". The error code is "); \ - std::string detailMsg(". The detailed error message is "); \ - detailMsg = detailMsg + std::string(ptiResultTypeToString(returnCode)); \ - throw std::runtime_error(HeadMsg + funcMsg + Msg + codeMsg + detailMsg); \ - } \ +#define XPUPTI_CALL(returnCode) \ + { \ + if (returnCode != PTI_SUCCESS) { \ + std::string funcMsg(__func__); \ + std::string codeMsg = std::to_string(returnCode); \ + std::string HeadMsg("Kineto Profiler on XPU got error from function "); \ + std::string Msg(". The error code is "); \ + std::string detailMsg(". 
The detailed error message is "); \ + detailMsg = detailMsg + std::string(ptiResultTypeToString(returnCode)); \ + throw std::runtime_error(HeadMsg + funcMsg + Msg + codeMsg + detailMsg); \ + } \ } #else #define XPUPTI_CALL(returnCode) \ diff --git a/libkineto/stress_test/kineto_stress_test.cpp b/libkineto/stress_test/kineto_stress_test.cpp index 6871b294f..3267e11f9 100644 --- a/libkineto/stress_test/kineto_stress_test.cpp +++ b/libkineto/stress_test/kineto_stress_test.cpp @@ -9,8 +9,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -18,24 +18,26 @@ #include #include -#include #include #include +#include #include #include -#include "mpi.h" +#include +#include +#include "kineto/libkineto/fb/nccl_profiler/NcclProfiler.h" #include "kineto/libkineto/stress_test/random_ops_stress_test.cuh" #include "kineto/libkineto/stress_test/tensor_cache.cuh" #include "kineto/libkineto/stress_test/utils.h" -#include "kineto/libkineto/fb/nccl_profiler/NcclProfiler.h" -#include -#include +#include "mpi.h" using namespace kineto_stress_test; -void read_inputs_from_json(std::string sJsonFile, stress_test_args *test_args, - tensor_cache_args *cache_args) { +void read_inputs_from_json( + std::string sJsonFile, + stress_test_args* test_args, + tensor_cache_args* cache_args) { std::ifstream fJson(sJsonFile.c_str()); std::string sJson; @@ -51,67 +53,85 @@ void read_inputs_from_json(std::string sJsonFile, stress_test_args *test_args, folly::dynamic sJsonParsed = folly::parseJson(sJson); folly::dynamic jsonTestArgs = sJsonParsed["test_args"]; - test_args->num_operations = (uint32_t)jsonTestArgs["num_operations"].asInt(); - test_args->num_cuda_streams = (uint32_t)jsonTestArgs["num_cuda_streams"].asInt(); - test_args->prob_cuda_malloc = (double)jsonTestArgs["prob_cuda_malloc"].asDouble(); - test_args->min_iters_kernel = (uint32_t)jsonTestArgs["min_iters_kernel"].asInt(); - test_args->max_iters_kernel = (uint32_t)jsonTestArgs["max_iters_kernel"].asInt(); + test_args->num_operations = + (uint32_t)jsonTestArgs["num_operations"].asInt(); + test_args->num_cuda_streams = + (uint32_t)jsonTestArgs["num_cuda_streams"].asInt(); + test_args->prob_cuda_malloc = + (double)jsonTestArgs["prob_cuda_malloc"].asDouble(); + test_args->min_iters_kernel = + (uint32_t)jsonTestArgs["min_iters_kernel"].asInt(); + test_args->max_iters_kernel = + (uint32_t)jsonTestArgs["max_iters_kernel"].asInt(); test_args->memset_prob = (double)jsonTestArgs["memset_prob"].asDouble(); test_args->min_idle_us = (uint32_t)jsonTestArgs["min_idle_us"].asInt(); test_args->max_idle_us = (uint32_t)jsonTestArgs["max_idle_us"].asInt(); - test_args->simulate_host_time = (bool)jsonTestArgs["simulate_host_time"].asBool(); + test_args->simulate_host_time = + (bool)jsonTestArgs["simulate_host_time"].asBool(); test_args->num_workers = (uint32_t)jsonTestArgs["num_workers"].asInt(); test_args->use_uvm_buffers = (bool)jsonTestArgs["use_uvm_buffers"].asBool(); - test_args->uvm_kernel_prob = (double)jsonTestArgs["uvm_kernel_prob"].asDouble(); - test_args->parallel_uvm_alloc = (bool)jsonTestArgs["parallel_uvm_alloc"].asBool(); + test_args->uvm_kernel_prob = + (double)jsonTestArgs["uvm_kernel_prob"].asDouble(); + test_args->parallel_uvm_alloc = + (bool)jsonTestArgs["parallel_uvm_alloc"].asBool(); test_args->uvm_len = (uint64_t)jsonTestArgs["uvm_len"].asDouble(); test_args->is_multi_rank = (bool)jsonTestArgs["is_multi_rank"].asBool(); - test_args->sz_nccl_buff_KB = (uint32_t)jsonTestArgs["sz_nccl_buff_KB"].asInt(); - 
test_args->num_iters_nccl_sync = (uint32_t)jsonTestArgs["num_iters_nccl_sync"].asInt(); - test_args->pre_alloc_streams = (bool)jsonTestArgs["pre_alloc_streams"].asBool(); - test_args->use_memcpy_stream = (bool)jsonTestArgs["use_memcpy_stream"].asBool(); + test_args->sz_nccl_buff_KB = + (uint32_t)jsonTestArgs["sz_nccl_buff_KB"].asInt(); + test_args->num_iters_nccl_sync = + (uint32_t)jsonTestArgs["num_iters_nccl_sync"].asInt(); + test_args->pre_alloc_streams = + (bool)jsonTestArgs["pre_alloc_streams"].asBool(); + test_args->use_memcpy_stream = + (bool)jsonTestArgs["use_memcpy_stream"].asBool(); test_args->use_uvm_stream = (bool)jsonTestArgs["use_uvm_stream"].asBool(); - test_args->monitor_mem_usage = (bool)jsonTestArgs["monitor_mem_usage"].asBool(); - test_args->trace_length_us = (uint32_t)jsonTestArgs["trace_length_us"].asInt(); - test_args->cupti_buffer_mb = (uint32_t)jsonTestArgs["cupti_buffer_mb"].asInt(); + test_args->monitor_mem_usage = + (bool)jsonTestArgs["monitor_mem_usage"].asBool(); + test_args->trace_length_us = + (uint32_t)jsonTestArgs["trace_length_us"].asInt(); + test_args->cupti_buffer_mb = + (uint32_t)jsonTestArgs["cupti_buffer_mb"].asInt(); folly::dynamic cacheArgs = sJsonParsed["cache_args"]; cache_args->sz_cache_KB = (uint32_t)cacheArgs["sz_cache_KB"].asInt(); - cache_args->sz_GPU_memory_KB = (uint32_t)cacheArgs["sz_GPU_memory_KB"].asInt(); - cache_args->sz_min_tensor_KB = (uint32_t)cacheArgs["sz_min_tensor_KB"].asInt(); - cache_args->sz_max_tensor_KB = (uint32_t)cacheArgs["sz_max_tensor_KB"].asInt(); + cache_args->sz_GPU_memory_KB = + (uint32_t)cacheArgs["sz_GPU_memory_KB"].asInt(); + cache_args->sz_min_tensor_KB = + (uint32_t)cacheArgs["sz_min_tensor_KB"].asInt(); + cache_args->sz_max_tensor_KB = + (uint32_t)cacheArgs["sz_max_tensor_KB"].asInt(); cache_args->prob_h2d = (double)cacheArgs["prob_h2d"].asDouble(); cache_args->prob_d2h = (double)cacheArgs["prob_d2h"].asDouble(); cache_args->num_increments = (uint32_t)cacheArgs["num_increments"].asInt(); - cache_args->num_pairs_per_increment = (uint32_t)cacheArgs["num_pairs_per_increment"].asInt(); + cache_args->num_pairs_per_increment = + (uint32_t)cacheArgs["num_pairs_per_increment"].asInt(); } else { printf("Reading input %s failed!.\n", sJsonFile.c_str()); } } -void trace_collection_thread(uint32_t trace_length_us, - uint32_t cupti_buffer_mb) { +void trace_collection_thread( + uint32_t trace_length_us, + uint32_t cupti_buffer_mb) { c10::ApproximateClockToUnixTimeConverter clockConverter; - if (cupti_buffer_mb > 0) { // Configure CUPTI buffer sizes size_t attrValue = 0, attrValueSize = sizeof(size_t); attrValue = (size_t)(cupti_buffer_mb * 1024 * 1024); - cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, - &attrValueSize, &attrValue); + cuptiActivitySetAttribute( + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue); } // Config Kineto std::set types = { - libkineto::ActivityType::CONCURRENT_KERNEL, - libkineto::ActivityType::GPU_MEMCPY, - libkineto::ActivityType::GPU_MEMSET, - libkineto::ActivityType::CUDA_RUNTIME, - libkineto::ActivityType::EXTERNAL_CORRELATION, - libkineto::ActivityType::OVERHEAD, - libkineto::ActivityType::COLLECTIVE_COMM - }; + libkineto::ActivityType::CONCURRENT_KERNEL, + libkineto::ActivityType::GPU_MEMCPY, + libkineto::ActivityType::GPU_MEMSET, + libkineto::ActivityType::CUDA_RUNTIME, + libkineto::ActivityType::EXTERNAL_CORRELATION, + libkineto::ActivityType::OVERHEAD, + libkineto::ActivityType::COLLECTIVE_COMM}; auto& profiler = libkineto::api().activityProfiler(); 
libkineto::api().initProfilerIfRegistered(); profiler.prepareTrace(types); @@ -130,30 +150,64 @@ void trace_collection_thread(uint32_t trace_length_us, trace->save(kTraceFile); } -void uvm_allocation_thread(stress_test_args *test_args) { - uint64_t alloc_size = test_args->uvm_len * sizeof(float); - - std::cout << "UVM is used. Allocation size (MB) = " << 2 * alloc_size / (1024 * 1024) << std::endl; - int currentDevice = 0; - checkCudaStatus(cudaGetDevice(¤tDevice), __LINE__); - checkCudaStatus(cudaMallocManaged((void**)&test_args->uvm_a, alloc_size, cudaMemAttachGlobal), __LINE__); - checkCudaStatus(cudaMallocManaged((void**)&test_args->uvm_b, alloc_size, cudaMemAttachGlobal), __LINE__); - checkCudaStatus(cudaMemAdvise((void*)test_args->uvm_a, alloc_size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), __LINE__); - checkCudaStatus(cudaMemAdvise((void*)test_args->uvm_b, alloc_size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), __LINE__); - checkCudaStatus(cudaMemAdvise((void*)test_args->uvm_a, alloc_size, cudaMemAdviseSetAccessedBy, currentDevice), __LINE__); - checkCudaStatus(cudaMemAdvise((void*)test_args->uvm_b, alloc_size, cudaMemAdviseSetAccessedBy, currentDevice), __LINE__); - std::cout << "UVM buffers allocated. Initializing them with values." << std::endl; - - // Put a bunch of non-zero values into the UVM buffers - srand(time(nullptr)); - for (uint64_t i = 0; i < 32 * 128 * 1024; ++i) { - uint64_t idx_a = rand() % test_args->uvm_len; - uint64_t idx_b = rand() % test_args->uvm_len; - test_args->uvm_a[idx_a] = static_cast (rand()) / static_cast (RAND_MAX); - test_args->uvm_b[idx_b] = static_cast (rand()) / static_cast (RAND_MAX); - } +void uvm_allocation_thread(stress_test_args* test_args) { + uint64_t alloc_size = test_args->uvm_len * sizeof(float); + + std::cout << "UVM is used. Allocation size (MB) = " + << 2 * alloc_size / (1024 * 1024) << std::endl; + int currentDevice = 0; + checkCudaStatus(cudaGetDevice(¤tDevice), __LINE__); + checkCudaStatus( + cudaMallocManaged( + (void**)&test_args->uvm_a, alloc_size, cudaMemAttachGlobal), + __LINE__); + checkCudaStatus( + cudaMallocManaged( + (void**)&test_args->uvm_b, alloc_size, cudaMemAttachGlobal), + __LINE__); + checkCudaStatus( + cudaMemAdvise( + (void*)test_args->uvm_a, + alloc_size, + cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId), + __LINE__); + checkCudaStatus( + cudaMemAdvise( + (void*)test_args->uvm_b, + alloc_size, + cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId), + __LINE__); + checkCudaStatus( + cudaMemAdvise( + (void*)test_args->uvm_a, + alloc_size, + cudaMemAdviseSetAccessedBy, + currentDevice), + __LINE__); + checkCudaStatus( + cudaMemAdvise( + (void*)test_args->uvm_b, + alloc_size, + cudaMemAdviseSetAccessedBy, + currentDevice), + __LINE__); + std::cout << "UVM buffers allocated. Initializing them with values." + << std::endl; + + // Put a bunch of non-zero values into the UVM buffers + srand(time(nullptr)); + for (uint64_t i = 0; i < 32 * 128 * 1024; ++i) { + uint64_t idx_a = rand() % test_args->uvm_len; + uint64_t idx_b = rand() % test_args->uvm_len; + test_args->uvm_a[idx_a] = + static_cast(rand()) / static_cast(RAND_MAX); + test_args->uvm_b[idx_b] = + static_cast(rand()) / static_cast(RAND_MAX); + } - std::cout << "UVM buffers initialized." << std::endl; + std::cout << "UVM buffers initialized." 
<< std::endl; } void run_parallel_stress_test(stress_test_args test_args) { @@ -161,7 +215,8 @@ void run_parallel_stress_test(stress_test_args test_args) { if (test_args.num_workers > 1) { v_workers.reserve(test_args.num_workers); for (int i = 0; i < test_args.num_workers; ++i) { - v_workers.push_back(std::thread(run_stress_test, i, test_args.num_workers, test_args)); + v_workers.push_back( + std::thread(run_stress_test, i, test_args.num_workers, test_args)); } for (auto& t : v_workers) { t.join(); @@ -173,38 +228,60 @@ void run_parallel_stress_test(stress_test_args test_args) { } void create_cuda_streams(stress_test_args& test_args) { - test_args.compute_streams = (cudaStream_t*)malloc(test_args.num_cuda_streams * - test_args.num_workers * sizeof(cudaStream_t)); - for (uint32_t i = 0; i < test_args.num_cuda_streams * test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamCreateWithFlags(test_args.compute_streams + i, cudaStreamNonBlocking), __LINE__); + test_args.compute_streams = (cudaStream_t*)malloc( + test_args.num_cuda_streams * test_args.num_workers * + sizeof(cudaStream_t)); + for (uint32_t i = 0; i < test_args.num_cuda_streams * test_args.num_workers; + ++i) { + checkCudaStatus( + cudaStreamCreateWithFlags( + test_args.compute_streams + i, cudaStreamNonBlocking), + __LINE__); } if (test_args.use_memcpy_stream) { - test_args.memcpy_streams = (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); + test_args.memcpy_streams = + (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_workers; ++i) { - // We want to test the effect of CUDA stream priorities on the order of memory transfers. + // We want to test the effect of CUDA stream priorities on the order of + // memory transfers. 
if (i % 2 != 0) { - checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__); + checkCudaStatus( + cudaStreamCreateWithFlags( + test_args.memcpy_streams + i, cudaStreamNonBlocking), + __LINE__); } else { int leastPriority = 0; int greatestPriority = 0; - checkCudaStatus(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority), __LINE__); - checkCudaStatus(cudaStreamCreateWithPriority(test_args.memcpy_streams + i, cudaStreamNonBlocking, leastPriority), __LINE__); + checkCudaStatus( + cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority), + __LINE__); + checkCudaStatus( + cudaStreamCreateWithPriority( + test_args.memcpy_streams + i, + cudaStreamNonBlocking, + leastPriority), + __LINE__); } } } if (test_args.use_uvm_stream) { - test_args.uvm_streams = (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); + test_args.uvm_streams = + (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamCreateWithFlags(test_args.uvm_streams + i, cudaStreamNonBlocking), __LINE__); + checkCudaStatus( + cudaStreamCreateWithFlags( + test_args.uvm_streams + i, cudaStreamNonBlocking), + __LINE__); } } } void cleanup_cuda_streams(stress_test_args& test_args) { for (int i = 0; i < test_args.num_cuda_streams * test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamSynchronize(test_args.compute_streams[i]), __LINE__); + checkCudaStatus( + cudaStreamSynchronize(test_args.compute_streams[i]), __LINE__); checkCudaStatus(cudaStreamDestroy(test_args.compute_streams[i]), __LINE__); } @@ -214,7 +291,8 @@ void cleanup_cuda_streams(stress_test_args& test_args) { if (test_args.use_memcpy_stream) { for (int i = 0; i < test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamSynchronize(test_args.memcpy_streams[i]), __LINE__); + checkCudaStatus( + cudaStreamSynchronize(test_args.memcpy_streams[i]), __LINE__); checkCudaStatus(cudaStreamDestroy(test_args.memcpy_streams[i]), __LINE__); } @@ -225,7 +303,8 @@ void cleanup_cuda_streams(stress_test_args& test_args) { if (test_args.use_uvm_stream) { for (int i = 0; i < test_args.num_workers; ++i) { - checkCudaStatus(cudaStreamSynchronize(test_args.uvm_streams[i]), __LINE__); + checkCudaStatus( + cudaStreamSynchronize(test_args.uvm_streams[i]), __LINE__); checkCudaStatus(cudaStreamDestroy(test_args.uvm_streams[i]), __LINE__); } @@ -235,8 +314,7 @@ void cleanup_cuda_streams(stress_test_args& test_args) { } } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { ///////////////////////////////////////////////////////////////////////////// // Read test configuration ///////////////////////////////////////////////////////////////////////////// @@ -265,7 +343,9 @@ int main(int argc, char *argv[]) { test_args.num_workers = 1; test_args.num_ranks = num_ranks; test_args.rank = rank; - std::cout << "When running in multi-rank mode, only a single worker can be used!" << std::endl; + std::cout + << "When running in multi-rank mode, only a single worker can be used!" 
+ << std::endl; } else { test_args.rank = 0; test_args.num_ranks = 1; @@ -283,17 +363,25 @@ int main(int argc, char *argv[]) { if (test_args.rank == 0) { NCCLCHECK(ncclGetUniqueId(&nccl_id)); } - MPICHECK(MPI_Bcast((void *)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); + MPICHECK(MPI_Bcast( + (void*)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); // Create the communicator - NCCLCHECK(ncclCommInitRank(&nccl_communicator, test_args.num_ranks, nccl_id, test_args.rank)); + NCCLCHECK(ncclCommInitRank( + &nccl_communicator, test_args.num_ranks, nccl_id, test_args.rank)); // Allocate memory for the buffers - checkCudaStatus(cudaMalloc(&pBuffNCCLSend, test_args.sz_nccl_buff_KB * 1024), __LINE__); - checkCudaStatus(cudaMalloc(&pBuffNCCLRecv, test_args.sz_nccl_buff_KB * 1024), __LINE__); - checkCudaStatus(cudaMemset(pBuffNCCLSend, 0, test_args.sz_nccl_buff_KB * 1024), __LINE__); - checkCudaStatus(cudaMemset(pBuffNCCLRecv, 0, test_args.sz_nccl_buff_KB * 1024), __LINE__); + checkCudaStatus( + cudaMalloc(&pBuffNCCLSend, test_args.sz_nccl_buff_KB * 1024), __LINE__); + checkCudaStatus( + cudaMalloc(&pBuffNCCLRecv, test_args.sz_nccl_buff_KB * 1024), __LINE__); + checkCudaStatus( + cudaMemset(pBuffNCCLSend, 0, test_args.sz_nccl_buff_KB * 1024), + __LINE__); + checkCudaStatus( + cudaMemset(pBuffNCCLRecv, 0, test_args.sz_nccl_buff_KB * 1024), + __LINE__); // Make sure all the processes have allocated this memory MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); @@ -310,17 +398,20 @@ int main(int argc, char *argv[]) { uvm_init_thread = std::thread(uvm_allocation_thread, &test_args); } else { uvm_allocation_thread(&test_args); - std::cout << "Rank " << test_args.rank << " finished UVM init." << std::endl; + std::cout << "Rank " << test_args.rank << " finished UVM init." + << std::endl; } } generate_tensor_cache(cache_args); - std::cout << "Rank " << test_args.rank << " generating tensor cache completed." << std::endl; + std::cout << "Rank " << test_args.rank + << " generating tensor cache completed." << std::endl; if (test_args.use_uvm_buffers) { if (test_args.parallel_uvm_alloc) { uvm_init_thread.join(); - std::cout << "Rank " << test_args.rank << " finished UVM init." << std::endl; + std::cout << "Rank " << test_args.rank << " finished UVM init." + << std::endl; } } @@ -346,8 +437,9 @@ int main(int argc, char *argv[]) { run_parallel_stress_test(test_args); clock_t t_stop = clock(); double t_no_trace = (double)(t_stop - t_start) / 1e+3; - std::cout << "Rank " << test_args.rank << " before kineto tracing. Duration (ms) = " - << t_no_trace << std::endl; + std::cout << "Rank " << test_args.rank + << " before kineto tracing. Duration (ms) = " << t_no_trace + << std::endl; // Re-generate tensor cache values re_initialize_buffer_values(); @@ -360,24 +452,27 @@ int main(int argc, char *argv[]) { // Tracing thread std::thread kineto_thread; - // We are gradually increasing the GPU memory usage so that we have GPU traces being - // collected while we are almost out-of-memory. This is an attempt to expose errors - // that we often see in our fleet like: illegal instruction, uncorrectable NVLink - // error, etc. + // We are gradually increasing the GPU memory usage so that we have GPU + // traces being collected while we are almost out-of-memory. This is an + // attempt to expose errors that we often see in our fleet like: illegal + // instruction, uncorrectable NVLink error, etc. 
for (uint32_t idx = 0; idx < cache_args.num_increments; ++idx) { // Run with kineto tracing t_start = clock(); - kineto_thread = std::thread(trace_collection_thread, test_args.trace_length_us, - test_args.cupti_buffer_mb); + kineto_thread = std::thread( + trace_collection_thread, + test_args.trace_length_us, + test_args.cupti_buffer_mb); run_parallel_stress_test(test_args); kineto_thread.join(); t_stop = clock(); double t_with_trace = (double)(t_stop - t_start) / 1e+3; std::cout << "Rank " << test_args.rank << " kineto run " << idx - << " completed. Used GPU memory (MB) = " << sz_memory_pool_KB / 1024 - << "; Duration (ms) = " << t_with_trace << std::endl; + << " completed. Used GPU memory (MB) = " + << sz_memory_pool_KB / 1024 + << "; Duration (ms) = " << t_with_trace << std::endl; // The first run is the default run add_pairs_to_tensor_cache(cache_args, cache_args.num_pairs_per_increment); @@ -389,7 +484,8 @@ int main(int argc, char *argv[]) { MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); } } else { - std::cout << "Rank " << test_args.rank << " has tracing disabled (trace_length = 0)!" << std::endl; + std::cout << "Rank " << test_args.rank + << " has tracing disabled (trace_length = 0)!" << std::endl; } // Run again after kineto tracing @@ -397,9 +493,11 @@ int main(int argc, char *argv[]) { run_parallel_stress_test(test_args); t_stop = clock(); double t_after_trace = (double)(t_stop - t_start) / 1e+3; - std::cout << "Rank " << test_args.rank << " after kineto tracing. Duration (ms) = " - << t_after_trace << "; Kernel Launch Throughput = " << (double)test_args.num_operations / (t_after_trace / 1000) - << " kernels/second" << std::endl; + std::cout << "Rank " << test_args.rank + << " after kineto tracing. Duration (ms) = " << t_after_trace + << "; Kernel Launch Throughput = " + << (double)test_args.num_operations / (t_after_trace / 1000) + << " kernels/second" << std::endl; // Final barrier before destroying everything if (test_args.is_multi_rank) { diff --git a/libkineto/stress_test/random_ops_stress_test.cu b/libkineto/stress_test/random_ops_stress_test.cu index f04f67333..fad2b52fc 100644 --- a/libkineto/stress_test/random_ops_stress_test.cu +++ b/libkineto/stress_test/random_ops_stress_test.cu @@ -20,8 +20,8 @@ namespace kineto_stress_test { // NCCL variables buffers ncclUniqueId nccl_id; ncclComm_t nccl_communicator; -float *pBuffNCCLSend; -float *pBuffNCCLRecv; +float* pBuffNCCLSend; +float* pBuffNCCLRecv; // C = A + B kernel where A and B are generated using a linear // congruential generator. 
If the number of iterations is small @@ -31,7 +31,7 @@ float *pBuffNCCLRecv; // We use the template call to be able to change the kernel name with // a simple hardcoded constant number -template +template __global__ void iterative_lcg_3_buffers(lcg_kernel_input input) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < input.len) { @@ -51,7 +51,7 @@ __global__ void iterative_lcg_3_buffers(lcg_kernel_input input) { } } -template +template __global__ void iterative_lcg_3_with_uvm(lcg_kernel_input input) { int idx = threadIdx.x + blockIdx.x * blockDim.x; uint64_t uvm_idx = idx; @@ -60,7 +60,7 @@ __global__ void iterative_lcg_3_with_uvm(lcg_kernel_input input) { float val_uvm_b = 0.0f; // Fetch data from UVM - for (int i = 0; i < 5 ; ++i) { + for (int i = 0; i < 5; ++i) { val_uvm_a += input.uvm_a[uvm_idx % input.uvm_len]; val_uvm_b += input.uvm_b[uvm_idx % input.uvm_len]; uvm_idx += 4096; @@ -108,11 +108,12 @@ void run_stress_test( cudaStream_t memcpy_stream = NULL; cudaStream_t uvm_stream = NULL; - // Allocate streams within the test function as this can run on multiple threads - // and we want to see the effect of parallel stream creation + // Allocate streams within the test function as this can run on multiple + // threads and we want to see the effect of parallel stream creation if (test_args.pre_alloc_streams) { - v_streams = test_args.compute_streams + (thread_id * test_args.num_cuda_streams); + v_streams = + test_args.compute_streams + (thread_id * test_args.num_cuda_streams); if (test_args.use_memcpy_stream) { memcpy_stream = test_args.memcpy_streams[thread_id]; @@ -122,17 +123,24 @@ void run_stress_test( uvm_stream = test_args.uvm_streams[thread_id]; } } else { - v_streams = (cudaStream_t*)malloc(test_args.num_cuda_streams * sizeof(cudaStream_t)); + v_streams = (cudaStream_t*)malloc( + test_args.num_cuda_streams * sizeof(cudaStream_t)); for (uint32_t i = 0; i < test_args.num_cuda_streams; ++i) { - checkCudaStatus(cudaStreamCreateWithFlags(v_streams + i, cudaStreamNonBlocking), __LINE__); + checkCudaStatus( + cudaStreamCreateWithFlags(v_streams + i, cudaStreamNonBlocking), + __LINE__); } if (test_args.use_memcpy_stream) { - checkCudaStatus(cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking), __LINE__); + checkCudaStatus( + cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking), + __LINE__); } if (test_args.use_uvm_stream) { - checkCudaStatus(cudaStreamCreateWithFlags(&uvm_stream, cudaStreamNonBlocking), __LINE__); + checkCudaStatus( + cudaStreamCreateWithFlags(&uvm_stream, cudaStreamNonBlocking), + __LINE__); } } @@ -149,8 +157,9 @@ void run_stress_test( for (uint32_t i = 0; i < test_args.num_operations; ++i) { // All good things start with a break. 
In our case some GPU idle time if (test_args.simulate_host_time) { - uint32_t gpu_idle_us = rand_r(&rng_state) % (test_args.max_idle_us - - test_args.min_idle_us) + test_args.min_idle_us; + uint32_t gpu_idle_us = + rand_r(&rng_state) % (test_args.max_idle_us - test_args.min_idle_us) + + test_args.min_idle_us; usleep(gpu_idle_us); } @@ -171,7 +180,8 @@ void run_stress_test( } // Check if we do a CUDA malloc - if (((float)(rand_r(&rng_state) % 32767) / 32767.0) < test_args.prob_cuda_malloc) { + if (((float)(rand_r(&rng_state) % 32767) / 32767.0) < + test_args.prob_cuda_malloc) { checkCudaStatus(cudaDeviceSynchronize(), __LINE__); free_and_realloc_tensor_pairs(p_memory_pool + pair_idx, current_stream); @@ -209,9 +219,9 @@ void run_stress_test( } // Launch kernel - uint32_t num_iters_stream = - rand_r(&rng_state) % (test_args.max_iters_kernel - test_args.min_iters_kernel) + - test_args.min_iters_kernel; + uint32_t num_iters_stream = rand_r(&rng_state) % + (test_args.max_iters_kernel - test_args.min_iters_kernel) + + test_args.min_iters_kernel; uint32_t thread_blocks = p_memory_pool[pair_idx].n_elements / 256; lcg_kernel_input kernel_args; kernel_args.d_a = p_memory_pool[pair_idx].d_A; @@ -224,34 +234,43 @@ void run_stress_test( kernel_args.uvm_a = test_args.uvm_a; kernel_args.uvm_b = test_args.uvm_b; kernel_args.uvm_len = test_args.uvm_len; - } - else { + } else { kernel_args.uvm_a = NULL; kernel_args.uvm_b = NULL; kernel_args.uvm_len = 0; } - bool b_do_memset = ((float)(rand_r(&rng_state) % 32767) / 32767.0) < test_args.memset_prob; - bool b_uvm_kernel = ((float)(rand_r(&rng_state) % 32767) / 32767.0) < test_args.uvm_kernel_prob ? true : false; + bool b_do_memset = + ((float)(rand_r(&rng_state) % 32767) / 32767.0) < test_args.memset_prob; + bool b_uvm_kernel = ((float)(rand_r(&rng_state) % 32767) / 32767.0) < + test_args.uvm_kernel_prob + ? 
true + : false; if ((kernel_args.uvm_len > 0) && (b_uvm_kernel)) { if (b_do_memset) { memset((void*)test_args.uvm_a, 42, kernel_args.len * sizeof(float)); memset((void*)test_args.uvm_a, 42, kernel_args.len * sizeof(float)); - // checkCudaStatus(cudaMemset((void*)test_args.uvm_a, 42, kernel_args.len * sizeof(float))); - // checkCudaStatus(cudaMemset((void*)test_args.uvm_a, 42, kernel_args.len * sizeof(float))); + // checkCudaStatus(cudaMemset((void*)test_args.uvm_a, 42, + // kernel_args.len * sizeof(float))); + // checkCudaStatus(cudaMemset((void*)test_args.uvm_a, 42, + // kernel_args.len * sizeof(float))); } else { - iterative_lcg_3_with_uvm<113, 119><<>>(kernel_args); + iterative_lcg_3_with_uvm<113, 119> + <<>>(kernel_args); CUDA_KERNEL_LAUNCH_CHECK(); } } else { // Check to see if we do a simple kernel call or a memset if (b_do_memset) { - checkCudaStatus(cudaMemset((void*)kernel_args.d_a, 42, kernel_args.len * sizeof(float))); - checkCudaStatus(cudaMemset((void*)kernel_args.d_b, 42, kernel_args.len * sizeof(float))); - checkCudaStatus(cudaMemset((void*)kernel_args.d_c, 42, kernel_args.len * sizeof(float))); + checkCudaStatus(cudaMemset( + (void*)kernel_args.d_a, 42, kernel_args.len * sizeof(float))); + checkCudaStatus(cudaMemset( + (void*)kernel_args.d_b, 42, kernel_args.len * sizeof(float))); + checkCudaStatus(cudaMemset( + (void*)kernel_args.d_c, 42, kernel_args.len * sizeof(float))); } else { - call_compute_kernel(thread_blocks, 256, 0, current_stream, - kernel_args, i); + call_compute_kernel( + thread_blocks, 256, 0, current_stream, kernel_args, i); } } @@ -260,17 +279,33 @@ void run_stress_test( if ((i % test_args.num_iters_nccl_sync == 0) && (test_args.is_multi_rank)) { uint32_t n_elements = p_memory_pool[pair_idx].n_elements; size_t szTransfer = n_elements * sizeof(float); - checkCudaStatus(cudaMemcpy(pBuffNCCLSend, p_memory_pool[pair_idx].d_C, - szTransfer, cudaMemcpyDeviceToDevice), __LINE__); - NCCLCHECK(ncclAllReduce((const void*)pBuffNCCLSend, (void*)pBuffNCCLRecv, n_elements, - ncclFloat, ncclAvg, nccl_communicator, current_stream)); + checkCudaStatus( + cudaMemcpy( + pBuffNCCLSend, + p_memory_pool[pair_idx].d_C, + szTransfer, + cudaMemcpyDeviceToDevice), + __LINE__); + NCCLCHECK(ncclAllReduce( + (const void*)pBuffNCCLSend, + (void*)pBuffNCCLRecv, + n_elements, + ncclFloat, + ncclAvg, + nccl_communicator, + current_stream)); checkCudaStatus(cudaStreamSynchronize(current_stream), __LINE__); - checkCudaStatus(cudaMemcpy(p_memory_pool[pair_idx].d_C, pBuffNCCLRecv, - szTransfer, cudaMemcpyDeviceToDevice), __LINE__); + checkCudaStatus( + cudaMemcpy( + p_memory_pool[pair_idx].d_C, + pBuffNCCLRecv, + szTransfer, + cudaMemcpyDeviceToDevice), + __LINE__); } - // Simulate checkpoint download. The odd workers will have higher stream priorities - // but lower number of transactions + // Simulate checkpoint download. The odd workers will have higher stream + // priorities but lower number of transactions bool enable_d2h_copy = p_memory_pool[pair_idx].b_copy_d2h; if (thread_id % 2 != 0) { if (rand_r(&rng_state) % 100 < 97) { @@ -278,14 +313,16 @@ void run_stress_test( } } - // Tehchnically we should wait for the kernels to complete before downloading - // using a stream synchronization on the compute stream. But if we want to generate - // multiple overlapping transfers, we need to remove the synchronization. That means - // the downloaded tensors may not have correct data. 
+ // Tehchnically we should wait for the kernels to complete before + // downloading using a stream synchronization on the compute stream. But if + // we want to generate multiple overlapping transfers, we need to remove the + // synchronization. That means the downloaded tensors may not have correct + // data. if (enable_d2h_copy) { // checkCudaStatus(cudaStreamSynchronize(current_stream), __LINE__); - uint32_t rand_index = rand_r(&rng_state) % p_memory_pool[pair_idx].n_elements; + uint32_t rand_index = + rand_r(&rng_state) % p_memory_pool[pair_idx].n_elements; checkCudaStatus( cudaMemcpyAsync( p_memory_pool[pair_idx].h_C, @@ -295,7 +332,8 @@ void run_stress_test( current_memcpy_stream), __LINE__); uint32_t rand_idx_out = rand_r(&rng_state) % test_args.num_operations; - // checkCudaStatus(cudaStreamSynchronize(current_memcpy_stream), __LINE__); + // checkCudaStatus(cudaStreamSynchronize(current_memcpy_stream), + // __LINE__); h_output[rand_idx_out] = p_memory_pool[pair_idx].h_C[rand_index]; } @@ -378,136 +416,176 @@ void run_stress_test( // kernel names. This will make the trace to look like a rainbow. void call_compute_kernel( - uint32_t thread_blocks, - uint32_t threads_per_block, - uint32_t shmem_sz, - cudaStream_t stream, - lcg_kernel_input kernel_args, - uint32_t op_id -) { + uint32_t thread_blocks, + uint32_t threads_per_block, + uint32_t shmem_sz, + cudaStream_t stream, + lcg_kernel_input kernel_args, + uint32_t op_id) { switch (op_id % 20) { case 0: - iterative_lcg_3_buffers<0, 1><<>>(kernel_args); + iterative_lcg_3_buffers<0, 1> + <<>>(kernel_args); break; case 1: - iterative_lcg_3_buffers<1, 2><<>>(kernel_args); + iterative_lcg_3_buffers<1, 2> + <<>>(kernel_args); break; case 2: - iterative_lcg_3_buffers<2, 3><<>>(kernel_args); + iterative_lcg_3_buffers<2, 3> + <<>>(kernel_args); break; case 3: - iterative_lcg_3_buffers<3, 4><<>>(kernel_args); + iterative_lcg_3_buffers<3, 4> + <<>>(kernel_args); break; case 4: - iterative_lcg_3_buffers<4, 5><<>>(kernel_args); + iterative_lcg_3_buffers<4, 5> + <<>>(kernel_args); break; case 5: - iterative_lcg_3_buffers<5, 6><<>>(kernel_args); + iterative_lcg_3_buffers<5, 6> + <<>>(kernel_args); break; case 6: - iterative_lcg_3_buffers<6, 7><<>>(kernel_args); + iterative_lcg_3_buffers<6, 7> + <<>>(kernel_args); break; case 7: - iterative_lcg_3_buffers<7, 8><<>>(kernel_args); + iterative_lcg_3_buffers<7, 8> + <<>>(kernel_args); break; case 8: - iterative_lcg_3_buffers<8, 9><<>>(kernel_args); + iterative_lcg_3_buffers<8, 9> + <<>>(kernel_args); break; case 9: - iterative_lcg_3_buffers<9, 10><<>>(kernel_args); + iterative_lcg_3_buffers<9, 10> + <<>>(kernel_args); break; case 10: - iterative_lcg_3_buffers<10, 11><<>>(kernel_args); + iterative_lcg_3_buffers<10, 11> + <<>>(kernel_args); break; case 11: - iterative_lcg_3_buffers<11, 12><<>>(kernel_args); + iterative_lcg_3_buffers<11, 12> + <<>>(kernel_args); break; case 12: - iterative_lcg_3_buffers<12, 13><<>>(kernel_args); + iterative_lcg_3_buffers<12, 13> + <<>>(kernel_args); break; case 13: - iterative_lcg_3_buffers<13, 14><<>>(kernel_args); + iterative_lcg_3_buffers<13, 14> + <<>>(kernel_args); break; case 14: - iterative_lcg_3_buffers<14, 15><<>>(kernel_args); + iterative_lcg_3_buffers<14, 15> + <<>>(kernel_args); break; case 15: - iterative_lcg_3_buffers<15, 16><<>>(kernel_args); + iterative_lcg_3_buffers<15, 16> + <<>>(kernel_args); break; case 16: - iterative_lcg_3_buffers<16, 17><<>>(kernel_args); + iterative_lcg_3_buffers<16, 17> + <<>>(kernel_args); break; case 17: - 
iterative_lcg_3_buffers<17, 18><<>>(kernel_args); + iterative_lcg_3_buffers<17, 18> + <<>>(kernel_args); break; case 18: - iterative_lcg_3_buffers<18, 19><<>>(kernel_args); + iterative_lcg_3_buffers<18, 19> + <<>>(kernel_args); break; case 19: - iterative_lcg_3_buffers<19, 20><<>>(kernel_args); + iterative_lcg_3_buffers<19, 20> + <<>>(kernel_args); break; case 20: - iterative_lcg_3_buffers<20, 1><<>>(kernel_args); + iterative_lcg_3_buffers<20, 1> + <<>>(kernel_args); break; case 21: - iterative_lcg_3_buffers<21, 2><<>>(kernel_args); + iterative_lcg_3_buffers<21, 2> + <<>>(kernel_args); break; case 22: - iterative_lcg_3_buffers<22, 3><<>>(kernel_args); + iterative_lcg_3_buffers<22, 3> + <<>>(kernel_args); break; case 23: - iterative_lcg_3_buffers<23, 4><<>>(kernel_args); + iterative_lcg_3_buffers<23, 4> + <<>>(kernel_args); break; case 24: - iterative_lcg_3_buffers<24, 5><<>>(kernel_args); + iterative_lcg_3_buffers<24, 5> + <<>>(kernel_args); break; case 25: - iterative_lcg_3_buffers<25, 6><<>>(kernel_args); + iterative_lcg_3_buffers<25, 6> + <<>>(kernel_args); break; case 26: - iterative_lcg_3_buffers<26, 7><<>>(kernel_args); + iterative_lcg_3_buffers<26, 7> + <<>>(kernel_args); break; case 27: - iterative_lcg_3_buffers<27, 8><<>>(kernel_args); + iterative_lcg_3_buffers<27, 8> + <<>>(kernel_args); break; case 28: - iterative_lcg_3_buffers<28, 9><<>>(kernel_args); + iterative_lcg_3_buffers<28, 9> + <<>>(kernel_args); break; case 29: - iterative_lcg_3_buffers<29, 10><<>>(kernel_args); + iterative_lcg_3_buffers<29, 10> + <<>>(kernel_args); break; case 30: - iterative_lcg_3_buffers<30, 11><<>>(kernel_args); + iterative_lcg_3_buffers<30, 11> + <<>>(kernel_args); break; case 31: - iterative_lcg_3_buffers<31, 12><<>>(kernel_args); + iterative_lcg_3_buffers<31, 12> + <<>>(kernel_args); break; case 32: - iterative_lcg_3_buffers<32, 13><<>>(kernel_args); + iterative_lcg_3_buffers<32, 13> + <<>>(kernel_args); break; case 33: - iterative_lcg_3_buffers<33, 14><<>>(kernel_args); + iterative_lcg_3_buffers<33, 14> + <<>>(kernel_args); break; case 34: - iterative_lcg_3_buffers<34, 15><<>>(kernel_args); + iterative_lcg_3_buffers<34, 15> + <<>>(kernel_args); break; case 35: - iterative_lcg_3_buffers<35, 16><<>>(kernel_args); + iterative_lcg_3_buffers<35, 16> + <<>>(kernel_args); break; case 36: - iterative_lcg_3_buffers<36, 17><<>>(kernel_args); + iterative_lcg_3_buffers<36, 17> + <<>>(kernel_args); break; case 37: - iterative_lcg_3_buffers<37, 18><<>>(kernel_args); + iterative_lcg_3_buffers<37, 18> + <<>>(kernel_args); break; case 38: - iterative_lcg_3_buffers<38, 19><<>>(kernel_args); + iterative_lcg_3_buffers<38, 19> + <<>>(kernel_args); break; case 39: - iterative_lcg_3_buffers<39, 20><<>>(kernel_args); + iterative_lcg_3_buffers<39, 20> + <<>>(kernel_args); break; default: - iterative_lcg_3_buffers<0, 0><<>>(kernel_args); + iterative_lcg_3_buffers<0, 0> + <<>>(kernel_args); break; } CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/libkineto/stress_test/random_ops_stress_test.cuh b/libkineto/stress_test/random_ops_stress_test.cuh index ba348e416..eea7a1ae8 100644 --- a/libkineto/stress_test/random_ops_stress_test.cuh +++ b/libkineto/stress_test/random_ops_stress_test.cuh @@ -23,108 +23,108 @@ extern float* pBuffNCCLSend; extern float* pBuffNCCLRecv; struct stress_test_args { - // Number of threads that run the stress test - uint32_t num_workers {1}; + // Number of threads that run the stress test + uint32_t num_workers{1}; - // Number of operations per stress test - uint32_t num_operations {10000}; + // Number of 
operations per stress test + uint32_t num_operations{10000}; - // Can improve event density. Cuda streams per worker - uint32_t num_cuda_streams {1}; + // Can improve event density. Cuda streams per worker + uint32_t num_cuda_streams{1}; - // Simulates cuda mallocs happening in the PT Cuda Cache Allocator - double prob_cuda_malloc {0.001}; + // Simulates cuda mallocs happening in the PT Cuda Cache Allocator + double prob_cuda_malloc{0.001}; - // The min number of compute iterations in the cuda kernel. If high - // this reduces event density. - uint32_t min_iters_kernel {1}; + // The min number of compute iterations in the cuda kernel. If high + // this reduces event density. + uint32_t min_iters_kernel{1}; - // The max number of compute iterations in the cuda kernel. If high - // this reduces event density. - uint32_t max_iters_kernel {5}; + // The max number of compute iterations in the cuda kernel. If high + // this reduces event density. + uint32_t max_iters_kernel{5}; - // The probability that instead of a kernel call we do a memset on the - // input buffers, using a magic value - double memset_prob {0.05}; + // The probability that instead of a kernel call we do a memset on the + // input buffers, using a magic value + double memset_prob{0.05}; - // The min idle time between kernel launches in microseconds - uint32_t min_idle_us {1}; + // The min idle time between kernel launches in microseconds + uint32_t min_idle_us{1}; - // The max idle time between kernel launches in microseconds - uint32_t max_idle_us {2}; + // The max idle time between kernel launches in microseconds + uint32_t max_idle_us{2}; - // If true, we randomly sleep a number of microseconds between kernel - // launches. - bool simulate_host_time {false}; + // If true, we randomly sleep a number of microseconds between kernel + // launches. + bool simulate_host_time{false}; - // If non-zero, we allocate UVM memory and use it - bool use_uvm_buffers {false}; + // If non-zero, we allocate UVM memory and use it + bool use_uvm_buffers{false}; - // Size of a single buffer in FP32 elements in UVM - uint64_t uvm_len {0}; + // Size of a single buffer in FP32 elements in UVM + uint64_t uvm_len{0}; - // If set true, the UVM allocation and initialization will be done in parallel - // with cache allocation (e.g. cudaHostAlloc) - bool parallel_uvm_alloc {false}; + // If set true, the UVM allocation and initialization will be done in parallel + // with cache allocation (e.g. cudaHostAlloc) + bool parallel_uvm_alloc{false}; - // The probability of running a kernel that uses UVM - double uvm_kernel_prob {0.001}; + // The probability of running a kernel that uses UVM + double uvm_kernel_prob{0.001}; - // If true we need to run the binary using MPI on multiple ranks - bool is_multi_rank {false}; + // If true we need to run the binary using MPI on multiple ranks + bool is_multi_rank{false}; - // Number of parallel processes to be spawned via MPI - int32_t num_ranks {1}; + // Number of parallel processes to be spawned via MPI + int32_t num_ranks{1}; - // Use this variable to pin a process to a specific GPU index. - // Do not modify! - int32_t rank {0}; + // Use this variable to pin a process to a specific GPU index. + // Do not modify! 
+ int32_t rank{0}; - // Size of the NCCL buffers which needs to be at least the size - // of the largest tensor - uint32_t sz_nccl_buff_KB {1024}; + // Size of the NCCL buffers which needs to be at least the size + // of the largest tensor + uint32_t sz_nccl_buff_KB{1024}; - // Number of iterations between NCCL sync calls - uint32_t num_iters_nccl_sync {100}; + // Number of iterations between NCCL sync calls + uint32_t num_iters_nccl_sync{100}; - // If true, we pre-allocate CUDA streams and reuse them throughout - // the experiment - bool pre_alloc_streams {false}; + // If true, we pre-allocate CUDA streams and reuse them throughout + // the experiment + bool pre_alloc_streams{false}; - // If true, h2d and d2h transfers would be scheduled on their own - // CUDA stream - bool use_memcpy_stream {false}; + // If true, h2d and d2h transfers would be scheduled on their own + // CUDA stream + bool use_memcpy_stream{false}; - // If true, kernels using UVM would be scheduled on their own - // CUDA stream - bool use_uvm_stream {false}; + // If true, kernels using UVM would be scheduled on their own + // CUDA stream + bool use_uvm_stream{false}; - // If true, we use cudaGetMemInfo throughout the stress test to - // measure peak memory usage - bool monitor_mem_usage {false}; + // If true, we use cudaGetMemInfo throughout the stress test to + // measure peak memory usage + bool monitor_mem_usage{false}; - // Number of microseconds for trace collection. If 0 the trace is - // not collected - uint32_t trace_length_us {1000000}; + // Number of microseconds for trace collection. If 0 the trace is + // not collected + uint32_t trace_length_us{1000000}; - // Size of the CUPTI activity buffer in MB. If it's 0, we don't - // explicitly set a value - uint32_t cupti_buffer_mb {0}; + // Size of the CUPTI activity buffer in MB. 
If it's 0, we don't + // explicitly set a value + uint32_t cupti_buffer_mb{0}; - /* VARIABLES */ + /* VARIABLES */ - // The CUDA streams vector - cudaStream_t *compute_streams {nullptr}; + // The CUDA streams vector + cudaStream_t* compute_streams{nullptr}; - // The explicit memcpy stream - cudaStream_t *memcpy_streams {nullptr}; + // The explicit memcpy stream + cudaStream_t* memcpy_streams{nullptr}; - // The explicit UVM stream - cudaStream_t *uvm_streams {nullptr}; + // The explicit UVM stream + cudaStream_t* uvm_streams{nullptr}; - // UVM buffers - float* uvm_a {nullptr}; - float* uvm_b {nullptr}; + // UVM buffers + float* uvm_a{nullptr}; + float* uvm_b{nullptr}; }; // We are using this to reduce the number of code lines @@ -141,13 +141,12 @@ struct lcg_kernel_input { // Use this function to vary the kernel name at runtime void call_compute_kernel( - uint32_t thread_blocks, - uint32_t threads_per_block, - uint32_t shmem_sz, - cudaStream_t stream, - lcg_kernel_input kernel_args, - uint32_t op_id -); + uint32_t thread_blocks, + uint32_t threads_per_block, + uint32_t shmem_sz, + cudaStream_t stream, + lcg_kernel_input kernel_args, + uint32_t op_id); void run_stress_test( uint32_t thread_id, diff --git a/libkineto/stress_test/tensor_cache.cu b/libkineto/stress_test/tensor_cache.cu index 7b1805f5d..a8cffdce9 100644 --- a/libkineto/stress_test/tensor_cache.cu +++ b/libkineto/stress_test/tensor_cache.cu @@ -44,25 +44,29 @@ uint32_t sz_memory_pool_KB; // Number of tensor pairs in the memory pool uint32_t num_tensor_pairs; -void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t - num_added_pairs) { +void add_pairs_to_tensor_cache( + tensor_cache_args cache_args, + uint32_t num_added_pairs) { uint32_t num_current_pairs = num_tensor_pairs; - for (uint32_t i = num_current_pairs; - i < num_current_pairs + num_added_pairs; ++i) { + for (uint32_t i = num_current_pairs; i < num_current_pairs + num_added_pairs; + ++i) { uint32_t num_KB = rand() % (cache_args.sz_max_tensor_KB - cache_args.sz_min_tensor_KB) + - cache_args.sz_min_tensor_KB; + cache_args.sz_min_tensor_KB; uint32_t num_elements = num_KB * 1024 / sizeof(float); // Allocate device buffers p_memory_pool[i].n_elements = num_elements; checkCudaStatus( - cudaMalloc(&p_memory_pool[i].d_A, num_elements * sizeof(float)), __LINE__); + cudaMalloc(&p_memory_pool[i].d_A, num_elements * sizeof(float)), + __LINE__); checkCudaStatus( - cudaMalloc(&p_memory_pool[i].d_B, num_elements * sizeof(float)), __LINE__); + cudaMalloc(&p_memory_pool[i].d_B, num_elements * sizeof(float)), + __LINE__); checkCudaStatus( - cudaMalloc(&p_memory_pool[i].d_C, num_elements * sizeof(float)), __LINE__); + cudaMalloc(&p_memory_pool[i].d_C, num_elements * sizeof(float)), + __LINE__); // Initialize device buffers with random values uint32_t thread_blocks = num_elements / 256; @@ -76,11 +80,18 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t p_memory_pool[i].d_C, p_memory_pool[i].n_elements); CUDA_KERNEL_LAUNCH_CHECK(); - // Throw a dice to see if we will do memcopy host to device for this one and use pinned memory + // Throw a dice to see if we will do memcopy host to device for this one and + // use pinned memory if (((float)(rand() % 32767) / 32767.0) < cache_args.prob_h2d) { p_memory_pool[i].b_copy_h2d = true; - checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_A, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); - // checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_B, num_elements * sizeof(float), cudaHostAllocDefault), 
__LINE__); + checkCudaStatus( + cudaHostAlloc( + &p_memory_pool[i].h_A, + num_elements * sizeof(float), + cudaHostAllocDefault), + __LINE__); + // checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_B, num_elements * + // sizeof(float), cudaHostAllocDefault), __LINE__); p_memory_pool[i].h_B = (float*)malloc(sizeof(float) * num_elements); simple_lcg_host(p_memory_pool[i].h_A, num_elements); @@ -96,7 +107,12 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t p_memory_pool[i].b_copy_d2h = true; // Make 50% of the D2H on pageable and 50% on pinned memory if (rand() % 2 == 1) { - checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + checkCudaStatus( + cudaHostAlloc( + &p_memory_pool[i].h_C, + num_elements * sizeof(float), + cudaHostAllocDefault), + __LINE__); p_memory_pool[i].h_C_pinned = true; } else { p_memory_pool[i].h_C = (float*)malloc(sizeof(float) * num_elements); @@ -116,9 +132,8 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t void generate_tensor_cache(tensor_cache_args cache_args) { // Estimate the number of tensor pairs - uint32_t num_pairs_max = - cache_args.sz_GPU_memory_KB / (3 * (cache_args.sz_max_tensor_KB - - cache_args.sz_min_tensor_KB) / 2); + uint32_t num_pairs_max = cache_args.sz_GPU_memory_KB / + (3 * (cache_args.sz_max_tensor_KB - cache_args.sz_min_tensor_KB) / 2); // Number of actual pairs num_tensor_pairs = 0; @@ -128,8 +143,7 @@ void generate_tensor_cache(tensor_cache_args cache_args) { // Pre-allocate num_pairs_max and if num_tensor_pairs comes lower, well, // that's life - p_memory_pool = - (tensor_pair*)malloc(num_pairs_max * sizeof(tensor_pair)); + p_memory_pool = (tensor_pair*)malloc(num_pairs_max * sizeof(tensor_pair)); // Start creating the pool srand(RNG_SEED); @@ -162,29 +176,38 @@ void re_initialize_buffer_values() { } } -void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream) { +void free_and_realloc_tensor_pairs( + tensor_pair* tensor_pair, + cudaStream_t stream) { checkCudaStatus(cudaFree(tensor_pair->d_A), __LINE__); checkCudaStatus(cudaFree(tensor_pair->d_B), __LINE__); checkCudaStatus(cudaFree(tensor_pair->d_C), __LINE__); // Allocate device buffers uint32_t num_elements = tensor_pair->n_elements; - checkCudaStatus(cudaMalloc(&tensor_pair->d_A, - num_elements * sizeof(float)), - __LINE__); - checkCudaStatus(cudaMalloc(&tensor_pair->d_B, - num_elements * sizeof(float)), - __LINE__); - checkCudaStatus(cudaMalloc(&tensor_pair->d_C, - num_elements * sizeof(float)), - __LINE__); + checkCudaStatus( + cudaMalloc(&tensor_pair->d_A, num_elements * sizeof(float)), __LINE__); + checkCudaStatus( + cudaMalloc(&tensor_pair->d_B, num_elements * sizeof(float)), __LINE__); + checkCudaStatus( + cudaMalloc(&tensor_pair->d_C, num_elements * sizeof(float)), __LINE__); if (tensor_pair->b_copy_h2d) { checkCudaStatus(cudaFreeHost(tensor_pair->h_A), __LINE__); checkCudaStatus(cudaFreeHost(tensor_pair->h_B), __LINE__); - checkCudaStatus(cudaHostAlloc(&tensor_pair->h_A, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); - checkCudaStatus(cudaHostAlloc(&tensor_pair->h_B, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + checkCudaStatus( + cudaHostAlloc( + &tensor_pair->h_A, + num_elements * sizeof(float), + cudaHostAllocDefault), + __LINE__); + checkCudaStatus( + cudaHostAlloc( + &tensor_pair->h_B, + num_elements * sizeof(float), + cudaHostAllocDefault), + __LINE__); simple_lcg_host(tensor_pair->h_A, num_elements); 
simple_lcg_host(tensor_pair->h_B, num_elements); @@ -192,7 +215,12 @@ void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream if (tensor_pair->b_copy_d2h) { checkCudaStatus(cudaFreeHost(tensor_pair->h_C), __LINE__); - checkCudaStatus(cudaHostAlloc(&tensor_pair->h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__); + checkCudaStatus( + cudaHostAlloc( + &tensor_pair->h_C, + num_elements * sizeof(float), + cudaHostAllocDefault), + __LINE__); simple_lcg_host(tensor_pair->h_C, num_elements); } } @@ -210,7 +238,7 @@ void free_tensor_cache() { } if (p_memory_pool[i].h_B) { - //checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__); + // checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__); free(p_memory_pool[i].h_B); p_memory_pool[i].h_B = NULL; } diff --git a/libkineto/stress_test/tensor_cache.cuh b/libkineto/stress_test/tensor_cache.cuh index 4317b0528..980c66c31 100644 --- a/libkineto/stress_test/tensor_cache.cuh +++ b/libkineto/stress_test/tensor_cache.cuh @@ -50,32 +50,32 @@ struct tensor_pair { extern tensor_pair* p_memory_pool; struct tensor_cache_args { - // Sets GPU memory utilization - uint32_t sz_cache_KB {1024 * 128}; + // Sets GPU memory utilization + uint32_t sz_cache_KB{1024 * 128}; - // If small, density is higher due to shorter kernel times - uint32_t sz_min_tensor_KB {16}; + // If small, density is higher due to shorter kernel times + uint32_t sz_min_tensor_KB{16}; - // If large, we will have kernels with high duration thus smaller - // event density. That's because kernels will have to run on larger - // buffer sizes. - uint32_t sz_max_tensor_KB {2048}; + // If large, we will have kernels with high duration thus smaller + // event density. That's because kernels will have to run on larger + // buffer sizes. + uint32_t sz_max_tensor_KB{2048}; - // Sets the maximum GPU memory - uint32_t sz_GPU_memory_KB {1024 * 1024 * 16}; + // Sets the maximum GPU memory + uint32_t sz_GPU_memory_KB{1024 * 1024 * 16}; - // Simulates the chance of uploading a batch to the GPU. - // It reduces event density if it's set too high - double prob_h2d {0.005}; + // Simulates the chance of uploading a batch to the GPU. + // It reduces event density if it's set too high + double prob_h2d{0.005}; - // Simulates the chance of downloading results from the GPU. - // It reduces event density if it's set too high - double prob_d2h {0.0001}; + // Simulates the chance of downloading results from the GPU. + // It reduces event density if it's set too high + double prob_d2h{0.0001}; - // Number of increments in the GPU memory usage to see what happens at the - // peak memory usage. - uint32_t num_increments {1}; - uint32_t num_pairs_per_increment {1}; + // Number of increments in the GPU memory usage to see what happens at the + // peak memory usage. + uint32_t num_increments{1}; + uint32_t num_pairs_per_increment{1}; }; // Generates all the buffer pairs, using a minimum and a maximum size. 
@@ -84,11 +84,14 @@ struct tensor_cache_args { void generate_tensor_cache(tensor_cache_args cache_args); // Empties the tensor cache and reallocates it -void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream); +void free_and_realloc_tensor_pairs( + tensor_pair* tensor_pair, + cudaStream_t stream); // For some experiments we may need to add additional pairs to stress the // GPU memory limits -void add_pairs_to_tensor_cache(tensor_cache_args cache_args, +void add_pairs_to_tensor_cache( + tensor_cache_args cache_args, uint32_t num_added_pairs); // Re-initializes the random values in the device buffers diff --git a/libkineto/stress_test/utils.h b/libkineto/stress_test/utils.h index c210a56a0..11849f93e 100644 --- a/libkineto/stress_test/utils.h +++ b/libkineto/stress_test/utils.h @@ -8,8 +8,8 @@ #pragma once -#include #include +#include #include #include @@ -47,22 +47,32 @@ inline void checkCudaStatus(cudaError_t status, int lineNumber = -1) { #define CUDA_KERNEL_LAUNCH_CHECK() CUDA_CHECK(cudaGetLastError()) -#define MPICHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("PID %d --> Failed: MPI error %s:%d '%d'\n", \ - getpid(), __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) +#define MPICHECK(cmd) \ + do { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { \ + printf( \ + "PID %d --> Failed: MPI error %s:%d '%d'\n", \ + getpid(), \ + __FILE__, \ + __LINE__, \ + e); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("PID %d --> Failed, NCCL error %s:%d '%s'\n", \ - getpid(), __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf( \ + "PID %d --> Failed, NCCL error %s:%d '%s'\n", \ + getpid(), \ + __FILE__, \ + __LINE__, \ + ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -} //namespace kineto_stress_test +} // namespace kineto_stress_test diff --git a/libkineto/test/ConfigTest.cpp b/libkineto/test/ConfigTest.cpp index a99e428ff..1c3deb3d5 100644 --- a/libkineto/test/ConfigTest.cpp +++ b/libkineto/test/ConfigTest.cpp @@ -74,8 +74,10 @@ TEST(ParseTest, DefaultActivityTypes) { Config cfg; cfg.validate(std::chrono::system_clock::now()); auto default_activities = defaultActivityTypes(); - EXPECT_EQ(cfg.selectedActivityTypes(), - std::set(default_activities.begin(), default_activities.end())); + EXPECT_EQ( + cfg.selectedActivityTypes(), + std::set( + default_activities.begin(), default_activities.end())); } TEST(ParseTest, ActivityTypes) { @@ -84,49 +86,59 @@ TEST(ParseTest, ActivityTypes) { EXPECT_TRUE(cfg.parse("ACTIVITY_TYPES=")); EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES=")); - EXPECT_EQ(cfg.selectedActivityTypes(), - std::set({ActivityType::CPU_OP, - ActivityType::CPU_INSTANT_EVENT, - ActivityType::PYTHON_FUNCTION, - ActivityType::USER_ANNOTATION, - ActivityType::GPU_USER_ANNOTATION, - ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL, - ActivityType::EXTERNAL_CORRELATION, - ActivityType::OVERHEAD, - ActivityType::CUDA_RUNTIME, - ActivityType::CUDA_DRIVER, - ActivityType::CUDA_SYNC, - ActivityType::MTIA_RUNTIME, - ActivityType::MTIA_CCP_EVENTS})); + EXPECT_EQ( + cfg.selectedActivityTypes(), + std::set( + {ActivityType::CPU_OP, + ActivityType::CPU_INSTANT_EVENT, + ActivityType::PYTHON_FUNCTION, + ActivityType::USER_ANNOTATION, + ActivityType::GPU_USER_ANNOTATION, + 
ActivityType::GPU_MEMCPY, + ActivityType::GPU_MEMSET, + ActivityType::CONCURRENT_KERNEL, + ActivityType::EXTERNAL_CORRELATION, + ActivityType::OVERHEAD, + ActivityType::CUDA_RUNTIME, + ActivityType::CUDA_DRIVER, + ActivityType::CUDA_SYNC, + ActivityType::MTIA_RUNTIME, + ActivityType::MTIA_CCP_EVENTS})); Config cfg2; EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,kernel")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL})); + EXPECT_EQ( + cfg2.selectedActivityTypes(), + std::set( + {ActivityType::GPU_MEMCPY, + ActivityType::GPU_MEMSET, + ActivityType::CONCURRENT_KERNEL})); EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cuda_Runtime,")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::CUDA_RUNTIME})); + EXPECT_EQ( + cfg2.selectedActivityTypes(), + std::set({ActivityType::CUDA_RUNTIME})); // Should throw an exception because incorrect activity name EXPECT_FALSE(cfg2.parse("ACTIVITY_TYPES = memcopy,cuda_runtime")); EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cpu_op")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::CPU_OP})); + EXPECT_EQ( + cfg2.selectedActivityTypes(), + std::set({ActivityType::CPU_OP})); EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = xpu_Runtime")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::XPU_RUNTIME})); + EXPECT_EQ( + cfg2.selectedActivityTypes(), + std::set({ActivityType::XPU_RUNTIME})); - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=privateuse1_Runtime,privateuse1_driver")); - EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::PRIVATEUSE1_RUNTIME, - ActivityType::PRIVATEUSE1_DRIVER})); + EXPECT_TRUE( + cfg2.parse("ACTIVITY_TYPES=privateuse1_Runtime,privateuse1_driver")); + EXPECT_EQ( + cfg2.selectedActivityTypes(), + std::set( + {ActivityType::PRIVATEUSE1_RUNTIME, + ActivityType::PRIVATEUSE1_DRIVER})); } TEST(ParseTest, SamplePeriod) { diff --git a/libkineto/test/CuptiActivityProfilerTest.cpp b/libkineto/test/CuptiActivityProfilerTest.cpp index 2f8dcd8c8..eef9588d4 100644 --- a/libkineto/test/CuptiActivityProfilerTest.cpp +++ b/libkineto/test/CuptiActivityProfilerTest.cpp @@ -22,9 +22,9 @@ #endif #include "include/Config.h" +#include "include/libkineto.h" #include "include/output_base.h" #include "include/time_since_epoch.h" -#include "include/libkineto.h" #include "src/ActivityTrace.h" #include "src/CuptiActivityApi.h" #include "src/CuptiActivityProfiler.h" @@ -111,9 +111,11 @@ struct MockCuptiActivityBuffer { void addRuntimeActivity( CUpti_runtime_api_trace_cbid_enum cbid, - int64_t start_ns, int64_t end_ns, int64_t correlation) { - auto& act = createActivity( - start_ns, end_ns, correlation); + int64_t start_ns, + int64_t end_ns, + int64_t correlation) { + auto& act = + createActivity(start_ns, end_ns, correlation); act.kind = CUPTI_ACTIVITY_KIND_RUNTIME; act.cbid = cbid; act.threadId = threadId(); @@ -122,19 +124,21 @@ struct MockCuptiActivityBuffer { void addDriverActivity( CUpti_driver_api_trace_cbid_enum cbid, - int64_t start_ns, int64_t end_ns, int64_t correlation) { - auto& act = createActivity( - start_ns, end_ns, correlation); + int64_t start_ns, + int64_t end_ns, + int64_t correlation) { + auto& act = + createActivity(start_ns, end_ns, correlation); act.kind = CUPTI_ACTIVITY_KIND_DRIVER; act.cbid = cbid; act.threadId = threadId(); activities.push_back(reinterpret_cast(&act)); } - void addKernelActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { - auto& act = createActivity( 
- start_ns, end_ns, correlation); + void + addKernelActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { + auto& act = + createActivity(start_ns, end_ns, correlation); act.kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL; act.deviceId = 0; act.contextId = 0; @@ -145,10 +149,10 @@ struct MockCuptiActivityBuffer { activities.push_back(reinterpret_cast(&act)); } - void addMemcpyActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { - auto& act = createActivity( - start_ns, end_ns, correlation); + void + addMemcpyActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { + auto& act = + createActivity(start_ns, end_ns, correlation); act.kind = CUPTI_ACTIVITY_KIND_MEMCPY; act.deviceId = 0; act.streamId = 2; @@ -159,8 +163,11 @@ struct MockCuptiActivityBuffer { } void addSyncActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation, - CUpti_ActivitySynchronizationType type, int64_t stream = 1) { + int64_t start_ns, + int64_t end_ns, + int64_t correlation, + CUpti_ActivitySynchronizationType type, + int64_t stream = 1) { auto& act = createActivity( start_ns, end_ns, correlation); act.kind = CUPTI_ACTIVITY_KIND_SYNCHRONIZATION; @@ -170,10 +177,10 @@ struct MockCuptiActivityBuffer { activities.push_back(reinterpret_cast(&act)); } - void addCollectiveActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { - auto& act = createActivity( - start_ns, end_ns, correlation); + void + addCollectiveActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { + auto& act = + createActivity(start_ns, end_ns, correlation); act.name = "collective_gpu"; act.kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL; act.queued = 0; @@ -460,7 +467,8 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { // Start and stop profiling CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -481,31 +489,51 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { // And some GPU ops auto gpuOps = std::make_unique(); - gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, start_time_ns + 33, start_time_ns + 38, 1); - gpuOps->addRuntimeActivity(CUDA_MEMCPY, start_time_ns + 110, start_time_ns + 120, 2); - gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, start_time_ns + 130, start_time_ns + 145, 3); - gpuOps->addDriverActivity(CU_LAUNCH_KERNEL, start_time_ns + 165, start_time_ns + 175, 4); - gpuOps->addDriverActivity(CU_LAUNCH_KERNEL_EX, start_time_ns + 195, start_time_ns + 205, 5); - gpuOps->addRuntimeActivity(CUDA_STREAM_SYNC, start_time_ns + 146, start_time_ns + 240, 6); - gpuOps->addRuntimeActivity(CUDA_EVENT_SYNC, start_time_ns + 241, start_time_ns + 250, 7); + gpuOps->addRuntimeActivity( + CUDA_LAUNCH_KERNEL, start_time_ns + 33, start_time_ns + 38, 1); + gpuOps->addRuntimeActivity( + CUDA_MEMCPY, start_time_ns + 110, start_time_ns + 120, 2); + gpuOps->addRuntimeActivity( + CUDA_LAUNCH_KERNEL, start_time_ns + 130, start_time_ns + 145, 3); + gpuOps->addDriverActivity( + CU_LAUNCH_KERNEL, start_time_ns + 165, start_time_ns + 175, 4); + gpuOps->addDriverActivity( + CU_LAUNCH_KERNEL_EX, start_time_ns + 195, start_time_ns + 205, 5); + gpuOps->addRuntimeActivity( + CUDA_STREAM_SYNC, start_time_ns + 146, start_time_ns + 240, 6); + gpuOps->addRuntimeActivity( + CUDA_EVENT_SYNC, start_time_ns + 
241, start_time_ns + 250, 7); gpuOps->addKernelActivity(start_time_ns + 50, start_time_ns + 70, 1); gpuOps->addMemcpyActivity(start_time_ns + 140, start_time_ns + 150, 2); gpuOps->addKernelActivity(start_time_ns + 160, start_time_ns + 220, 3); gpuOps->addKernelActivity(start_time_ns + 230, start_time_ns + 250, 4); gpuOps->addKernelActivity(start_time_ns + 260, start_time_ns + 280, 5); - gpuOps->addSyncActivity(start_time_ns + 221, start_time_ns + 223, 6, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE); + gpuOps->addSyncActivity( + start_time_ns + 221, + start_time_ns + 223, + 6, + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE); // Add wait event on kernel stream 1 gpuOps->addSyncActivity( - start_time_ns + 224, start_time_ns + 226, 7, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, + start_time_ns + 224, + start_time_ns + 226, + 7, + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, 1 /*stream*/); // This event should be ignored because it is not on a stream that has no GPU // kernels gpuOps->addSyncActivity( - start_time_ns + 226, start_time_ns + 230, 8, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, + start_time_ns + 226, + start_time_ns + 230, + 8, + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, 4 /*stream*/); // Comes from CudaEventSynchronize call on CPU gpuOps->addSyncActivity( - start_time_ns + 227, start_time_ns + 226, 7, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE, + start_time_ns + 227, + start_time_ns + 226, + 7, + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE, -1 /*stream*/); cuptiActivities_.activityBuffer = std::move(gpuOps); @@ -544,7 +572,8 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { EXPECT_EQ(activityCounts["Memcpy HtoD (Pinned -> Device)"], 1); auto sysTid = systemThreadId(); - // Ops and runtime events are on thread sysTid along with the flow start events + // Ops and runtime events are on thread sysTid along with the flow start + // events EXPECT_EQ(resourceIds[sysTid], 12); // Kernels and sync events are on stream 1, memcpy on stream 2 EXPECT_EQ(resourceIds[1], 6); @@ -575,7 +604,8 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { // Start and stop profiling CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -587,7 +617,8 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { // Prepare metadata map std::unordered_map metadataMap; - metadataMap.emplace(kCollectiveName, fmt::format("\"{}\"", "_allgather_base")); + metadataMap.emplace( + kCollectiveName, fmt::format("\"{}\"", "_allgather_base")); metadataMap.emplace(kDtype, fmt::format("\"{}\"", "Float")); metadataMap.emplace(kInMsgNelems, "65664"); metadataMap.emplace(kOutMsgNelems, "131328"); @@ -745,7 +776,8 @@ TEST_F(CuptiActivityProfilerTest, GpuUserAnnotationTest) { // Start and stop profiling CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -813,7 +845,8 @@ TEST_F(CuptiActivityProfilerTest, 
SubActivityProfilers) { ev.device = 1; ev.resource = 0; - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 1000; auto start_time = time_point(nanoseconds(start_time_ns)); @@ -1007,7 +1040,8 @@ TEST_F(CuptiActivityProfilerTest, JsonGPUIDSortTest) { // Start and stop profiling CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 500; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -1021,7 +1055,8 @@ TEST_F(CuptiActivityProfilerTest, JsonGPUIDSortTest) { cpuOps->addOp("op1", start_time_ns + 10, start_time_ns + 30, 1); profiler.transferCpuTrace(std::move(cpuOps)); auto gpuOps = std::make_unique(); - gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, start_time_ns + 23, start_time_ns + 28, 1); + gpuOps->addRuntimeActivity( + CUDA_LAUNCH_KERNEL, start_time_ns + 23, start_time_ns + 28, 1); gpuOps->addKernelActivity(start_time_ns + 50, start_time_ns + 70, 1); cuptiActivities_.activityBuffer = std::move(gpuOps); @@ -1057,17 +1092,19 @@ TEST_F(CuptiActivityProfilerTest, JsonGPUIDSortTest) { std::unordered_map sortLabel; std::unordered_map sortIdx; for (auto& event : jsonData["traceEvents"]) { - if (event["name"] == "process_labels" && event["tid"] == 0 && event["pid"].isInt()) { + if (event["name"] == "process_labels" && event["tid"] == 0 && + event["pid"].isInt()) { sortLabel[event["pid"].asInt()] = event["args"]["labels"].asString(); } - if (event["name"] == "process_sort_index" && event["tid"] == 0 && event["pid"].isInt()) { + if (event["name"] == "process_sort_index" && event["tid"] == 0 && + event["pid"].isInt()) { sortIdx[event["pid"].asInt()] = event["args"]["sort_index"].asInt(); } } // Expect there is 1 CUPTI Overhead, and 16 CPU + GPU sorts, total 17. EXPECT_EQ(17, sortLabel.size()); - for (int i = 0; i<16; i++) { + for (int i = 0; i < 16; i++) { // Check there are 16 GPU sorts (0-15) with expected sort_index. 
EXPECT_EQ("GPU " + std::to_string(i), sortLabel[i]); // sortIndex is gpu + kExceedMaxPid to put GPU tracks at the bottom diff --git a/libkineto/test/CuptiCallbackApiTest.cpp b/libkineto/test/CuptiCallbackApiTest.cpp index b292688e2..96fcb9a78 100644 --- a/libkineto/test/CuptiCallbackApiTest.cpp +++ b/libkineto/test/CuptiCallbackApiTest.cpp @@ -79,8 +79,8 @@ TEST(CuptiCallbackApiTest, SimpleTest) { &simple_cudaLaunchKernel_cb)) << "Failed to add callback"; EXPECT_TRUE(addSimpleCallback( - CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, - &simple_cudaLaunchKernelExC_cb)) + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, + &simple_cudaLaunchKernelExC_cb)) << "Failed to add callback"; // duplicate add should be okay @@ -89,8 +89,8 @@ TEST(CuptiCallbackApiTest, SimpleTest) { &simple_cudaLaunchKernel_cb)) << "Failed to re-add callback"; EXPECT_TRUE(addSimpleCallback( - CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, - &simple_cudaLaunchKernelExC_cb)) + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, + &simple_cudaLaunchKernelExC_cb)) << "Failed to re-add callback"; simple_cb_calls = 0; diff --git a/libkineto/test/CuptiProfilerApiTest.cu b/libkineto/test/CuptiProfilerApiTest.cu index 73d377b89..9d9e2277b 100644 --- a/libkineto/test/CuptiProfilerApiTest.cu +++ b/libkineto/test/CuptiProfilerApiTest.cu @@ -6,30 +6,29 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include +#include #include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "src/Logger.h" #include "src/CuptiRangeProfilerApi.h" +#include "src/Logger.h" -#define DRIVER_API_CALL(apiFuncCall) \ - do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - LOG(ERROR) << "Failed invoking CUDA driver function " \ - << #apiFuncCall << " status = " \ - << _status; \ - exit(-1); \ - } \ +#define DRIVER_API_CALL(apiFuncCall) \ + do { \ + CUresult _status = apiFuncCall; \ + if (_status != CUDA_SUCCESS) { \ + LOG(ERROR) << "Failed invoking CUDA driver function " << #apiFuncCall \ + << " status = " << _status; \ + exit(-1); \ + } \ } while (0) -#define EXPECT(expr)\ - if (!(expr)) {\ +#define EXPECT(expr) \ + if (!(expr)) { \ }; using namespace KINETO_NAMESPACE; @@ -161,16 +160,14 @@ bool runTestWithAutoRange( const std::vector& metricNames, CUcontext cuContext, bool async) { - // create a CUPTI range based profiling profiler // this configures the counter data as well CuptiRangeProfilerOptions opts{ - .metricNames = metricNames, - .deviceId = deviceNum, - .maxRanges = 2, - .numNestingLevels = 1, - .cuContext = async ? nullptr : cuContext - }; + .metricNames = metricNames, + .deviceId = deviceNum, + .maxRanges = 2, + .numNestingLevels = 1, + .cuContext = async ? nullptr : cuContext}; CuptiRBProfilerSession profiler(opts); CUpti_ProfilerRange profilerRange = CUPTI_AutoRange; @@ -209,7 +206,7 @@ bool runTestWithAutoRange( // each kernel has 50000 dadd ops EXPECT_EQ(measurement.values[1], 50000); // sm__inst_executed_pipe_tensor.sum - //EXPECT_EQ(measurement.values[2], 0); + // EXPECT_EQ(measurement.values[2], 0); } } return true; @@ -220,16 +217,14 @@ bool runTestWithUserRange( const std::vector& metricNames, CUcontext cuContext, bool async = false) { - // create a CUPTI range based profiling profiler // this configures the counter data as well CuptiRangeProfilerOptions opts{ - .metricNames = metricNames, - .deviceId = deviceNum, - .maxRanges = numRanges, - .numNestingLevels = 1, - .cuContext = async ? 
nullptr : cuContext - }; + .metricNames = metricNames, + .deviceId = deviceNum, + .maxRanges = numRanges, + .numNestingLevels = 1, + .cuContext = async ? nullptr : cuContext}; CuptiRBProfilerSession profiler(opts); CUpti_ProfilerRange profilerRange = CUPTI_UserRange; @@ -285,7 +280,7 @@ bool runTestWithUserRange( EXPECT_EQ(measurement.values[1], 100000); } // sm__inst_executed_pipe_tensor.sum - //EXPECT_EQ(measurement.values[2], 0); + // EXPECT_EQ(measurement.values[2], 0); } } return true; @@ -293,7 +288,6 @@ bool runTestWithUserRange( #endif // HAS_CUPTI_RANGE_PROFILER int main(int argc, char* argv[]) { - CUdevice cuDevice; int deviceCount, deviceNum; @@ -326,10 +320,12 @@ int main(int argc, char* argv[]) { cuDevice)); LOG(INFO) << "Compute Cabapbility = " - << fmt::format("{},{}",computeCapabilityMajor, computeCapabilityMinor); + << fmt::format( + "{},{}", computeCapabilityMajor, computeCapabilityMinor); if (computeCapabilityMajor < 7) { - LOG(ERROR) << "CUPTI Profiler is not supported with compute capability < 7.0"; + LOG(ERROR) + << "CUPTI Profiler is not supported with compute capability < 7.0"; return -2; } @@ -337,9 +333,9 @@ int main(int argc, char* argv[]) { // metrics to profile std::vector metricNames = { - "smsp__warps_launched.avg", - "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", - "sm__inst_executed_pipe_tensor.sum", + "smsp__warps_launched.avg", + "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", + "sm__inst_executed_pipe_tensor.sum", }; CUcontext cuContext; @@ -366,6 +362,5 @@ int main(int argc, char* argv[]) { #endif // HAS_CUPTI_RANGE_PROFILER DRIVER_API_CALL(cuCtxDestroy(cuContext)); - return 0; } diff --git a/libkineto/test/CuptiRangeProfilerApiTest.cpp b/libkineto/test/CuptiRangeProfilerApiTest.cpp index 71f57fa9a..1865a5ccb 100644 --- a/libkineto/test/CuptiRangeProfilerApiTest.cpp +++ b/libkineto/test/CuptiRangeProfilerApiTest.cpp @@ -12,12 +12,12 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "include/libkineto.h" #include "include/Config.h" +#include "include/libkineto.h" #include "src/CuptiRangeProfilerApi.h" -#include "src/Logger.h" #include "CuptiRangeProfilerTestUtil.h" +#include "src/Logger.h" using namespace KINETO_NAMESPACE; @@ -32,8 +32,7 @@ MockCuptiRBProfilerSession::getResults() { MockCuptiRBProfilerSessionFactory mfactory{}; TEST(CuptiRangeProfilerApiTest, contextTracking) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); + std::vector log_modules({"CuptiRangeProfilerApi.cpp"}); SET_LOG_VERBOSITY_LEVEL(1, log_modules); std::array data; @@ -56,19 +55,16 @@ TEST(CuptiRangeProfilerApiTest, contextTracking) { simulateCudaContextDestroy(contexts[1], 1); EXPECT_EQ( - CuptiRBProfilerSession::getActiveDevices(), - std::set({0, 2})); + CuptiRBProfilerSession::getActiveDevices(), std::set({0, 2})); simulateCudaContextDestroy(contexts[0], 0); simulateCudaContextDestroy(contexts[2], 2); - EXPECT_TRUE( - CuptiRBProfilerSession::getActiveDevices().empty()); + EXPECT_TRUE(CuptiRBProfilerSession::getActiveDevices().empty()); } TEST(CuptiRangeProfilerApiTest, asyncLaunchUserRange) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); + std::vector log_modules({"CuptiRangeProfilerApi.cpp"}); SET_LOG_VERBOSITY_LEVEL(1, log_modules); // this is bad but the pointer is never accessed @@ -76,11 +72,11 @@ TEST(CuptiRangeProfilerApiTest, asyncLaunchUserRange) { simulateCudaContextCreate(ctx0, 0 /*device_id*/); CuptiRangeProfilerOptions opts{ - .metricNames = {"metricNames"}, - .deviceId = 0, - 
.maxRanges = 1, - .numNestingLevels = 1, - .cuContext = ctx0 }; + .metricNames = {"metricNames"}, + .deviceId = 0, + .maxRanges = 1, + .numNestingLevels = 1, + .cuContext = ctx0}; std::unique_ptr session_ = mfactory.make(opts); auto session = mfactory.asDerived(session_.get()); @@ -102,8 +98,7 @@ TEST(CuptiRangeProfilerApiTest, asyncLaunchUserRange) { } TEST(CuptiRangeProfilerApiTest, asyncLaunchAutoRange) { - std::vector log_modules( - {"CuptiRangeProfilerApi.cpp"}); + std::vector log_modules({"CuptiRangeProfilerApi.cpp"}); SET_LOG_VERBOSITY_LEVEL(1, log_modules); // this is bad but the pointer is never accessed @@ -113,11 +108,11 @@ TEST(CuptiRangeProfilerApiTest, asyncLaunchAutoRange) { simulateCudaContextCreate(ctx0, 0 /*device_id*/); CuptiRangeProfilerOptions opts{ - .metricNames = {"metricNames"}, - .deviceId = 0, - .maxRanges = 1, - .numNestingLevels = 1, - .cuContext = ctx0 }; + .metricNames = {"metricNames"}, + .deviceId = 0, + .maxRanges = 1, + .numNestingLevels = 1, + .cuContext = ctx0}; std::unique_ptr session_ = mfactory.make(opts); auto session = mfactory.asDerived(session_.get()); @@ -141,7 +136,7 @@ TEST(CuptiRangeProfilerApiTest, asyncLaunchAutoRange) { EXPECT_EQ( session->getKernelNames(), std::vector({"hello", "foo", "bar"})) - << "Kernel names were not tracked"; + << "Kernel names were not tracked"; } #endif // HAS_CUPTI_RANGE_PROFILER diff --git a/libkineto/test/CuptiRangeProfilerConfigTest.cpp b/libkineto/test/CuptiRangeProfilerConfigTest.cpp index cb8ec2946..f21fbfd06 100644 --- a/libkineto/test/CuptiRangeProfilerConfigTest.cpp +++ b/libkineto/test/CuptiRangeProfilerConfigTest.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#include "include/Config.h" #include "src/CuptiRangeProfilerConfig.h" +#include "include/Config.h" #include #include @@ -27,24 +27,23 @@ class CuptiRangeProfilerConfigTest : public ::testing::Test { TEST_F(CuptiRangeProfilerConfigTest, ConfigureProfiler) { Config cfg; std::vector metrics = { - "kineto__cuda_core_flops", - "sm__inst_executed.sum", - "l1tex__data_bank_conflicts_pipe_lsu.sum", + "kineto__cuda_core_flops", + "sm__inst_executed.sum", + "l1tex__data_bank_conflicts_pipe_lsu.sum", }; auto metricsConfigStr = - fmt::format("CUPTI_PROFILER_METRICS = {}", fmt::join(metrics, ",")); + fmt::format("CUPTI_PROFILER_METRICS = {}", fmt::join(metrics, ",")); EXPECT_TRUE(cfg.parse(metricsConfigStr)); EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_MAX_RANGES = 42")); const CuptiRangeProfilerConfig& cupti_cfg = - CuptiRangeProfilerConfig::get(cfg); + CuptiRangeProfilerConfig::get(cfg); EXPECT_EQ(cupti_cfg.activitiesCuptiMetrics(), metrics); EXPECT_EQ(cupti_cfg.cuptiProfilerPerKernel(), true); EXPECT_EQ(cupti_cfg.cuptiProfilerMaxRanges(), 42); - } TEST_F(CuptiRangeProfilerConfigTest, RangesDefaults) { @@ -56,7 +55,8 @@ TEST_F(CuptiRangeProfilerConfigTest, RangesDefaults) { cfg.setSignalDefaults(); - EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); + EXPECT_TRUE( + cfg_auto.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); cfg_auto.setClientDefaults(); @@ -64,10 +64,13 @@ TEST_F(CuptiRangeProfilerConfigTest, RangesDefaults) { int user_ranges, auto_ranges; user_ranges = CuptiRangeProfilerConfig::get(cfg).cuptiProfilerMaxRanges(); - auto_ranges = CuptiRangeProfilerConfig::get(cfg_auto).cuptiProfilerMaxRanges(); + auto_ranges = + 
CuptiRangeProfilerConfig::get(cfg_auto).cuptiProfilerMaxRanges(); - EXPECT_GE(user_ranges, 1) << " in user range mode default to at least 1 ranges"; - EXPECT_GE(auto_ranges, 1000) << " in auto range mode default to at least 1000 ranges"; + EXPECT_GE(user_ranges, 1) + << " in user range mode default to at least 1 ranges"; + EXPECT_GE(auto_ranges, 1000) + << " in auto range mode default to at least 1000 ranges"; EXPECT_GT(auto_ranges, user_ranges); } diff --git a/libkineto/test/CuptiRangeProfilerTest.cpp b/libkineto/test/CuptiRangeProfilerTest.cpp index e0ca7965c..fe8d25ad3 100644 --- a/libkineto/test/CuptiRangeProfilerTest.cpp +++ b/libkineto/test/CuptiRangeProfilerTest.cpp @@ -11,9 +11,9 @@ #include #ifdef __linux__ +#include #include #include -#include #endif #include @@ -21,15 +21,15 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "include/libkineto.h" #include "include/Config.h" +#include "include/libkineto.h" #include "include/output_base.h" #include "src/ActivityTrace.h" -#include "src/CuptiRangeProfilerConfig.h" #include "src/CuptiRangeProfiler.h" +#include "src/CuptiRangeProfilerConfig.h" +#include "src/Logger.h" #include "src/output_json.h" #include "src/output_membuf.h" -#include "src/Logger.h" #include "CuptiRangeProfilerTestUtil.h" @@ -43,10 +43,8 @@ MockCuptiRBProfilerSession::getResults() { return results; } -static std::vector kCtx0Kernels = { - "foo", "bar", "baz"}; -static std::vector kCtx1Kernels = { - "mercury", "venus", "earth"}; +static std::vector kCtx0Kernels = {"foo", "bar", "baz"}; +static std::vector kCtx1Kernels = {"mercury", "venus", "earth"}; static auto getActivityTypes() { static std::set activity_types_{libkineto::ActivityType::CUDA_PROFILER_RANGE}; @@ -82,7 +80,7 @@ class CuptiRangeProfilerTest : public ::testing::Test { // used for logging to a file loggerFactory.addProtocol("file", [](const std::string& url) { - return std::unique_ptr(new ChromeTraceLogger(url)); + return std::unique_ptr(new ChromeTraceLogger(url)); }); } @@ -92,11 +90,12 @@ class CuptiRangeProfilerTest : public ::testing::Test { } void setupConfig(const std::vector& metrics, bool per_kernel) { - std::string config_str = fmt::format("ACTIVITIES_WARMUP_PERIOD_SECS=0\n " - "CUPTI_PROFILER_METRICS={}\n " - "CUPTI_PROFILER_ENABLE_PER_KERNEL={}", - fmt::join(metrics, ","), - (per_kernel ? "true" : "false")); + std::string config_str = fmt::format( + "ACTIVITIES_WARMUP_PERIOD_SECS=0\n " + "CUPTI_PROFILER_METRICS={}\n " + "CUPTI_PROFILER_ENABLE_PER_KERNEL={}", + fmt::join(metrics, ","), + (per_kernel ? 
"true" : "false")); cfg_ = std::make_unique(); cfg_->parse(config_str); @@ -133,11 +132,11 @@ class CuptiRangeProfilerTest : public ::testing::Test { // sets up mock results returned by Mock CUPTI interface for (const auto& k : kCtx0Kernels) { results_[0].rangeVals.emplace_back( - CuptiRangeMeasurement{k, measurements_}); + CuptiRangeMeasurement{k, measurements_}); } for (const auto& k : kCtx1Kernels) { results_[1].rangeVals.emplace_back( - CuptiRangeMeasurement{k, measurements_}); + CuptiRangeMeasurement{k, measurements_}); } } @@ -146,8 +145,8 @@ class CuptiRangeProfilerTest : public ::testing::Test { ActivityLoggerFactory loggerFactory; std::vector measurements_; - std::unordered_map& results_ - = MockCuptiRBProfilerSession::getResults(); + std::unordered_map& results_ = + MockCuptiRBProfilerSession::getResults(); CUcontext ctx0_, ctx1_; }; @@ -157,8 +156,8 @@ void checkMetrics( const std::string& metadataJson) { for (const auto& m : metrics) { EXPECT_NE(metadataJson.find(m), std::string::npos) - << "Could not find metdata on metric " << m - << "\n metadata json = '" << metadataJson << "'"; + << "Could not find metdata on metric " << m << "\n metadata json = '" + << metadataJson << "'"; } } @@ -183,19 +182,17 @@ void saveTrace(ActivityTrace& /*trace*/) { } TEST_F(CuptiRangeProfilerTest, BasicTest) { - EXPECT_NE(profiler_->name().size(), 0); EXPECT_EQ(profiler_->availableActivities(), getActivityTypes()); std::set incorrect_act_types{ - ActivityType::CUDA_RUNTIME, ActivityType::CONCURRENT_KERNEL}; + ActivityType::CUDA_RUNTIME, ActivityType::CONCURRENT_KERNEL}; cfg_ = std::make_unique(); cfg_->setClientDefaults(); cfg_->setSelectedActivityTypes({}); - EXPECT_EQ( - profiler_->configure(incorrect_act_types, *cfg_).get(), nullptr) - << "Profiler config should fail for wrong activity type"; + EXPECT_EQ(profiler_->configure(incorrect_act_types, *cfg_).get(), nullptr) + << "Profiler config should fail for wrong activity type"; incorrect_act_types.insert(ActivityType::CUDA_PROFILER_RANGE); @@ -203,17 +200,15 @@ TEST_F(CuptiRangeProfilerTest, BasicTest) { cfg_->setClientDefaults(); cfg_->setSelectedActivityTypes(incorrect_act_types); - EXPECT_EQ( - profiler_->configure(incorrect_act_types, *cfg_).get(), nullptr) - << "Profiler config should fail if the activity types is not exclusively" - << " CUDA_PROFILER_RANGE"; + EXPECT_EQ(profiler_->configure(incorrect_act_types, *cfg_).get(), nullptr) + << "Profiler config should fail if the activity types is not exclusively" + << " CUDA_PROFILER_RANGE"; } TEST_F(CuptiRangeProfilerTest, UserRangeTest) { - std::vector metrics{ - "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", - "sm__inst_executed_pipe_tensor.sum", + "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", + "sm__inst_executed_pipe_tensor.sum", }; setupConfig(metrics, false /*per_kernel*/); @@ -248,10 +243,9 @@ TEST_F(CuptiRangeProfilerTest, UserRangeTest) { } TEST_F(CuptiRangeProfilerTest, AutoRangeTest) { - std::vector metrics{ - "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", - "sm__inst_executed_pipe_tensor.sum", + "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum", + "sm__inst_executed_pipe_tensor.sum", }; int kernel_count = 0; @@ -305,7 +299,7 @@ TEST_F(CuptiRangeProfilerTest, AutoRangeTest) { saveTrace(trace); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); CuptiRangeProfilerConfig::registerFactory(); return RUN_ALL_TESTS(); diff --git a/libkineto/test/CuptiRangeProfilerTestUtil.h 
b/libkineto/test/CuptiRangeProfilerTestUtil.h index b2f491582..4a3761307 100644 --- a/libkineto/test/CuptiRangeProfilerTestUtil.h +++ b/libkineto/test/CuptiRangeProfilerTestUtil.h @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include -#include // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude @@ -21,7 +21,7 @@ namespace KINETO_NAMESPACE { class MockCuptiRBProfilerSession : public CuptiRBProfilerSession { public: explicit MockCuptiRBProfilerSession(const CuptiRangeProfilerOptions& opts) - : CuptiRBProfilerSession(opts) {} + : CuptiRBProfilerSession(opts) {} void beginPass() override { LOG(INFO) << " Mock CUPTI begin pass"; @@ -59,7 +59,7 @@ class MockCuptiRBProfilerSession : public CuptiRBProfilerSession { return getResults()[deviceId()]; } -protected: + protected: void startInternal( CUpti_ProfilerRange profilerRange, CUpti_ProfilerReplayMode profilerReplayMode) override { @@ -68,7 +68,7 @@ class MockCuptiRBProfilerSession : public CuptiRBProfilerSession { curReplay_ = profilerReplayMode; } -private: + private: void runChecks() { EXPECT_EQ(passes_started, passes_ended); EXPECT_EQ(ranges_started, ranges_ended); @@ -98,8 +98,7 @@ struct MockCuptiRBProfilerSessionFactory : ICuptiRBProfilerSessionFactory { }; inline void simulateCudaContextCreate(CUcontext context, uint32_t dev) { - testing::trackCudaCtx( - context, dev, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); + testing::trackCudaCtx(context, dev, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); } inline void simulateCudaContextDestroy(CUcontext context, uint32_t dev) { @@ -108,7 +107,8 @@ inline void simulateCudaContextDestroy(CUcontext context, uint32_t dev) { } inline void simulateKernelLaunch( - CUcontext context, const std::string& kernelName) { + CUcontext context, + const std::string& kernelName) { testing::trackCudaKernelLaunch(context, kernelName.c_str()); } diff --git a/libkineto/test/CuptiStringsTest.cpp b/libkineto/test/CuptiStringsTest.cpp index 0546babd9..f9f4abb3e 100644 --- a/libkineto/test/CuptiStringsTest.cpp +++ b/libkineto/test/CuptiStringsTest.cpp @@ -13,16 +13,16 @@ using namespace KINETO_NAMESPACE; TEST(CuptiStringsTest, Valid) { - ASSERT_STREQ( - runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_INVALID), "INVALID"); + ASSERT_STREQ(runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_INVALID), "INVALID"); ASSERT_STREQ( runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020), "cudaDriverGetVersion"); - ASSERT_STREQ(runtimeCbidName - (CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020), + ASSERT_STREQ( + runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020), "cudaDeviceSynchronize"); ASSERT_STREQ( - runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000), + runtimeCbidName( + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000), "cudaStreamSetAttribute_ptsz"); #if defined(CUPTI_API_VERSION) && CUPTI_API_VERSION >= 18 ASSERT_STREQ( diff --git a/libkineto/test/EventProfilerTest.cpp b/libkineto/test/EventProfilerTest.cpp index ab9d989c1..bdb674a17 100644 --- a/libkineto/test/EventProfilerTest.cpp +++ b/libkineto/test/EventProfilerTest.cpp @@ -16,10 +16,11 @@ using namespace std::chrono; using namespace KINETO_NAMESPACE; TEST(PercentileTest, Create) { - PercentileList pct = {{10, SampleValue(0)}, - {49, SampleValue(0)}, - {50, SampleValue(0)}, - {90, SampleValue(0)}}; + PercentileList pct = { + {10, SampleValue(0)}, + {49, SampleValue(0)}, + {50, SampleValue(0)}, + {90, SampleValue(0)}}; percentiles({0, 10, 20, 30, 40, 
50, 60, 70, 80, 90, 100}, pct); EXPECT_EQ(pct[0].second.getInt(), 10); @@ -294,7 +295,9 @@ TEST(EventGroupSetTest, CollectSample) { class MockLogger : public SampleListener { public: - MOCK_METHOD3(handleSample, void(int device, const Sample& sample, bool from_new_version)); + MOCK_METHOD3( + handleSample, + void(int device, const Sample& sample, bool from_new_version)); MOCK_METHOD1(update, void(const Config& config)); }; @@ -554,7 +557,9 @@ TEST_F(EventProfilerTest, ReportSample) { auto& logger = dynamic_cast(*loggers_[0]); EXPECT_CALL(logger, handleSample(0, _, _)) .Times(1) - .WillOnce(Invoke([](int device, const Sample& sample, bool from_new_version) { + .WillOnce(Invoke([](int device, + const Sample& sample, + bool from_new_version) { // Sample will include all stats - logger must pick the // ones it wants. EXPECT_EQ(sample.stats.size(), 4); diff --git a/libkineto/test/LoggerObserverTest.cpp b/libkineto/test/LoggerObserverTest.cpp index cc2aca143..8895715f8 100644 --- a/libkineto/test/LoggerObserverTest.cpp +++ b/libkineto/test/LoggerObserverTest.cpp @@ -11,9 +11,9 @@ // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "LoggerCollector.h" #include "include/libkineto.h" #include "src/Logger.h" -#include "LoggerCollector.h" using namespace KINETO_NAMESPACE; @@ -25,7 +25,8 @@ constexpr char ErrorTestStr[] = "Checking LOG(ERROR)"; TEST(LoggerObserverTest, SingleCollectorObserver) { // Add a LoggerObserverCollector to collect all logs during the trace. - std::unique_ptr lCollector = std::make_unique(); + std::unique_ptr lCollector = + std::make_unique(); Logger::addLoggerObserver(lCollector.get()); LOG(INFO) << InfoTestStr; @@ -33,9 +34,15 @@ TEST(LoggerObserverTest, SingleCollectorObserver) { LOG(ERROR) << ErrorTestStr; auto LoggerMD = lCollector->extractCollectorMetadata(); - EXPECT_TRUE(LoggerMD[LoggerOutputType::INFO][0].find(InfoTestStr) != std::string::npos); - EXPECT_TRUE(LoggerMD[LoggerOutputType::WARNING][0].find(WarningTestStr) != std::string::npos); - EXPECT_TRUE(LoggerMD[LoggerOutputType::ERROR][0].find(ErrorTestStr) != std::string::npos); + EXPECT_TRUE( + LoggerMD[LoggerOutputType::INFO][0].find(InfoTestStr) != + std::string::npos); + EXPECT_TRUE( + LoggerMD[LoggerOutputType::WARNING][0].find(WarningTestStr) != + std::string::npos); + EXPECT_TRUE( + LoggerMD[LoggerOutputType::ERROR][0].find(ErrorTestStr) != + std::string::npos); Logger::removeLoggerObserver(lCollector.get()); } @@ -43,10 +50,10 @@ TEST(LoggerObserverTest, SingleCollectorObserver) { #define NUM_OF_MESSAGES_FOR_EACH_TYPE 10 #define NUM_OF_WRITE_THREADS 200 -// Writes NUM_OF_MESSAGES_FOR_EACH_TYPE messages for each INFO, WARNING, and ERROR. -// NOLINTNEXTLINE(clang-diagnostic-unused-parameter) +// Writes NUM_OF_MESSAGES_FOR_EACH_TYPE messages for each INFO, WARNING, and +// ERROR. 
NOLINTNEXTLINE(clang-diagnostic-unused-parameter) void* writeSeveralMessages(void* ptr) { - for(int i=0; i supported_activities {ActivityType::CPU_OP}; +const std::set supported_activities{ActivityType::CPU_OP}; const std::string profile_name{"MockProfiler"}; void MockProfilerSession::processTrace(ActivityLogger& logger) { - for (const auto& activity: test_activities_) { + for (const auto& activity : test_activities_) { activity.log(logger); } } @@ -31,27 +31,28 @@ const std::string& MockActivityProfiler::name() const { return profile_name; } -const std::set& MockActivityProfiler::availableActivities() const { +const std::set& MockActivityProfiler::availableActivities() + const { return supported_activities; } MockActivityProfiler::MockActivityProfiler( - std::deque& activities) : - test_activities_(activities) {} + std::deque& activities) + : test_activities_(activities) {} std::unique_ptr MockActivityProfiler::configure( - const std::set& /*activity_types*/, - const Config& /*config*/) { + const std::set& /*activity_types*/, + const Config& /*config*/) { auto session = std::make_unique(); - session->set_test_activities(std::move(test_activities_)); + session->set_test_activities(std::move(test_activities_)); return session; } std::unique_ptr MockActivityProfiler::configure( - int64_t /*ts_ms*/, - int64_t /*duration_ms*/, - const std::set& activity_types, - const Config& config) { + int64_t /*ts_ms*/, + int64_t /*duration_ms*/, + const std::set& activity_types, + const Config& config) { return configure(activity_types, config); } diff --git a/libkineto/test/MockActivitySubProfiler.h b/libkineto/test/MockActivitySubProfiler.h index 3ffe13c2b..7405a1781 100644 --- a/libkineto/test/MockActivitySubProfiler.h +++ b/libkineto/test/MockActivitySubProfiler.h @@ -8,59 +8,57 @@ #pragma once +#include #include #include -#include #include "include/IActivityProfiler.h" #include "output_base.h" namespace libkineto { -class MockProfilerSession: public IActivityProfilerSession { +class MockProfilerSession : public IActivityProfilerSession { + public: + explicit MockProfilerSession() {} - public: - explicit MockProfilerSession() {} + void start() override { + start_count++; + status_ = TraceStatus::RECORDING; + } - void start() override { - start_count++; - status_ = TraceStatus::RECORDING; - } + void stop() override { + stop_count++; + status_ = TraceStatus::PROCESSING; + } - void stop() override { - stop_count++; - status_ = TraceStatus::PROCESSING; - } + std::vector errors() override { + return {}; + } - std::vector errors() override { - return {}; - } + void processTrace(ActivityLogger& logger) override; - void processTrace(ActivityLogger& logger) override; + void set_test_activities(std::deque&& acs) { + test_activities_ = std::move(acs); + } - void set_test_activities(std::deque&& acs) { - test_activities_ = std::move(acs); - } + std::unique_ptr getTraceBuffer() override; - std::unique_ptr getTraceBuffer() override; + std::unique_ptr getDeviceInfo() override { + return {}; + } - std::unique_ptr getDeviceInfo() override { - return {}; - } + std::vector getResourceInfos() override { + return {}; + } - std::vector getResourceInfos() override { - return {}; - } + int start_count = 0; + int stop_count = 0; - int start_count = 0; - int stop_count = 0; - private: - std::deque test_activities_; + private: + std::deque test_activities_; }; - -class MockActivityProfiler: public IActivityProfiler { - +class MockActivityProfiler : public IActivityProfiler { public: explicit MockActivityProfiler(std::deque& 
activities); diff --git a/libkineto/test/RoctracerActivityProfilerTest.cpp b/libkineto/test/RoctracerActivityProfilerTest.cpp index 006233a17..f4a7203b1 100644 --- a/libkineto/test/RoctracerActivityProfilerTest.cpp +++ b/libkineto/test/RoctracerActivityProfilerTest.cpp @@ -22,9 +22,9 @@ #endif #include "include/Config.h" +#include "include/libkineto.h" #include "include/output_base.h" #include "include/time_since_epoch.h" -#include "include/libkineto.h" #include "src/ActivityTrace.h" #include "src/CuptiActivityProfiler.h" #include "src/RoctracerActivityApi.h" @@ -102,102 +102,112 @@ struct MockRoctracerLogger { } void addRuntimeKernelActivity( - uint32_t cid, int64_t start_ns, int64_t end_ns, int64_t correlation) { + uint32_t cid, + int64_t start_ns, + int64_t end_ns, + int64_t correlation) { roctracerKernelRow* row = new roctracerKernelRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - cid, - processId(), - systemThreadId(), - start_ns, - end_ns, - nullptr, - nullptr, - 0,0,0,0,0,0,0,0 - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + cid, + processId(), + systemThreadId(), + start_ns, + end_ns, + nullptr, + nullptr, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0); activities_.push_back(row); } void addRuntimeMallocActivity( - uint32_t cid, int64_t start_ns, int64_t end_ns, int64_t correlation) { + uint32_t cid, + int64_t start_ns, + int64_t end_ns, + int64_t correlation) { roctracerMallocRow* row = new roctracerMallocRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - cid, - processId(), - systemThreadId(), - start_ns, - end_ns, - nullptr, - 1 - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + cid, + processId(), + systemThreadId(), + start_ns, + end_ns, + nullptr, + 1); activities_.push_back(row); } void addRuntimeCopyActivity( - uint32_t cid, int64_t start_ns, int64_t end_ns, int64_t correlation) { + uint32_t cid, + int64_t start_ns, + int64_t end_ns, + int64_t correlation) { roctracerCopyRow* row = new roctracerCopyRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - cid, - processId(), - systemThreadId(), - start_ns, - end_ns, - nullptr, - nullptr, - 1, - hipMemcpyHostToHost, - static_cast(0) - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + cid, + processId(), + systemThreadId(), + start_ns, + end_ns, + nullptr, + nullptr, + 1, + hipMemcpyHostToHost, + static_cast(0)); activities_.push_back(row); } - void addKernelActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { + void + addKernelActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { roctracerAsyncRow* row = new roctracerAsyncRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - HIP_OP_DISPATCH_KIND_KERNEL_, - 0, - 0, - 1, - start_ns, - end_ns, - std::string("kernel") - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + HIP_OP_DISPATCH_KIND_KERNEL_, + 0, + 0, + 1, + start_ns, + end_ns, + std::string("kernel")); activities_.push_back(row); } - void addMemcpyH2DActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { + void + addMemcpyH2DActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { roctracerAsyncRow* row = new roctracerAsyncRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - HIP_OP_COPY_KIND_HOST_TO_DEVICE_, - 0, - 0, - 2, - start_ns, - end_ns, - std::string() - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + HIP_OP_COPY_KIND_HOST_TO_DEVICE_, + 0, + 0, + 2, + start_ns, + end_ns, + std::string()); activities_.push_back(row); } - void addMemcpyD2HActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { + void + addMemcpyD2HActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { 
roctracerAsyncRow* row = new roctracerAsyncRow( - correlation, - ACTIVITY_DOMAIN_HIP_API, - HIP_OP_COPY_KIND_DEVICE_TO_HOST_, - 0, - 0, - 2, - start_ns, - end_ns, - std::string() - ); + correlation, + ACTIVITY_DOMAIN_HIP_API, + HIP_OP_COPY_KIND_DEVICE_TO_HOST_, + 0, + 0, + 2, + start_ns, + end_ns, + std::string()); activities_.push_back(row); } @@ -210,7 +220,8 @@ struct MockRoctracerLogger { } std::vector activities_; - std::vector> externalCorrelations_[RoctracerLogger::CorrelationDomain::size]; + std::vector> + externalCorrelations_[RoctracerLogger::CorrelationDomain::size]; }; // Mock parts of the RoctracerActivityApi @@ -218,16 +229,23 @@ class MockRoctracerActivities : public RoctracerActivityApi { public: virtual int processActivities( std::function handler, - std::function correlationHandler) override { + std::function< + void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> + correlationHandler) override { int count = 0; - for (int it = RoctracerLogger::CorrelationDomain::begin; it < RoctracerLogger::CorrelationDomain::end; ++it) { - auto &externalCorrelations = activityLogger->externalCorrelations_[it]; - for (auto &item : externalCorrelations) { - correlationHandler(item.first, item.second, static_cast(it)); + for (int it = RoctracerLogger::CorrelationDomain::begin; + it < RoctracerLogger::CorrelationDomain::end; + ++it) { + auto& externalCorrelations = activityLogger->externalCorrelations_[it]; + for (auto& item : externalCorrelations) { + correlationHandler( + item.first, + item.second, + static_cast(it)); } externalCorrelations.clear(); } - for (auto &item : activityLogger->activities_) { + for (auto& item : activityLogger->activities_) { handler(item); ++count; } @@ -279,7 +297,8 @@ TEST_F(RoctracerActivityProfilerTest, SyncTrace) { // Start and stop profiling CuptiActivityProfiler profiler(roctracerActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -300,11 +319,16 @@ TEST_F(RoctracerActivityProfilerTest, SyncTrace) { // And some CPU runtime ops, and GPU ops auto gpuOps = std::make_unique(); - gpuOps->addRuntimeKernelActivity(HIP_LAUNCH_KERNEL, start_time_ns + 33, start_time_ns + 38, 1); - gpuOps->addRuntimeCopyActivity(HIP_MEMCPY, start_time_ns + 110, start_time_ns + 120, 2); - gpuOps->addRuntimeKernelActivity(HIP_LAUNCH_KERNEL, start_time_ns + 130, start_time_ns + 145, 3); - gpuOps->addRuntimeCopyActivity(HIP_MEMCPY, start_time_ns + 165, start_time_ns + 175, 4); - gpuOps->addRuntimeKernelActivity(HIP_LAUNCH_KERNEL, start_time_ns + 195, start_time_ns + 205, 5); + gpuOps->addRuntimeKernelActivity( + HIP_LAUNCH_KERNEL, start_time_ns + 33, start_time_ns + 38, 1); + gpuOps->addRuntimeCopyActivity( + HIP_MEMCPY, start_time_ns + 110, start_time_ns + 120, 2); + gpuOps->addRuntimeKernelActivity( + HIP_LAUNCH_KERNEL, start_time_ns + 130, start_time_ns + 145, 3); + gpuOps->addRuntimeCopyActivity( + HIP_MEMCPY, start_time_ns + 165, start_time_ns + 175, 4); + gpuOps->addRuntimeKernelActivity( + HIP_LAUNCH_KERNEL, start_time_ns + 195, start_time_ns + 205, 5); gpuOps->addKernelActivity(start_time_ns + 50, start_time_ns + 70, 1); gpuOps->addMemcpyH2DActivity(start_time_ns + 140, start_time_ns + 150, 2); gpuOps->addKernelActivity(start_time_ns + 160, start_time_ns + 220, 3); @@ -375,7 +399,8 @@ 
TEST_F(RoctracerActivityProfilerTest, GpuNCCLCollectiveTest) { // Start and stop profiling CuptiActivityProfiler profiler(roctracerActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -387,7 +412,8 @@ TEST_F(RoctracerActivityProfilerTest, GpuNCCLCollectiveTest) { // Prepare metadata map std::unordered_map metadataMap; - metadataMap.emplace(kCollectiveName, fmt::format("\"{}\"", "_allgather_base")); + metadataMap.emplace( + kCollectiveName, fmt::format("\"{}\"", "_allgather_base")); metadataMap.emplace(kDtype, fmt::format("\"{}\"", "Float")); metadataMap.emplace(kInMsgNelems, "65664"); metadataMap.emplace(kOutMsgNelems, "131328"); @@ -456,7 +482,8 @@ TEST_F(RoctracerActivityProfilerTest, GpuNCCLCollectiveTest) { // Set up corresponding GPU events and connect with CPU events // via correlationId auto gpuOps = std::make_unique(); - gpuOps->addCorrelationActivity(1, RoctracerLogger::CorrelationDomain::Domain0, 1); + gpuOps->addCorrelationActivity( + 1, RoctracerLogger::CorrelationDomain::Domain0, 1); gpuOps->addKernelActivity(kernelLaunchTime + 5, kernelLaunchTime + 10, 1); roctracerActivities_.activityLogger = std::move(gpuOps); @@ -485,8 +512,7 @@ TEST_F(RoctracerActivityProfilerTest, GpuNCCLCollectiveTest) { std::vector expectedGroupRanks(kTruncatLength - 1, 0); auto expectedGroupRanksStr = fmt::format( "\"[{}, ..., {}]\"", fmt::join(expectedGroupRanks, ", "), "0"); - EXPECT_EQ( - cpu_op->getMetadataValue(kGroupRanks), expectedGroupRanksStr); + EXPECT_EQ(cpu_op->getMetadataValue(kGroupRanks), expectedGroupRanksStr); #ifdef __linux__ // Test saved output can be loaded as JSON @@ -546,7 +572,8 @@ TEST_F(RoctracerActivityProfilerTest, GpuUserAnnotationTest) { // Start and stop profiling CuptiActivityProfiler profiler(roctracerActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 300; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -565,9 +592,11 @@ TEST_F(RoctracerActivityProfilerTest, GpuUserAnnotationTest) { // set up a couple of GPU events and correlate with above CPU event. // RoctracerLogger::CorrelationDomain::Domain1 is used for user annotations. 
auto gpuOps = std::make_unique(); - gpuOps->addCorrelationActivity(1, RoctracerLogger::CorrelationDomain::Domain1, 1); + gpuOps->addCorrelationActivity( + 1, RoctracerLogger::CorrelationDomain::Domain1, 1); gpuOps->addKernelActivity(kernelLaunchTime + 5, kernelLaunchTime + 10, 1); - gpuOps->addCorrelationActivity(1, RoctracerLogger::CorrelationDomain::Domain1, 1); + gpuOps->addCorrelationActivity( + 1, RoctracerLogger::CorrelationDomain::Domain1, 1); gpuOps->addKernelActivity(kernelLaunchTime + 15, kernelLaunchTime + 25, 1); roctracerActivities_.activityLogger = std::move(gpuOps); @@ -615,7 +644,8 @@ TEST_F(RoctracerActivityProfilerTest, SubActivityProfilers) { ev.device = 1; ev.resource = 0; - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 1000; auto start_time = time_point(nanoseconds(start_time_ns)); @@ -685,7 +715,8 @@ TEST_F(RoctracerActivityProfilerTest, JsonGPUIDSortTest) { // Start and stop profiling CuptiActivityProfiler profiler(roctracerActivities_, /*cpu only*/ false); - int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now()); + int64_t start_time_ns = + libkineto::timeSinceEpoch(std::chrono::system_clock::now()); int64_t duration_ns = 500; auto start_time = time_point(nanoseconds(start_time_ns)); profiler.configure(*cfg_, start_time); @@ -701,7 +732,8 @@ TEST_F(RoctracerActivityProfilerTest, JsonGPUIDSortTest) { // Set up GPU events auto gpuOps = std::make_unique(); - gpuOps->addRuntimeKernelActivity(HIP_LAUNCH_KERNEL, start_time_ns + 23, start_time_ns + 28, 1); + gpuOps->addRuntimeKernelActivity( + HIP_LAUNCH_KERNEL, start_time_ns + 23, start_time_ns + 28, 1); gpuOps->addKernelActivity(start_time_ns + 50, start_time_ns + 70, 1); roctracerActivities_.activityLogger = std::move(gpuOps); @@ -736,11 +768,13 @@ TEST_F(RoctracerActivityProfilerTest, JsonGPUIDSortTest) { std::unordered_map sortLabel; std::unordered_map sortIdx; for (auto& event : jsonData["traceEvents"]) { - if (event["name"] == "process_labels" && event["tid"] == 0 && event["pid"].isInt()) { + if (event["name"] == "process_labels" && event["tid"] == 0 && + event["pid"].isInt()) { sortLabel[event["pid"].asInt()] = event["args"]["labels"].asString(); LOG(INFO) << sortLabel[event["pid"].asInt()]; } - if (event["name"] == "process_sort_index" && event["tid"] == 0 && event["pid"].isInt()) { + if (event["name"] == "process_sort_index" && event["tid"] == 0 && + event["pid"].isInt()) { sortIdx[event["pid"].asInt()] = event["args"]["sort_index"].asInt(); LOG(INFO) << sortIdx[event["pid"].asInt()]; } @@ -748,7 +782,7 @@ TEST_F(RoctracerActivityProfilerTest, JsonGPUIDSortTest) { // Expect atleast 16 GPU nodes, and 1 or more CPU nodes. EXPECT_LE(16, sortLabel.size()); - for (int i = 0; i<16; i++) { + for (int i = 0; i < 16; i++) { // Check there are 16 GPU sorts (0-15) with expected sort_index. EXPECT_EQ("GPU " + std::to_string(i), sortLabel[i]); // sortIndex is gpu + kExceedMaxPid to put GPU tracks at the bottom From e029f96ad2d3c24d74585985a75ecc65ec1cd27a Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Tue, 8 Oct 2024 11:03:36 -0700 Subject: [PATCH 13/16] Fix Clear On Fork (#998) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/998 Threads can have the same PID and TID if forked after they have been cached. 
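As a rough, self-contained POSIX sketch of that failure mode (illustrative names only: cachedPid, cachedProcessId and resetCachedIds are not Kineto's API; the real cached state is the thread-local _pid/_tid/_sysTid cleared by resetTLS() in this diff), a lazily cached id survives fork() in the child unless a fork handler clears it:

// Minimal sketch, not the Kineto implementation. Demonstrates that a
// lazily cached process id is inherited by the child after fork() and
// stays stale unless a pthread_atfork child handler resets the cache.
#include <pthread.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <cstdio>

namespace {
thread_local pid_t cachedPid = 0; // 0 means "not cached yet"

pid_t cachedProcessId() {
  if (cachedPid == 0) {
    cachedPid = getpid(); // cached on first use
  }
  return cachedPid;
}

void resetCachedIds() {
  cachedPid = 0; // force a re-query after fork()
}
} // namespace

int main() {
  // Child-side fork handler, analogous to
  // pthread_atfork(nullptr, nullptr, &resetTLS) added by this diff.
  pthread_atfork(nullptr, nullptr, &resetCachedIds);

  printf("parent sees pid %d\n", static_cast<int>(cachedProcessId()));
  pid_t child = fork();
  if (child == 0) {
    // Without the handler this would still print the parent's pid,
    // because the cached thread-local value is copied into the child.
    printf("child sees pid %d\n", static_cast<int>(cachedProcessId()));
    _exit(0);
  }
  waitpid(child, nullptr, 0);
  return 0;
}
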
This can be dangerous especially if you have already ran a profiler workload and then decide to fork after and run another workload. To get around this, we add a routine to always clear the PID/TID when forking. Also, we make sure that the ConfigLoader will not wait on a thread when destructing since the forked thread will have no spawned child if the configuration has been already enabled. Reviewed By: briancoutinho Differential Revision: D63924780 fbshipit-source-id: 6ba70066ff02a263d85645287f895c999899f06c --- libkineto/include/ThreadUtil.h | 4 ++++ libkineto/src/ConfigLoader.cpp | 4 +++- libkineto/src/ThreadUtil.cpp | 8 ++++++++ libkineto/src/init.cpp | 3 +++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index 4178ae4a1..ca52c6343 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -27,4 +27,8 @@ std::string processName(int32_t pid); // and its parents. std::vector> pidCommandPairsOfAncestors(); +// Resets all cached Thread local state, this must be done on +// forks to prevent stale values from being retained. +void resetTLS(); + } // namespace libkineto diff --git a/libkineto/src/ConfigLoader.cpp b/libkineto/src/ConfigLoader.cpp index 42b0b8163..4fe40c607 100644 --- a/libkineto/src/ConfigLoader.cpp +++ b/libkineto/src/ConfigLoader.cpp @@ -172,7 +172,9 @@ void ConfigLoader::stopThread() { std::lock_guard lock(updateThreadMutex_); updateThreadCondVar_.notify_one(); } - updateThread_->join(); + if (updateThread_->joinable()) { + updateThread_->join(); + } updateThread_ = nullptr; } } diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index f9ec041e2..642401599 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -39,6 +39,14 @@ thread_local int32_t _tid = 0; thread_local int32_t _sysTid = 0; } // namespace +// Resets all cached Thread local state, this must be done on +// forks to prevent stale values from being retained. +void resetTLS() { + _pid = 0; + _tid = 0; + _sysTid = 0; +} + int32_t processId(bool cache) { int32_t pid = 0; if (!_pid) { diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index 6246db8e3..a78474920 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -121,6 +121,9 @@ extern "C" { // Return true if no CUPTI errors occurred during init void libkineto_init(bool cpuOnly, bool logOnError) { + // register fork handler + pthread_atfork(nullptr, nullptr, &resetTLS); + // Start with initializing the log level const char* logLevelEnv = getenv("KINETO_LOG_LEVEL"); if (logLevelEnv) { From 7dcf02a02726b0e4ad6705562861b4aac5d5b707 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Tue, 8 Oct 2024 15:06:17 -0700 Subject: [PATCH 14/16] Revert D63924780: Fix Clear On Fork Differential Revision: D63924780 Original commit changeset: 6ba70066ff02 Original Phabricator Diff: D63924780 fbshipit-source-id: 0932a89ca7c7a7c69ed229de813c23cc4c3ce484 --- libkineto/include/ThreadUtil.h | 4 ---- libkineto/src/ConfigLoader.cpp | 4 +--- libkineto/src/ThreadUtil.cpp | 8 -------- libkineto/src/init.cpp | 3 --- 4 files changed, 1 insertion(+), 18 deletions(-) diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index ca52c6343..4178ae4a1 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -27,8 +27,4 @@ std::string processName(int32_t pid); // and its parents. 
std::vector> pidCommandPairsOfAncestors(); -// Resets all cached Thread local state, this must be done on -// forks to prevent stale values from being retained. -void resetTLS(); - } // namespace libkineto diff --git a/libkineto/src/ConfigLoader.cpp b/libkineto/src/ConfigLoader.cpp index 4fe40c607..42b0b8163 100644 --- a/libkineto/src/ConfigLoader.cpp +++ b/libkineto/src/ConfigLoader.cpp @@ -172,9 +172,7 @@ void ConfigLoader::stopThread() { std::lock_guard lock(updateThreadMutex_); updateThreadCondVar_.notify_one(); } - if (updateThread_->joinable()) { - updateThread_->join(); - } + updateThread_->join(); updateThread_ = nullptr; } } diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index 642401599..f9ec041e2 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -39,14 +39,6 @@ thread_local int32_t _tid = 0; thread_local int32_t _sysTid = 0; } // namespace -// Resets all cached Thread local state, this must be done on -// forks to prevent stale values from being retained. -void resetTLS() { - _pid = 0; - _tid = 0; - _sysTid = 0; -} - int32_t processId(bool cache) { int32_t pid = 0; if (!_pid) { diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index a78474920..6246db8e3 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -121,9 +121,6 @@ extern "C" { // Return true if no CUPTI errors occurred during init void libkineto_init(bool cpuOnly, bool logOnError) { - // register fork handler - pthread_atfork(nullptr, nullptr, &resetTLS); - // Start with initializing the log level const char* logLevelEnv = getenv("KINETO_LOG_LEVEL"); if (logLevelEnv) { From a6febf5d07b0d78d6e9d84ebe9a4f52ec27b0cde Mon Sep 17 00:00:00 2001 From: Nikita Lutsenko Date: Wed, 9 Oct 2024 09:10:29 -0700 Subject: [PATCH 15/16] kineto | Fix 'deprecated-dynamic-exception-spec' warning on Android. Summary: cxxabi.h from LLVM actually violates this warning, fix it. Reviewed By: ChristianK275 Differential Revision: D64083890 fbshipit-source-id: 720c93f4d7d7c2dc7edb67613c066a30daa46aa5 --- libkineto/src/Demangle.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libkineto/src/Demangle.cpp b/libkineto/src/Demangle.cpp index 9ae7537b5..3f511ff60 100644 --- a/libkineto/src/Demangle.cpp +++ b/libkineto/src/Demangle.cpp @@ -9,8 +9,17 @@ #include "Demangle.h" #ifndef _MSC_VER + +#if defined(__clang__) +_Pragma("GCC diagnostic push"); +_Pragma("GCC diagnostic ignored \"-Wdeprecated-dynamic-exception-spec\""); +#endif #include +#if defined(__clang__) +_Pragma("GCC diagnostic pop"); #endif +#endif // _MSC_VER + #include #include From 00a00e0b1a00b674172b9a13e7b386d8c2964c7d Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Thu, 10 Oct 2024 12:44:52 -0700 Subject: [PATCH 16/16] Reset TLS on Profiling Entrance (#999) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/999 D63924780 broke some tests because of pthread_atfork having strange properties with subsequent calls. To fix this, lets deviate from this method and just reset the TLS whenever we enter the a profiling context. This will ensure that we will start "fresh" for all the PID/TID related content upon every profile. The drawbacks are: 1. 1 Extra Cache Miss per Profile - This is negligible because the cache miss is during the prepare stage for auto-trace and schedule for on-demand. To add to this, the cache miss penalty is very small 2. No Reset if Fork during Profile - If someone were to fork in the middle of a profile the TLS won't get reset. 
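To make drawback 2 concrete, here is a small sketch reusing the same illustrative caching scheme as the earlier sketch (scheduleProfile, cachedProcessId and resetCachedIds are placeholder names, not the Kineto API; the real entry point is ActivityProfilerProxy::scheduleTrace() calling resetTLS(), as in the diff below): the child keeps the parent's cached pid until the next profile entrance.

// Sketch of the reset-on-entrance scheme and of drawback 2 (illustrative
// names only): cached ids are cleared when a profile is scheduled, so a
// fork() during the profile leaves the child with a stale cached pid
// until the next profile entrance clears it.
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <cstdio>

namespace {
thread_local pid_t cachedPid = 0;

pid_t cachedProcessId() {
  if (cachedPid == 0) {
    cachedPid = getpid();
  }
  return cachedPid;
}

void resetCachedIds() {
  cachedPid = 0;
}

void scheduleProfile() {
  resetCachedIds(); // reset the cached TLS at every profiling entrance
  printf("profile scheduled by pid %d\n",
         static_cast<int>(cachedProcessId()));
}
} // namespace

int main() {
  scheduleProfile();    // ids re-cached fresh at profile entrance
  pid_t child = fork(); // fork in the middle of the profile
  if (child == 0) {
    // Drawback 2: the child still reports the parent's cached pid...
    printf("child cached pid %d, real pid %d\n",
           static_cast<int>(cachedProcessId()), static_cast<int>(getpid()));
    scheduleProfile();  // ...until the next profile entrance clears it
    _exit(0);
  }
  waitpid(child, nullptr, 0);
  return 0;
}
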
However, there are many other issues that could happen due to a fork midway through a profile such as undefined behavior with cupti, distorted profiling window etc. We shouldn't worry about this case as of today. Reviewed By: aaronenyeshi, briancoutinho Differential Revision: D64120658 fbshipit-source-id: d7ed8462f76dfe6042f4a9a97979fa5010cccd2e --- libkineto/include/ThreadUtil.h | 4 ++++ libkineto/include/libkineto.h | 4 ++++ libkineto/src/ActivityProfilerProxy.cpp | 2 ++ libkineto/src/ConfigLoader.cpp | 4 +++- libkineto/src/ThreadUtil.cpp | 8 ++++++++ libkineto/src/init.cpp | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index 4178ae4a1..ca52c6343 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -27,4 +27,8 @@ std::string processName(int32_t pid); // and its parents. std::vector> pidCommandPairsOfAncestors(); +// Resets all cached Thread local state, this must be done on +// forks to prevent stale values from being retained. +void resetTLS(); + } // namespace libkineto diff --git a/libkineto/include/libkineto.h b/libkineto/include/libkineto.h index 6fc571b34..a122a77a5 100644 --- a/libkineto/include/libkineto.h +++ b/libkineto/include/libkineto.h @@ -117,6 +117,10 @@ class LibkinetoApi { suppressLibkinetoLogMessages(); } + void resetKinetoTLS() { + resetTLS(); + } + // Provides access to profier configuration manaegement ConfigLoader& configLoader() { return configLoader_; diff --git a/libkineto/src/ActivityProfilerProxy.cpp b/libkineto/src/ActivityProfilerProxy.cpp index 8400fd052..fe17a31b5 100644 --- a/libkineto/src/ActivityProfilerProxy.cpp +++ b/libkineto/src/ActivityProfilerProxy.cpp @@ -12,6 +12,7 @@ #include "ActivityProfilerController.h" #include "Config.h" #include "Logger.h" +#include "ThreadUtil.h" namespace KINETO_NAMESPACE { @@ -31,6 +32,7 @@ void ActivityProfilerProxy::init() { } void ActivityProfilerProxy::scheduleTrace(const std::string& configStr) { + resetTLS(); Config config; config.parse(configStr); controller_->scheduleTrace(config); diff --git a/libkineto/src/ConfigLoader.cpp b/libkineto/src/ConfigLoader.cpp index 42b0b8163..4fe40c607 100644 --- a/libkineto/src/ConfigLoader.cpp +++ b/libkineto/src/ConfigLoader.cpp @@ -172,7 +172,9 @@ void ConfigLoader::stopThread() { std::lock_guard lock(updateThreadMutex_); updateThreadCondVar_.notify_one(); } - updateThread_->join(); + if (updateThread_->joinable()) { + updateThread_->join(); + } updateThread_ = nullptr; } } diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index f9ec041e2..6f9429d16 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -92,6 +92,14 @@ int32_t threadId() { return _tid; } +// Resets all cached Thread local state, this must be done on +// forks to prevent stale values from being retained. +void resetTLS() { + _pid = 0; + _tid = 0; + _sysTid = 0; +} + namespace { static constexpr size_t kMaxThreadNameLength = 16; diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index 6246db8e3..51dd332f4 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -16,6 +16,7 @@ #include "ConfigLoader.h" #include "DaemonConfigLoader.h" #include "DeviceUtil.h" +#include "ThreadUtil.h" #ifdef HAS_CUPTI #include "CuptiActivityApi.h" #include "CuptiCallbackApi.h"