From c9e508573fbe65c10c55c0e70e2cd7264f365c75 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 9 Mar 2022 03:34:17 +0000 Subject: [PATCH 01/15] no --- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 + .../phi/kernels/gpu/take_along_axis_kernel.cu | 1 + paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/ops/compat/reduce_sig.cc | 24 ++++- python/paddle/profiler/profiler.py | 101 ++++++++++-------- python/paddle/profiler/utils.py | 1 - 8 files changed, 92 insertions(+), 50 deletions(-) diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 9947e00ecb53c5..1ab718c0794384 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,6 +47,7 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "max", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index aebd155ac59cb2..4e83d0fa371032 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -41,5 +41,13 @@ struct ProdFunctor { } }; +//////// Max Functor /////// +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6ad12245d2a45a..49b550f51e60e1 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -460,10 +460,14 @@ void BatchNormKernel(const Context &ctx, void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. 
// auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } PADDLE_ENFORCE_NOT_NULL( reserve_space, phi::errors::NotFound( diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 63113e3e672f37..9665a917d9dc4a 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -53,6 +53,7 @@ PD_REGISTER_KERNEL(take_along_axis, GPU, ALL_LAYOUT, phi::TakeAlongAxisKernel, + float, double, int64_t, int, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index fe8f3b749cdd8a..7569cbcff087d7 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 92839fb3030752..36798abe4c11b8 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -21,7 +21,7 @@ KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "sum_raw" KernelSignature. - // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // And the InferMeta function(i.e. SumRawInferMeta) is accordance with // the "sum_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature("sum_raw", @@ -40,7 +40,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. - // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the // "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( @@ -56,11 +57,30 @@ KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } +KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the + // "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("max", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index dc637bf983046b..3f67f69b2ad4a7 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -82,11 +82,9 @@ def make_scheduler(*, Examples: 1. profiling range [2, 5] batch 0: closed, batch 1: ready, batch [2, 5] record - .. code-block:: python make_scheduler(closed=1, ready=1, record=4, repeat=1) 2. profiling range [3,6], [9,12], [15,18]... batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat - .. code-block:: python make_scheduler(closed=1, ready=1, record=4, skip_first=1) """ @@ -138,15 +136,16 @@ def export_chrome_tracing(dir_name: str, Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -181,15 +180,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_protobuf('./log') - ) as p: - for iter in range(N): - train() - p.step() + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_protobuf('./log') + ) as p: + for iter in range(N): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -238,36 +238,45 @@ class Profiler: Examples: 1. profiling range [2, 5) .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (2, 5), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + :name: code-example1 + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (2, 5), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() + 2. profiling range [2,4], [7, 9], [11,13] .. 
code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + :name: code-example2 + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() + 3. Use profiler without context manager, and use default parameters .. code-block:: python - import paddle.profiler as profiler - p = profiler.Profiler() - p.start() - for iter in range(N): - train() - p.step() - p.stop() - p.summary() + :name: code-example3 + + import paddle.profiler as profiler + p = profiler.Profiler() + p.start() + for iter in range(N): + #train() + p.step() + p.stop() + p.summary() + """ def __init__( diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 642001dfbfc5a3..dea5feebccc195 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -36,7 +36,6 @@ class RecordEvent(ContextDecorator): event_type(TracerEventType): Type of the record event, can be used for statistics. Examples: - .. code-block:: python import paddle.profiler as profiler with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined): op1() From 3c9cb03c7922c584a703015557e40cc8edf1935f Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 06:23:31 +0000 Subject: [PATCH 02/15] fix bugs --- paddle/fluid/platform/profiler/host_tracer.cc | 2 +- python/paddle/profiler/profiler.py | 15 +- python/paddle/profiler/profiler_statistic.py | 167 ++++++++++++------ python/paddle/profiler/statistic_helper.py | 1 + python/paddle/profiler/utils.py | 2 +- 5 files changed, 132 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index afd1c2b3012d46..b7eb53331b793a 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -20,7 +20,7 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, "RecordEvent will works " "if host_trace_level >= level."); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index efbe88583b776d..08f9f6de411d26 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -154,6 +154,9 @@ def export_chrome_tracing(dir_name: str, Parameters: dir_name(str): Directory to save profiling data. worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: .. code-block:: python @@ -199,6 +202,9 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: dir_name(str): Directory to save profiling data. worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. 
+ Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. + Examples: .. code-block:: python @@ -522,7 +528,12 @@ def _trigger_action(self): def export(self, path="", format="json"): r""" - Exports the tracing data in Chrome tracing data format. + Exports the tracing data to file. + + Parameters: + path(str): file path of the output. + format(str): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf. + Examples: .. code-block:: python @@ -548,7 +559,7 @@ def summary(self, thread_sep=False, time_unit='ms'): r""" - Print the Summary table. + Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. Parameters: sorted_by(SortedKeys): how to rank the op table items. diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index a0bbd6b633ef01..61e02c4712c917 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -119,6 +119,23 @@ def traverse_tree(nodetrees): return results +def get_device_nodes(hostnode): + ''' + Get all device nodes called in the time range of hostnode. + ''' + stack = [] + device_nodes = [] + stack.append(hostnode) + while stack: + current_node = stack.pop() + for childnode in current_node.children_node: + stack.append(childnode) + for runtimenode in current_node.runtime_node: + for devicenode in runtimenode.device_node: + device_nodes.append(devicenode) + return device_nodes + + def wrap_tree(nodetrees): ''' Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. @@ -186,16 +203,6 @@ def parse(self, nodetrees): CPUTimeRange[hostnode.type].append( (hostnode.start_ns, hostnode.end_ns)) self.call_times[hostnode.type] += 1 - if hostnode.type == TracerEventType.Operator and any([ - name in hostnode.name for name in _CommunicationOpName - ]): # special case, communication op - CPUTimeRange[TracerEventType.Communication].append( - (hostnode.start_ns, hostnode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 - is_communication_node = ( - hostnode.type == TracerEventType.Communication - ) or (hostnode.type == TracerEventType.Operator and any( - [name in hostnode.name for name in _CommunicationOpName])) for runtimenode in hostnode.runtime_node: CPUTimeRange[runtimenode.type].append( (runtimenode.start_ns, runtimenode.end_ns)) @@ -205,12 +212,6 @@ def parse(self, nodetrees): devicenode.stream_id].append( (devicenode.start_ns, devicenode.end_ns)) self.call_times[devicenode.type] += 1 - if is_communication_node: # gpu activity for communication node - GPUTimeRange[devicenode.device_id][ - TracerEventType.Communication][ - devicenode.stream_id].append(( - devicenode.start_ns, devicenode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 for event_type, time_ranges in CPUTimeRange.items(): time_ranges = merge_self_ranges(time_ranges, is_sorted=False) @@ -243,6 +244,78 @@ def get_cpu_range_sum(self, event_type): return self.CPUTimeRangeSum[event_type] +class DistributedSummary: + r""" + Analysis communication and computation time range, and their overlap. + The computation time is all kernel except kernels for communication like nccl. 
+    """
+
+    def __init__(self):
+        self.cpu_communication_range = []
+        self.gpu_communication_range = []
+        self.communication_range = []
+        self.computation_range = []
+        self.overlap_range = []
+
+    def parse(self, nodetrees):
+        '''
+        Collect all communication and computation time ranges.
+        '''
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, hostnodes in thread2hostnodes.items():
+            for hostnode in hostnodes[1:]:  #skip root node
+                # case 1: TracerEventType is Communication
+                if hostnode.type == TracerEventType.Communication:
+                    self.cpu_communication_range.append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    device_nodes = get_device_nodes(hostnode)
+                    for device_node in device_nodes:
+                        if device_node.type == TracerEventType.Kernel:
+                            self.gpu_communication_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                        else:
+                            self.computation_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                #case 2: TracerEventType is Operator but is communication op
+                elif hostnode.type == TracerEventType.Operator and any([
+                        name in hostnode.name.lower()
+                        for name in _CommunicationOpName
+                ]):
+                    self.cpu_communication_range.append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    device_nodes = get_device_nodes(hostnode)
+                    for device_node in device_nodes:
+                        if device_node.type == TracerEventType.Kernel:
+                            self.gpu_communication_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                        else:
+                            self.computation_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                #case 3: Others, filter kernels named with nccl
+                else:
+                    for runtimenode in hostnode.runtime_node:
+                        for devicenode in runtimenode.device_node:
+                            if devicenode.type == TracerEventType.Kernel:
+                                if 'nccl' in devicenode.name.lower():
+                                    self.gpu_communication_range.append((
+                                        devicenode.start_ns, devicenode.end_ns))
+                                else:
+                                    self.computation_range.append((
+                                        devicenode.start_ns, devicenode.end_ns))
+        self.cpu_communication_range = merge_self_ranges(
+            self.cpu_communication_range, is_sorted=False)
+        self.gpu_communication_range = merge_self_ranges(
+            self.gpu_communication_range, is_sorted=False)
+        self.communication_range = merge_ranges(
+            self.cpu_communication_range,
+            self.gpu_communication_range,
+            is_sorted=True)
+        self.computation_range = merge_self_ranges(
+            self.computation_range, is_sorted=False)
+        self.overlap_range = intersection_ranges(
+            self.communication_range, self.computation_range, is_sorted=True)
+
+
 class EventSummary:
     r"""
     Analyse operator event in profiling data, correlate with its device event.
@@ -473,8 +546,10 @@ def __init__(self, node_trees, extra_info): self.extra_info = extra_info self.time_range_summary = TimeRangeSummary() self.event_summary = EventSummary() + self.distributed_summary = DistributedSummary() self.time_range_summary.parse(node_trees) self.event_summary.parse(node_trees) + self.distributed_summary.parse(node_trees) def _build_table(statistic_data, @@ -610,7 +685,11 @@ def format_ratio(ratio, indent=0): gpu_type_time = collections.defaultdict(int) for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( ): - cpu_type_time[event_type] = value + if event_type != TracerEventType.Communication: + cpu_type_time[event_type] = value + if statistic_data.distributed_summary.cpu_communication_range: + cpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.cpu_communication_range) gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -620,6 +699,9 @@ def format_ratio(ratio, indent=0): gpu_time_range[event_type], time_range, is_sorted=True) for event_type, time_range in gpu_time_range.items(): gpu_type_time[event_type] = sum_ranges(time_range) + if statistic_data.distributed_summary.gpu_communication_range: + gpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.gpu_communication_range) sorted_items = sorted( cpu_type_time.items(), key=lambda x: x[1], reverse=True) @@ -735,7 +817,7 @@ def format_ratio(ratio, indent=0): append('') ###### Print Distribution Summary Report ###### - if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: + if statistic_data.distributed_summary.communication_range: headers = [ 'Name', 'Total Time', @@ -759,33 +841,12 @@ def format_ratio(ratio, indent=0): append(header_sep) append(row_format.format(*headers)) append(header_sep) - cpu_communication_time_range = [] - gpu_communication_time_range = [] - cpu_communication_time_range = merge_ranges( - statistic_data.time_range_summary.CPUTimeRange[ - TracerEventType.Communication], cpu_communication_time_range) - kernel_time_range = [] - for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( - ): - kernel_time_range = merge_ranges( - device_time_ranges[TracerEventType.Kernel], - kernel_time_range, - is_sorted=True) - gpu_communication_time_range = merge_ranges( - device_time_ranges[TracerEventType.Communication], - gpu_communication_time_range, - is_sorted=True) - communication_time_range = merge_ranges( - cpu_communication_time_range, - gpu_communication_time_range, - is_sorted=True) - computation_time_range = subtract_ranges(kernel_time_range, - gpu_communication_time_range) - overlap_time_range = intersection_ranges(communication_time_range, - computation_time_range) - communication_time = sum_ranges(communication_time_range) - computation_time = sum_ranges(computation_time_range) - overlap_time = sum_ranges(overlap_time_range) + communication_time = sum_ranges( + statistic_data.distributed_summary.communication_range) + computation_time = sum_ranges( + statistic_data.distributed_summary.computation_range) + overlap_time = sum_ranges( + statistic_data.distributed_summary.overlap_range) row_values = [ 'Communication', format_time( communication_time, unit=time_unit), @@ -808,9 +869,9 @@ def format_ratio(ratio, indent=0): append(row_format.format(*row_values)) append(header_sep) append( - "Note:\nCommunication time: Communication Op time and 
its kernel time on gpu.\n" - "Computation time: Kernel time, substract kernels belong to communication op.\n" - "Overlap time: Communication time intersect with computation time.\n" + "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" + "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" + "Overlap time: Communication time intersects with computation time.\n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -976,7 +1037,7 @@ def format_ratio(ratio, indent=0): - 5] device_node_name += "..." row_values = [ - ' {}'.format(device_node_name), devicenode.call, + ' {}'.format(device_node_name), devicenode.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( @@ -1001,10 +1062,14 @@ def format_ratio(ratio, indent=0): 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', 'GPU Total / Avg / Max / Min / Ratio(%)' ] + name_column_width = 0 + for name, item in statistic_data.event_summary.memory_manipulation_items.items( + ): + name_column_width = max(len(name), name_column_width) row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = min(name_column_width, 45) add_column(name_column_width) add_column(6) add_column(40) @@ -1059,7 +1124,7 @@ def format_ratio(ratio, indent=0): row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = 35 add_column(name_column_width) add_column(6) add_column(40) diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py index 1f11649928a7fe..76dd1f0a6439d1 100644 --- a/python/paddle/profiler/statistic_helper.py +++ b/python/paddle/profiler/statistic_helper.py @@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False): else: indx1 += 1 while indx2 < len2: + range2 = range_list2[indx2] if range2[1] > merged_ranges[-1][1]: if range2[0] <= merged_ranges[-1][1]: merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 7fa7a27bad7bf5..b9c27efb19bff2 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -17,8 +17,8 @@ import functools from contextlib import ContextDecorator +from paddle.fluid import core from paddle.fluid.core import (_RecordEvent, TracerEventType) -import paddle.fluid.core as core _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, From 3358cc8cb965fc77c4e8788a7f7d7bd8f437c856 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 12:59:44 +0000 Subject: [PATCH 03/15] fix doc according to review --- python/paddle/profiler/profiler.py | 96 +++++++++++--------- python/paddle/profiler/profiler_statistic.py | 28 +++--- python/paddle/profiler/utils.py | 17 ++-- 3 files changed, 76 insertions(+), 65 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 08f9f6de411d26..1a3680b683b410 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -29,17 +29,15 @@ class ProfilerState(Enum): r""" - Profiler state that can be specified to control profiler action. - - CLOSED: The profilers are closed. - - READY: The profilers are open, but the data will not be recorded. - This state is used for reducing overhead influence when profilers start. - - RECORD: The profilers are open, and the data will be recorded. 
- - RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - the collected data will be returned. + ProfilerState is used to present the state of :ref:`Profiler ` . + + The meaning of each ProfilerState is as following: + - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. + This state is used for reducing overhead influence when profiler starts. + - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. + The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -49,11 +47,11 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - Target device for profiling. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. - CPU: Profile events on CPU. - - GPU: Profile events on GPU. + The meaning of each ProfilerState is as following: + - **ProfilerTarget.CPU** : Profile events on CPU. + - **ProfilerTarget.GPU** : Profile events on GPU. """ CPU = 0 GPU = 1 @@ -66,7 +64,7 @@ def make_scheduler(*, repeat: int=0, skip_first: int=0) -> Callable: r""" - Return a scheduler function, which scheduler the state according to the setting. + Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: .. code-block:: text @@ -78,22 +76,23 @@ def make_scheduler(*, - - - - - - - - - - - - Note that repeat <= 0 means the cycle will continue until the profiler exits. - Parameters: + Args: closed(int): The number of steps in state ProfilerState.CLOSED. ready(int): The number of steps in state ProfilerState.READY. - record(int): The number of steps in state ProfilerState.RECORD. - repeat(int): The number of cycles to repeat above state transform. - skip_first(int): The number of first steps to drop, not participate in the state transform. + record(int): The number of steps in state ProfilerState.RECORD, and the state in last step will be set as ProfilerState.RECORD_AND_RETURN. + repeat(int, optional): The number of cycles to repeat above state transform. Default value is 0, which means it will repeat this cycle until profiler exits. + skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0. Returns: - A scheduler function, conforms to above state transform setting. + A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState. Examples: 1. profiling range [2, 5] - batch 0: closed, batch 1: ready, batch [2, 5] record + Assume batch 0: closed, batch 1: ready, batch [2, 5] record .. code-block:: python + :name: code-example1 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) @@ -101,9 +100,10 @@ def make_scheduler(*, 2. profiling range [3,6], [9,12], [15,18]... - batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat .. 
code-block:: python + :name: code-example2 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) @@ -151,15 +151,17 @@ def export_chrome_tracing(dir_name: str, The output file will be saved in directory 'dir_name', and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -198,15 +200,17 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: The output file will be saved in directory 'dir_name', and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -250,20 +254,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: class Profiler: r""" - Profiler context manager, user interface to manage profile process. + Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. - Parameters: - targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . - scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. - If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + Args: + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. + If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). - on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. 
+ on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). Examples: - 1. profiling range [2, 5) + 1. profiling range [2, 5). .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -278,6 +283,7 @@ class Profiler: 2. profiling range [2,4], [7, 9], [11,13] .. code-block:: python + :name: code-example2 # required: gpu import paddle.profiler as profiler @@ -292,6 +298,7 @@ class Profiler: 3. Use profiler without context manager, and use default parameters .. code-block:: python + :name: code-example3 # required: gpu import paddle.profiler as profiler @@ -373,6 +380,7 @@ def start(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -407,6 +415,7 @@ def stop(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -443,6 +452,7 @@ def step(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -530,13 +540,14 @@ def export(self, path="", format="json"): r""" Exports the tracing data to file. - Parameters: + Args: path(str): file path of the output. - format(str): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf. + format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json". Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -561,14 +572,15 @@ def summary(self, r""" Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. - Parameters: - sorted_by(SortedKeys): how to rank the op table items. - op_detail(bool): expand each operator detail information. - thread_sep(bool): print op table each thread. - time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] + Args: + sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. + op_detail(bool, optional): expand each operator detail information, default value is True. + thread_sep(bool, optional): print op table each thread, default value is False. + time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'. Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 61e02c4712c917..b9b0fa5ff2a742 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -33,23 +33,17 @@ class SortedKeys(Enum): r""" - Sorted keys for printing summary table. - - CPUTotal: Sorted by CPU total time. - - CPUAvg: Sorted by CPU average time. - - CPUMax: Sorted by CPU max time. - - CPUMin: Sorted by CPU min time. - - GPUTotal: Sorted by GPU total time. - - GPUAvg: Sorted by GPU average time. - - GPUMax: Sorted by GPU max time. - - GPUMin: Sorted by GPU min time. + SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. 
+ + The meaning of each SortedKeys is as following: + - **SortedKeys.CPUTotal** : Sorted by CPU total time. + - **SortedKeys.CPUAvg** : Sorted by CPU average time. + - **SortedKeys.CPUMax** : Sorted by CPU max time. + - **SortedKeys.CPUMin** : Sorted by CPU min time. + - **SortedKeys.GPUTotal** : Sorted by GPU total time. + - **SortedKeys.GPUAvg** : Sorted by GPU average time. + - **SortedKeys.GPUMax** : Sorted by GPU max time. + - **SortedKeys.GPUMin** : Sorted by GPU min time. """ CPUTotal = 0 CPUAvg = 1 diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index b9c27efb19bff2..d2b38f78f13fc9 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -30,13 +30,15 @@ class RecordEvent(ContextDecorator): r""" - Interface for recording a time range. + Interface for recording a time range by user defined. - Parameters: + Args: name(str): Name of the record event + event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. Examples: .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator): result = data1 + data2 record_event.end() - Note: - RecordEvent will take effect only when profiler is on and at the state of RECORD. + **Note**: + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. """ def __init__(self, @@ -76,6 +78,7 @@ def begin(self): Record the time of begining. .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -100,6 +103,7 @@ def end(self): Record the time of ending. .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -118,14 +122,15 @@ def load_profiler_result(filename: str): r""" Load dumped profiler data back to memory. - Parameters: + Args: filename(str): Name of the exported protobuf file of profiler data. Returns: - ProfilerResult object. + ProfilerResult object, which stores profiling data. Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler From 3bca1d2ccddac279368bf1c8012335eef985b717 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 15:56:07 +0000 Subject: [PATCH 04/15] fix api doc format --- python/paddle/profiler/profiler.py | 41 ++++++++++---------- python/paddle/profiler/profiler_statistic.py | 2 +- python/paddle/profiler/utils.py | 10 ++--- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 1a3680b683b410..b5cba09120247a 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -31,7 +31,7 @@ class ProfilerState(Enum): r""" ProfilerState is used to present the state of :ref:`Profiler ` . - The meaning of each ProfilerState is as following: + The meaning of each ProfilerState is as following - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts. @@ -49,7 +49,7 @@ class ProfilerTarget(Enum): r""" ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. 
- The meaning of each ProfilerState is as following: + The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. """ @@ -148,7 +148,7 @@ def export_chrome_tracing(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to chrome tracing format file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. Args: @@ -159,9 +159,10 @@ def export_chrome_tracing(dir_name: str, A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: - The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -197,7 +198,7 @@ def handle_fn(prof): def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to protobuf file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. Args: @@ -208,9 +209,9 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. Examples: - The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -257,18 +258,18 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . - scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). 
on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). Examples: 1. profiling range [2, 5). .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -283,7 +284,7 @@ class Profiler: 2. profiling range [2,4], [7, 9], [11,13] .. code-block:: python - :name: code-example2 + :name: code-example2 # required: gpu import paddle.profiler as profiler @@ -298,7 +299,7 @@ class Profiler: 3. Use profiler without context manager, and use default parameters .. code-block:: python - :name: code-example3 + :name: code-example3 # required: gpu import paddle.profiler as profiler @@ -380,7 +381,7 @@ def start(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example4 # required: gpu import paddle.profiler as profiler @@ -415,7 +416,7 @@ def stop(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example5 # required: gpu import paddle.profiler as profiler @@ -452,7 +453,7 @@ def step(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example6 # required: gpu import paddle.profiler as profiler @@ -547,7 +548,7 @@ def export(self, path="", format="json"): Examples: .. code-block:: python - :name: code-example1 + :name: code-example7 # required: gpu import paddle.profiler as profiler @@ -573,14 +574,14 @@ def summary(self, Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. Args: - sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. + sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. op_detail(bool, optional): expand each operator detail information, default value is True. thread_sep(bool, optional): print op table each thread, default value is False. time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'. Examples: .. code-block:: python - :name: code-example1 + :name: code-example8 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index b9b0fa5ff2a742..fdb1eadfae9865 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -35,7 +35,7 @@ class SortedKeys(Enum): r""" SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. - The meaning of each SortedKeys is as following: + The meaning of each SortedKeys is as following - **SortedKeys.CPUTotal** : Sorted by CPU total time. - **SortedKeys.CPUAvg** : Sorted by CPU average time. - **SortedKeys.CPUMax** : Sorted by CPU max time. diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index d2b38f78f13fc9..1acd583ee0f1a9 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -38,7 +38,7 @@ class RecordEvent(ContextDecorator): Examples: .. 
code-block:: python - :name: code-example1 + :name: code-example1 import paddle import paddle.profiler as profiler @@ -56,7 +56,7 @@ class RecordEvent(ContextDecorator): record_event.end() **Note**: - RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. """ def __init__(self, @@ -78,7 +78,7 @@ def begin(self): Record the time of begining. .. code-block:: python - :name: code-example1 + :name: code-example2 import paddle import paddle.profiler as profiler @@ -103,7 +103,7 @@ def end(self): Record the time of ending. .. code-block:: python - :name: code-example1 + :name: code-example3 import paddle import paddle.profiler as profiler @@ -130,7 +130,7 @@ def load_profiler_result(filename: str): Examples: .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler From 00507150967c04c5e5d793782937510e11a3d69a Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 25 Mar 2022 02:09:04 +0000 Subject: [PATCH 05/15] fix api doc according to review --- python/paddle/profiler/profiler.py | 33 ++++++++++++-------- python/paddle/profiler/profiler_statistic.py | 26 +++++++++------ 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index b5cba09120247a..4fb247f712a456 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -29,15 +29,19 @@ class ProfilerState(Enum): r""" - ProfilerState is used to present the state of :ref:`Profiler ` . + ProfilerState is used to present the state of :ref:`Profiler ` . The meaning of each ProfilerState is as following - - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. - This state is used for reducing overhead influence when profiler starts. - - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. - - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. - The collected data will be returned in this state. + + **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + + **ProfilerState.READY** : The profiler is open, but the data will not be recorded. + This state is used for reducing overhead influence when profiler starts. + + **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + + **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. + The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -47,11 +51,13 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. The meaning of each ProfilerState is as following - - **ProfilerTarget.CPU** : Profile events on CPU. - - **ProfilerTarget.GPU** : Profile events on GPU. + + **ProfilerTarget.CPU** : Profile events on CPU. + + **ProfilerTarget.GPU** : Profile events on GPU. 
""" CPU = 0 GPU = 1 @@ -64,7 +70,7 @@ def make_scheduler(*, repeat: int=0, skip_first: int=0) -> Callable: r""" - Return a scheduler function, which scheduler the :ref:`state ` according to the setting. + Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: .. code-block:: text @@ -92,7 +98,7 @@ def make_scheduler(*, Assume batch 0: closed, batch 1: ready, batch [2, 5] record .. code-block:: python - :name: code-example1 + :name: code-example1 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) @@ -103,7 +109,7 @@ def make_scheduler(*, Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat .. code-block:: python - :name: code-example2 + :name: code-example2 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) @@ -210,6 +216,7 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: Examples: The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python :name: code-example1 diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index fdb1eadfae9865..9356d2cacdb48e 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -33,17 +33,25 @@ class SortedKeys(Enum): r""" - SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. + SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. The meaning of each SortedKeys is as following - - **SortedKeys.CPUTotal** : Sorted by CPU total time. - - **SortedKeys.CPUAvg** : Sorted by CPU average time. - - **SortedKeys.CPUMax** : Sorted by CPU max time. - - **SortedKeys.CPUMin** : Sorted by CPU min time. - - **SortedKeys.GPUTotal** : Sorted by GPU total time. - - **SortedKeys.GPUAvg** : Sorted by GPU average time. - - **SortedKeys.GPUMax** : Sorted by GPU max time. - - **SortedKeys.GPUMin** : Sorted by GPU min time. + + **SortedKeys.CPUTotal** : Sorted by CPU total time. + + **SortedKeys.CPUAvg** : Sorted by CPU average time. + + **SortedKeys.CPUMax** : Sorted by CPU max time. + + **SortedKeys.CPUMin** : Sorted by CPU min time. + + **SortedKeys.GPUTotal** : Sorted by GPU total time. + + **SortedKeys.GPUAvg** : Sorted by GPU average time. + + **SortedKeys.GPUMax** : Sorted by GPU max time. + + **SortedKeys.GPUMin** : Sorted by GPU min time. 
""" CPUTotal = 0 CPUAvg = 1 From 42f542c9d4a1f778d44779e45ae02979ff0c1e64 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 25 Mar 2022 10:08:37 +0000 Subject: [PATCH 06/15] fix bug and add unit test --- .../unittests/test_profiler_statistic.py | 222 ++++++++++++++++++ python/paddle/profiler/profiler.py | 14 +- python/paddle/profiler/profiler_statistic.py | 28 +-- 3 files changed, 240 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index 73b501c9c7eade..4769a81e7936c5 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -207,6 +207,228 @@ def test_statistic_case1(self): thread_sep=False, time_unit='ms')) + def test_statistic_case2(self): + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + + dataloader_node = HostPythonNode( + 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + yolonet_node = HostPythonNode( + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 100, 110, 1000, 1001) + reduce_all_launchkernel0 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, + 1000, 1001) + + nccl_reduce_all_kernel0 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120, + 0, 0, 2) + + communication_node = HostPythonNode( + 'Communication', profiler.TracerEventType.Communication, 105, 110, + 1000, 1001) + + reduce_all_op1 = HostPythonNode('cudalaunchkernel', + profiler.TracerEventType.Operator, 105, + 108, 1000, 1001) + + reduce_all_launchkernel1 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107, + 1000, 1001) + + nccl_reduce_all_kernel1 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150, + 0, 0, 2) + + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + sync_batch_norm_node = HostPythonNode('sync_batch_norm', + profiler.TracerEventType.Operator, + 60, 100, 1000, 1001) + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 30, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 30, 40, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, + 1000, 1001) + conv2d_MemCpy = HostPythonNode('AsyncMemcpy', + profiler.TracerEventType.UserDefined, 35, + 40, 1000, 1001) + conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', + profiler.TracerEventType.CudaRuntime, + 35, 40, 1000, 1001) + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode( + 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + 
sync_batch_norm_infer_shape = HostPythonNode( + 'sync_batch_norm::infer_shape', + profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) + sync_batch_norm_compute = HostPythonNode( + 'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner, + 80, 100, 1000, 1001) + sync_batch_norm_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90, + 1000, 1001) + sync_batch_norm_MemCpy = HostPythonNode( + 'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000, + 1001) + sync_batch_norm_cudaMemCpy = HostPythonNode( + 'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000, + 1001) + sync_batch_norm_kernel = DevicePythonNode( + 'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300, + 0, 0, 0) + sync_batch_norm_memcpy = DevicePythonNode( + 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, + 0, 0, 1) + + reduce_all_node2 = HostPythonNode('reduce_all', + profiler.TracerEventType.Operator, + 230, 250, 1000, 1001) + + reduce_all_launchkernel2 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240, + 1000, 1001) + + nccl_reduce_all_kernel2 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280, + 0, 0, 2) + + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, yolonet_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + yolonet_node.children_node.extend( + [sync_batch_norm_node, userdefined_node]) + userdefined_node.children_node.append(communication_node) + userdefined_node.runtime_node.append(reduce_all_launchkernel0) + reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0) + communication_node.children_node.append(reduce_all_op1) + reduce_all_op1.runtime_node.append(reduce_all_launchkernel1) + reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1) + conv2d_node.children_node.extend( + [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_cudaMemCpy.device_node.append(conv2d_memcpy) + sync_batch_norm_node.children_node.extend([ + sync_batch_norm_infer_shape, sync_batch_norm_compute, + sync_batch_norm_MemCpy + ]) + sync_batch_norm_compute.runtime_node.append( + sync_batch_norm_launchkernel) + sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) + sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) + sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) + optimization_node.children_node.append(reduce_all_node2) + reduce_all_node2.runtime_node.append(reduce_all_launchkernel2) + reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + distributed_summary = statistic_data.distributed_summary + + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.ProfileStep), 400) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Forward), 100) + self.assertEqual( + 
time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Backward), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Optimization), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Operator), 78) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.OperatorInner), 45) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.CudaRuntime), 38) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Kernel), 220) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Memcpy), 60) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.UserDefined), 25) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Communication), 5) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.cpu_communication_range), 25) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.gpu_communication_range), 65) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.communication_range), 85) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.computation_range), 220) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.overlap_range), 85) + self.assertEqual(len(event_summary.items), 4) + self.assertEqual(len(event_summary.userdefined_items), 1) + self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.memory_manipulation_items), 1) + self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) + self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual( + event_summary.model_perspective_items['Forward'].cpu_time, 100) + self.assertEqual( + event_summary.model_perspective_items['Forward'].gpu_time, 315) + self.assertEqual( + event_summary.model_perspective_items['Backward'].gpu_time, 0) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=profiler.SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms')) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 4fb247f712a456..d978e594399f40 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -33,15 +33,13 @@ class ProfilerState(Enum): The meaning of each ProfilerState is as following - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. - This state is used for reducing overhead influence when profiler starts. + - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts. - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. 
- **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. - The collected data will be returned in this state. + - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -55,9 +53,9 @@ class ProfilerTarget(Enum): The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. + - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + - **ProfilerTarget.GPU** : Profile events on GPU. """ CPU = 0 GPU = 1 diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 9356d2cacdb48e..8bb30d111c8048 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -37,21 +37,21 @@ class SortedKeys(Enum): The meaning of each SortedKeys is as following - **SortedKeys.CPUTotal** : Sorted by CPU total time. + - **SortedKeys.CPUTotal** : Sorted by CPU total time. - **SortedKeys.CPUAvg** : Sorted by CPU average time. + - **SortedKeys.CPUAvg** : Sorted by CPU average time. - **SortedKeys.CPUMax** : Sorted by CPU max time. + - **SortedKeys.CPUMax** : Sorted by CPU max time. - **SortedKeys.CPUMin** : Sorted by CPU min time. + - **SortedKeys.CPUMin** : Sorted by CPU min time. - **SortedKeys.GPUTotal** : Sorted by GPU total time. + - **SortedKeys.GPUTotal** : Sorted by GPU total time. - **SortedKeys.GPUAvg** : Sorted by GPU average time. + - **SortedKeys.GPUAvg** : Sorted by GPU average time. - **SortedKeys.GPUMax** : Sorted by GPU max time. + - **SortedKeys.GPUMax** : Sorted by GPU max time. - **SortedKeys.GPUMin** : Sorted by GPU min time. + - **SortedKeys.GPUMin** : Sorted by GPU min time. 
""" CPUTotal = 0 CPUAvg = 1 @@ -274,10 +274,8 @@ def parse(self, nodetrees): for device_node in device_nodes: if device_node.type == TracerEventType.Kernel: self.gpu_communication_range.append( - (devicenode.start_ns, devicenode.end_ns)) - else: - self.computation_range.append( - (devicenode.start_ns, devicenode.end_ns)) + (device_node.start_ns, device_node.end_ns)) + #case 2: TracerEventType is Operator but is communication op elif hostnode.type == TracerEventType.Operator and any([ name in hostnode.name.lower() @@ -289,10 +287,8 @@ def parse(self, nodetrees): for device_node in device_nodes: if device_node.type == TracerEventType.Kernel: self.gpu_communication_range.append( - (devicenode.start_ns, devicenode.end_ns)) - else: - self.computation_range.append( - (devicenode.start_ns, devicenode.end_ns)) + (device_node.start_ns, device_node.end_ns)) + #case 3: Others, filter kernels named with nccl else: for runtimenode in hostnode.runtime_node: From 2bf81d557187702f5d894d8469d335f0aebad199 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 06:03:06 +0000 Subject: [PATCH 07/15] fix record event bug --- paddle/fluid/platform/profiler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 940fc98d3b3202..5cd909b4e08f0d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -164,6 +164,7 @@ void RecordEvent::End() { #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook && is_pushed_) { dynload::nvtxRangePop(); + is_pushed_ = false; } #endif #endif From a73802ad6242e8c6925c94597abeb76b1163c08c Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:11:14 +0000 Subject: [PATCH 08/15] optimize chrome tracing display --- .../platform/profiler/chrometracing_logger.cc | 122 +++++++++++------- .../platform/profiler/chrometracing_logger.h | 1 + paddle/fluid/platform/profiler/utils.h | 11 +- python/paddle/profiler/utils.py | 44 ++++--- 4 files changed, 108 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 4061e2d4d494d5..478b601f88822c 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include #include "glog/logging.h" @@ -75,6 +76,15 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { // log all nodes except root node, root node is a helper node. 
const std::map> thread2host_event_nodes = node_trees.Traverse(true); + // find the earliest time in current timeline + start_time_ = std::numeric_limits::max(); + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + if ((*(it->second.begin() + 1))->StartNs() < start_time_) { + start_time_ = (*(it->second.begin() + 1))->StartNs(); + } + } + for (auto it = thread2host_event_nodes.begin(); it != thread2host_event_nodes.end(); ++it) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); @@ -114,38 +124,44 @@ void ChromeTracingLogger::LogHostTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(Python)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(Python)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), + host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; default: output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), + host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; } @@ -161,22 +177,24 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_running", "args": { "correlation id": %d, - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - runtime_node.Name().c_str(), runtime_node.ProcessId(), - runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), - nsToUs(runtime_node.Duration()), + runtime_node.Name().c_str(), nsToMsFloat(runtime_node.Duration()), + runtime_node.ProcessId(), runtime_node.ThreadId(), + nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId(), runtime_node.StartNs(), - runtime_node.EndNs()); + runtime_node.CorrelationId(), + nsToUsFloat(runtime_node.StartNs(), start_time_), + nsToUsFloat(runtime_node.EndNs(), start_time_)); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); output_file_stream_ << 
string_format( @@ -270,12 +288,13 @@ void ChromeTracingLogger::HandleTypeKernel( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, @@ -284,15 +303,16 @@ void ChromeTracingLogger::HandleTypeKernel( "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "theoretical achieved occupancy %%": %f + "theoretical achieved occupancy %%": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, @@ -312,22 +332,24 @@ void ChromeTracingLogger::HandleTypeMemcpy( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "stream": %d, "correlation id": %d, - "bytes": %d, "memory bandwidth (GB/s)": %f + "bytes": %d, "memory bandwidth (GB/s)": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } @@ -338,23 +360,25 @@ void ChromeTracingLogger::HandleTypeMemset( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - 
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 20a924a54cabdf..8977ab748c63a3 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -50,6 +50,7 @@ class ChromeTracingLogger : public BaseLogger { static const char* categary_name_[]; std::set> pid_tid_set_; std::set> deviceid_streamid_set_; + uint64_t start_time_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index b471d6b79833a1..06d1636c4617ca 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -44,7 +44,16 @@ static std::string GetStringFormatLocalTime() { return std::string(buf); } -static int64_t nsToUs(int64_t ns) { return ns / 1000; } +static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { + return (end_ns - start_ns) / 1000; +} + +static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000; +} +static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000 / 1000; +} #ifdef PADDLE_WITH_CUPTI float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 1acd583ee0f1a9..291326478e91bc 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -77,17 +77,19 @@ def begin(self): r""" Record the time of begining. - .. code-block:: python - :name: code-example2 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_sub") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 - data2 - record_event.end() + .. code-block:: python + :name: code-example2 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_sub") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 - data2 + record_event.end() """ if self.event_type not in _AllowedEventTypeList: warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ @@ -102,17 +104,19 @@ def end(self): r''' Record the time of ending. - .. code-block:: python - :name: code-example3 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_mul") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 * data2 - record_event.end() + .. 
code-block:: python + :name: code-example3 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_mul") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 * data2 + record_event.end() ''' if self.event: self.event.end() From cb536f87d446460d42ec62fd68464646d48a6534 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:19:42 +0000 Subject: [PATCH 09/15] fix bug --- paddle/fluid/platform/profiler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index b922d8f55b4a17..5cd909b4e08f0d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name, void RecordEvent::End() { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA - if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) { + if (g_enable_nvprof_hook && is_pushed_) { dynload::nvtxRangePop(); is_pushed_ = false; } From 03eeb4a086eec68e50b0d9f526ee6b277b13f7d8 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:24:31 +0000 Subject: [PATCH 10/15] add comment --- paddle/fluid/platform/profiler/chrometracing_logger.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 478b601f88822c..3ca3ee5ed829e2 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -120,6 +120,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( case TracerEventType::Optimization: case TracerEventType::PythonOp: case TracerEventType::PythonUserDefined: + // cname value comes from tracing.js reservedColorsByName variable output_file_stream_ << string_format( std::string( R"JSON( From 982715f94741644e3a69bf888e1c21bfa31fd725 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 10:47:14 +0000 Subject: [PATCH 11/15] add unit test --- .../platform/profiler/chrometracing_logger.cc | 82 +++++++++++++------ .../fluid/tests/unittests/test_newprofiler.py | 10 +++ 2 files changed, 69 insertions(+), 23 deletions(-) mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_newprofiler.py diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 3ca3ee5ed829e2..897297e0117e1e 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -112,6 +112,13 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } + std::string dur_display; + float dur = nsToMsFloat(host_node.Duration()); + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } switch (host_node.Type()) { case TracerEventType::ProfileStep: case TracerEventType::Forward: @@ -121,11 +128,12 @@ void ChromeTracingLogger::LogHostTraceEventNode( case TracerEventType::PythonOp: case TracerEventType::PythonUserDefined: // cname value comes from tracing.js reservedColorsByName variable + output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(Python)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_runnable", 
@@ -135,9 +143,9 @@ void ChromeTracingLogger::LogHostTraceEventNode( } }, )JSON"), - host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), - host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); @@ -147,7 +155,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_runnable", @@ -157,9 +165,9 @@ void ChromeTracingLogger::LogHostTraceEventNode( } }, )JSON"), - host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), - host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); @@ -174,11 +182,18 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( if (!output_file_stream_) { return; } + float dur = nsToMsFloat(runtime_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_running", @@ -189,7 +204,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( } }, )JSON"), - runtime_node.Name().c_str(), nsToMsFloat(runtime_node.Duration()), + runtime_node.Name().c_str(), dur_display.c_str(), runtime_node.ProcessId(), runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], @@ -218,6 +233,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( if (!output_file_stream_) { return; } + switch (device_node.Type()) { case TracerEventType::Kernel: HandleTypeKernel(device_node); @@ -284,12 +300,18 @@ void ChromeTracingLogger::HandleTypeKernel( kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, blocks_per_sm); #endif - + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -308,9 +330,9 @@ void ChromeTracingLogger::HandleTypeKernel( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), 
device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), @@ -329,11 +351,18 @@ void ChromeTracingLogger::HandleTypeMemcpy( if (device_node.Duration() > 0) { memory_bandwidth = memcpy_info.num_bytes * 1.0 / device_node.Duration(); } + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -345,9 +374,9 @@ void ChromeTracingLogger::HandleTypeMemcpy( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), @@ -357,11 +386,18 @@ void ChromeTracingLogger::HandleTypeMemcpy( void ChromeTracingLogger::HandleTypeMemset( const DeviceTraceEventNode& device_node) { MemsetEventInfo memset_info = device_node.MemsetInfo(); + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -374,9 +410,9 @@ void ChromeTracingLogger::HandleTypeMemset( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py old mode 100755 new mode 100644 index 12fb0fa61b005f..837d133b91b364 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -125,5 +125,15 @@ def my_sheduler1(num_step): result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') +class TestNvprof(unittest.TestCase): + def test_nvprof(self): + for i in range(10): + paddle.fluid.profiler._nvprof_range(i, 10, 20) + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + y = x / 2.0 + + if __name__ == 
'__main__': unittest.main() From e190ec49d97d0d61a30a52af87daab291af71f21 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:15:45 +0000 Subject: [PATCH 12/15] fix a bug --- .../platform/profiler/chrometracing_logger.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 897297e0117e1e..6f32bcc6878c10 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -80,8 +80,19 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { start_time_ = std::numeric_limits::max(); for (auto it = thread2host_event_nodes.begin(); it != thread2host_event_nodes.end(); ++it) { - if ((*(it->second.begin() + 1))->StartNs() < start_time_) { - start_time_ = (*(it->second.begin() + 1))->StartNs(); + if (it->second.begin() + 1 != it->second.end()) { + if ((*(it->second.begin() + 1))->StartNs() < start_time_) { + start_time_ = (*(it->second.begin() + 1))->StartNs(); + } else { + auto runtimenode = + (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); + if (runtimenode != + (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { + if ((*runtimenode)->StartNs() < start_time_) { + start_time_ = (*runtimenode)->StartNs(); + } + } + } } } From 5bcbbe5486fb6b717d6466f55440823181dbccbd Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:17:57 +0000 Subject: [PATCH 13/15] fix --- python/paddle/fluid/tests/unittests/test_newprofiler.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_newprofiler.py diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py old mode 100644 new mode 100755 From 9ac247f860f9fdc8c1d69268c272187c0abdb42f Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:26:09 +0000 Subject: [PATCH 14/15] fix --- .../platform/profiler/chrometracing_logger.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 6f32bcc6878c10..0c5d90a9277a69 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -83,14 +83,14 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { if (it->second.begin() + 1 != it->second.end()) { if ((*(it->second.begin() + 1))->StartNs() < start_time_) { start_time_ = (*(it->second.begin() + 1))->StartNs(); - } else { - auto runtimenode = - (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); - if (runtimenode != - (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { - if ((*runtimenode)->StartNs() < start_time_) { - start_time_ = (*runtimenode)->StartNs(); - } + } + } else { + auto runtimenode = + (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); + if (runtimenode != + (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { + if ((*runtimenode)->StartNs() < start_time_) { + start_time_ = (*runtimenode)->StartNs(); } } } From b8dfe9dcdc99ec7e13f8aa4573d52eadb6ecd0c7 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 30 Mar 2022 03:49:54 +0000 Subject: [PATCH 15/15] fix format --- python/paddle/fluid/tests/unittests/test_newprofiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 1dd59f9f38c8dd..0088687b125636 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -128,7 +128,6 @@ def my_sheduler1(num_step): result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') - class TestNvprof(unittest.TestCase): def test_nvprof(self): for i in range(10): @@ -138,6 +137,7 @@ def test_nvprof(self): x_value, stop_gradient=False, place=paddle.CPUPlace()) y = x / 2.0 + class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples
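Note on the duration labels introduced in [PATCH 08/15] and refined in [PATCH 11/15]: event names written to the exported chrome tracing file now carry a readable duration suffix, shown in milliseconds when the event lasts longer than 1 ms and in microseconds otherwise. A minimal Python sketch of the equivalent formatting rule follows; the helper name format_duration_label is illustrative only and is not part of the patches:

    def format_duration_label(duration_ns):
        # Durations above 1 ms are labelled in ms, shorter ones in us,
        # both with three decimal places, mirroring the C++ dur_display logic.
        duration_ms = duration_ns / 1e6
        if duration_ms > 1.0:
            return "%.3f ms" % duration_ms
        return "%.3f us" % (duration_ms * 1000.0)

    # e.g. format_duration_label(2_500_000) -> "2.500 ms"
    #      format_duration_label(350_000)   -> "350.000 us"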