From c9e508573fbe65c10c55c0e70e2cd7264f365c75 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 9 Mar 2022 03:34:17 +0000 Subject: [PATCH 01/15] no --- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 + .../phi/kernels/gpu/take_along_axis_kernel.cu | 1 + paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/ops/compat/reduce_sig.cc | 24 ++++- python/paddle/profiler/profiler.py | 101 ++++++++++-------- python/paddle/profiler/utils.py | 1 - 8 files changed, 92 insertions(+), 50 deletions(-) diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 9947e00ecb53c5..1ab718c0794384 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,6 +47,7 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "max", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index aebd155ac59cb2..4e83d0fa371032 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -41,5 +41,13 @@ struct ProdFunctor { } }; +//////// Max Functor /////// +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6ad12245d2a45a..49b550f51e60e1 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -460,10 +460,14 @@ void BatchNormKernel(const Context &ctx, void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. 
// auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } PADDLE_ENFORCE_NOT_NULL( reserve_space, phi::errors::NotFound( diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 63113e3e672f37..9665a917d9dc4a 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -53,6 +53,7 @@ PD_REGISTER_KERNEL(take_along_axis, GPU, ALL_LAYOUT, phi::TakeAlongAxisKernel, + float, double, int64_t, int, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index fe8f3b749cdd8a..7569cbcff087d7 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 92839fb3030752..36798abe4c11b8 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -21,7 +21,7 @@ KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "sum_raw" KernelSignature. - // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // And the InferMeta function(i.e. SumRawInferMeta) is accordance with // the "sum_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature("sum_raw", @@ -40,7 +40,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. - // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the // "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( @@ -56,11 +57,30 @@ KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } +KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the + // "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("max", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index dc637bf983046b..3f67f69b2ad4a7 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -82,11 +82,9 @@ def make_scheduler(*, Examples: 1. profiling range [2, 5] batch 0: closed, batch 1: ready, batch [2, 5] record - .. code-block:: python make_scheduler(closed=1, ready=1, record=4, repeat=1) 2. profiling range [3,6], [9,12], [15,18]... batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat - .. code-block:: python make_scheduler(closed=1, ready=1, record=4, skip_first=1) """ @@ -138,15 +136,16 @@ def export_chrome_tracing(dir_name: str, Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -181,15 +180,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_protobuf('./log') - ) as p: - for iter in range(N): - train() - p.step() + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_protobuf('./log') + ) as p: + for iter in range(N): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -238,36 +238,45 @@ class Profiler: Examples: 1. profiling range [2, 5) .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (2, 5), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + :name: code-example1 + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (2, 5), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() + 2. profiling range [2,4], [7, 9], [11,13] .. 
code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + :name: code-example2 + + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + #train() + p.step() + 3. Use profiler without context manager, and use default parameters .. code-block:: python - import paddle.profiler as profiler - p = profiler.Profiler() - p.start() - for iter in range(N): - train() - p.step() - p.stop() - p.summary() + :name: code-example3 + + import paddle.profiler as profiler + p = profiler.Profiler() + p.start() + for iter in range(N): + #train() + p.step() + p.stop() + p.summary() + """ def __init__( diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 642001dfbfc5a3..dea5feebccc195 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -36,7 +36,6 @@ class RecordEvent(ContextDecorator): event_type(TracerEventType): Type of the record event, can be used for statistics. Examples: - .. code-block:: python import paddle.profiler as profiler with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined): op1() From 3c9cb03c7922c584a703015557e40cc8edf1935f Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 06:23:31 +0000 Subject: [PATCH 02/15] fix bugs --- paddle/fluid/platform/profiler/host_tracer.cc | 2 +- python/paddle/profiler/profiler.py | 15 +- python/paddle/profiler/profiler_statistic.py | 167 ++++++++++++------ python/paddle/profiler/statistic_helper.py | 1 + python/paddle/profiler/utils.py | 2 +- 5 files changed, 132 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index afd1c2b3012d46..b7eb53331b793a 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -20,7 +20,7 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, "RecordEvent will works " "if host_trace_level >= level."); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index efbe88583b776d..08f9f6de411d26 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -154,6 +154,9 @@ def export_chrome_tracing(dir_name: str, Parameters: dir_name(str): Directory to save profiling data. worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: .. code-block:: python @@ -199,6 +202,9 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: dir_name(str): Directory to save profiling data. worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. 
+ Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. + Examples: .. code-block:: python @@ -522,7 +528,12 @@ def _trigger_action(self): def export(self, path="", format="json"): r""" - Exports the tracing data in Chrome tracing data format. + Exports the tracing data to file. + + Parameters: + path(str): file path of the output. + format(str): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf. + Examples: .. code-block:: python @@ -548,7 +559,7 @@ def summary(self, thread_sep=False, time_unit='ms'): r""" - Print the Summary table. + Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. Parameters: sorted_by(SortedKeys): how to rank the op table items. diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index a0bbd6b633ef01..61e02c4712c917 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -119,6 +119,23 @@ def traverse_tree(nodetrees): return results +def get_device_nodes(hostnode): + ''' + Get all device nodes called in the time range of hostnode. + ''' + stack = [] + device_nodes = [] + stack.append(hostnode) + while stack: + current_node = stack.pop() + for childnode in current_node.children_node: + stack.append(childnode) + for runtimenode in current_node.runtime_node: + for devicenode in runtimenode.device_node: + device_nodes.append(devicenode) + return device_nodes + + def wrap_tree(nodetrees): ''' Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. @@ -186,16 +203,6 @@ def parse(self, nodetrees): CPUTimeRange[hostnode.type].append( (hostnode.start_ns, hostnode.end_ns)) self.call_times[hostnode.type] += 1 - if hostnode.type == TracerEventType.Operator and any([ - name in hostnode.name for name in _CommunicationOpName - ]): # special case, communication op - CPUTimeRange[TracerEventType.Communication].append( - (hostnode.start_ns, hostnode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 - is_communication_node = ( - hostnode.type == TracerEventType.Communication - ) or (hostnode.type == TracerEventType.Operator and any( - [name in hostnode.name for name in _CommunicationOpName])) for runtimenode in hostnode.runtime_node: CPUTimeRange[runtimenode.type].append( (runtimenode.start_ns, runtimenode.end_ns)) @@ -205,12 +212,6 @@ def parse(self, nodetrees): devicenode.stream_id].append( (devicenode.start_ns, devicenode.end_ns)) self.call_times[devicenode.type] += 1 - if is_communication_node: # gpu activity for communication node - GPUTimeRange[devicenode.device_id][ - TracerEventType.Communication][ - devicenode.stream_id].append(( - devicenode.start_ns, devicenode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 for event_type, time_ranges in CPUTimeRange.items(): time_ranges = merge_self_ranges(time_ranges, is_sorted=False) @@ -243,6 +244,78 @@ def get_cpu_range_sum(self, event_type): return self.CPUTimeRangeSum[event_type] +class DistributedSummary: + r""" + Analysis communication and computation time range, and their overlap. + The computation time is all kernel except kernels for communication like nccl. 
+    """
+
+    def __init__(self):
+        self.cpu_communication_range = []
+        self.gpu_communication_range = []
+        self.communication_range = []
+        self.computation_range = []
+        self.overlap_range = []
+
+    def parse(self, nodetrees):
+        '''
+        Collect all communication and computation time ranges.
+        '''
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, hostnodes in thread2hostnodes.items():
+            for hostnode in hostnodes[1:]:  #skip root node
+                # case 1: TracerEventType is Communication
+                if hostnode.type == TracerEventType.Communication:
+                    self.cpu_communication_range.append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    device_nodes = get_device_nodes(hostnode)
+                    for device_node in device_nodes:
+                        if device_node.type == TracerEventType.Kernel:
+                            self.gpu_communication_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                        else:
+                            self.computation_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                #case 2: TracerEventType is Operator but is communication op
+                elif hostnode.type == TracerEventType.Operator and any([
+                        name in hostnode.name.lower()
+                        for name in _CommunicationOpName
+                ]):
+                    self.cpu_communication_range.append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    device_nodes = get_device_nodes(hostnode)
+                    for device_node in device_nodes:
+                        if device_node.type == TracerEventType.Kernel:
+                            self.gpu_communication_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                        else:
+                            self.computation_range.append(
+                                (device_node.start_ns, device_node.end_ns))
+                #case 3: Others, filter kernels named with nccl
+                else:
+                    for runtimenode in hostnode.runtime_node:
+                        for devicenode in runtimenode.device_node:
+                            if devicenode.type == TracerEventType.Kernel:
+                                if 'nccl' in devicenode.name.lower():
+                                    self.gpu_communication_range.append((
+                                        devicenode.start_ns, devicenode.end_ns))
+                                else:
+                                    self.computation_range.append((
+                                        devicenode.start_ns, devicenode.end_ns))
+        self.cpu_communication_range = merge_self_ranges(
+            self.cpu_communication_range, is_sorted=False)
+        self.gpu_communication_range = merge_self_ranges(
+            self.gpu_communication_range, is_sorted=False)
+        self.communication_range = merge_ranges(
+            self.cpu_communication_range,
+            self.gpu_communication_range,
+            is_sorted=True)
+        self.computation_range = merge_self_ranges(
+            self.computation_range, is_sorted=False)
+        self.overlap_range = intersection_ranges(
+            self.communication_range, self.computation_range, is_sorted=True)
+
+
 class EventSummary:
     r"""
     Analyse operator event in profiling data, correlate with its device event.
@@ -473,8 +546,10 @@ def __init__(self, node_trees, extra_info): self.extra_info = extra_info self.time_range_summary = TimeRangeSummary() self.event_summary = EventSummary() + self.distributed_summary = DistributedSummary() self.time_range_summary.parse(node_trees) self.event_summary.parse(node_trees) + self.distributed_summary.parse(node_trees) def _build_table(statistic_data, @@ -610,7 +685,11 @@ def format_ratio(ratio, indent=0): gpu_type_time = collections.defaultdict(int) for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( ): - cpu_type_time[event_type] = value + if event_type != TracerEventType.Communication: + cpu_type_time[event_type] = value + if statistic_data.distributed_summary.cpu_communication_range: + cpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.cpu_communication_range) gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -620,6 +699,9 @@ def format_ratio(ratio, indent=0): gpu_time_range[event_type], time_range, is_sorted=True) for event_type, time_range in gpu_time_range.items(): gpu_type_time[event_type] = sum_ranges(time_range) + if statistic_data.distributed_summary.gpu_communication_range: + gpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.gpu_communication_range) sorted_items = sorted( cpu_type_time.items(), key=lambda x: x[1], reverse=True) @@ -735,7 +817,7 @@ def format_ratio(ratio, indent=0): append('') ###### Print Distribution Summary Report ###### - if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: + if statistic_data.distributed_summary.communication_range: headers = [ 'Name', 'Total Time', @@ -759,33 +841,12 @@ def format_ratio(ratio, indent=0): append(header_sep) append(row_format.format(*headers)) append(header_sep) - cpu_communication_time_range = [] - gpu_communication_time_range = [] - cpu_communication_time_range = merge_ranges( - statistic_data.time_range_summary.CPUTimeRange[ - TracerEventType.Communication], cpu_communication_time_range) - kernel_time_range = [] - for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( - ): - kernel_time_range = merge_ranges( - device_time_ranges[TracerEventType.Kernel], - kernel_time_range, - is_sorted=True) - gpu_communication_time_range = merge_ranges( - device_time_ranges[TracerEventType.Communication], - gpu_communication_time_range, - is_sorted=True) - communication_time_range = merge_ranges( - cpu_communication_time_range, - gpu_communication_time_range, - is_sorted=True) - computation_time_range = subtract_ranges(kernel_time_range, - gpu_communication_time_range) - overlap_time_range = intersection_ranges(communication_time_range, - computation_time_range) - communication_time = sum_ranges(communication_time_range) - computation_time = sum_ranges(computation_time_range) - overlap_time = sum_ranges(overlap_time_range) + communication_time = sum_ranges( + statistic_data.distributed_summary.communication_range) + computation_time = sum_ranges( + statistic_data.distributed_summary.computation_range) + overlap_time = sum_ranges( + statistic_data.distributed_summary.overlap_range) row_values = [ 'Communication', format_time( communication_time, unit=time_unit), @@ -808,9 +869,9 @@ def format_ratio(ratio, indent=0): append(row_format.format(*row_values)) append(header_sep) append( - "Note:\nCommunication time: Communication Op time and 
its kernel time on gpu.\n" - "Computation time: Kernel time, substract kernels belong to communication op.\n" - "Overlap time: Communication time intersect with computation time.\n" + "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" + "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" + "Overlap time: Communication time intersects with computation time.\n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -976,7 +1037,7 @@ def format_ratio(ratio, indent=0): - 5] device_node_name += "..." row_values = [ - ' {}'.format(device_node_name), devicenode.call, + ' {}'.format(device_node_name), devicenode.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( @@ -1001,10 +1062,14 @@ def format_ratio(ratio, indent=0): 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', 'GPU Total / Avg / Max / Min / Ratio(%)' ] + name_column_width = 0 + for name, item in statistic_data.event_summary.memory_manipulation_items.items( + ): + name_column_width = max(len(name), name_column_width) row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = min(name_column_width, 45) add_column(name_column_width) add_column(6) add_column(40) @@ -1059,7 +1124,7 @@ def format_ratio(ratio, indent=0): row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = 35 add_column(name_column_width) add_column(6) add_column(40) diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py index 1f11649928a7fe..76dd1f0a6439d1 100644 --- a/python/paddle/profiler/statistic_helper.py +++ b/python/paddle/profiler/statistic_helper.py @@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False): else: indx1 += 1 while indx2 < len2: + range2 = range_list2[indx2] if range2[1] > merged_ranges[-1][1]: if range2[0] <= merged_ranges[-1][1]: merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 7fa7a27bad7bf5..b9c27efb19bff2 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -17,8 +17,8 @@ import functools from contextlib import ContextDecorator +from paddle.fluid import core from paddle.fluid.core import (_RecordEvent, TracerEventType) -import paddle.fluid.core as core _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, From 3358cc8cb965fc77c4e8788a7f7d7bd8f437c856 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 12:59:44 +0000 Subject: [PATCH 03/15] fix doc according to review --- python/paddle/profiler/profiler.py | 96 +++++++++++--------- python/paddle/profiler/profiler_statistic.py | 28 +++--- python/paddle/profiler/utils.py | 17 ++-- 3 files changed, 76 insertions(+), 65 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 08f9f6de411d26..1a3680b683b410 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -29,17 +29,15 @@ class ProfilerState(Enum): r""" - Profiler state that can be specified to control profiler action. - - CLOSED: The profilers are closed. - - READY: The profilers are open, but the data will not be recorded. - This state is used for reducing overhead influence when profilers start. - - RECORD: The profilers are open, and the data will be recorded. 
- - RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - the collected data will be returned. + ProfilerState is used to present the state of :ref:`Profiler ` . + + The meaning of each ProfilerState is as following: + - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. + This state is used for reducing overhead influence when profiler starts. + - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. + The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -49,11 +47,11 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - Target device for profiling. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. - CPU: Profile events on CPU. - - GPU: Profile events on GPU. + The meaning of each ProfilerState is as following: + - **ProfilerTarget.CPU** : Profile events on CPU. + - **ProfilerTarget.GPU** : Profile events on GPU. """ CPU = 0 GPU = 1 @@ -66,7 +64,7 @@ def make_scheduler(*, repeat: int=0, skip_first: int=0) -> Callable: r""" - Return a scheduler function, which scheduler the state according to the setting. + Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: .. code-block:: text @@ -78,22 +76,23 @@ def make_scheduler(*, - - - - - - - - - - - - Note that repeat <= 0 means the cycle will continue until the profiler exits. - Parameters: + Args: closed(int): The number of steps in state ProfilerState.CLOSED. ready(int): The number of steps in state ProfilerState.READY. - record(int): The number of steps in state ProfilerState.RECORD. - repeat(int): The number of cycles to repeat above state transform. - skip_first(int): The number of first steps to drop, not participate in the state transform. + record(int): The number of steps in state ProfilerState.RECORD, and the state in last step will be set as ProfilerState.RECORD_AND_RETURN. + repeat(int, optional): The number of cycles to repeat above state transform. Default value is 0, which means it will repeat this cycle until profiler exits. + skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0. Returns: - A scheduler function, conforms to above state transform setting. + A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState. Examples: 1. profiling range [2, 5] - batch 0: closed, batch 1: ready, batch [2, 5] record + Assume batch 0: closed, batch 1: ready, batch [2, 5] record .. code-block:: python + :name: code-example1 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) @@ -101,9 +100,10 @@ def make_scheduler(*, 2. profiling range [3,6], [9,12], [15,18]... - batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat .. 
code-block:: python + :name: code-example2 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) @@ -151,15 +151,17 @@ def export_chrome_tracing(dir_name: str, The output file will be saved in directory 'dir_name', and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -198,15 +200,17 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: The output file will be saved in directory 'dir_name', and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -250,20 +254,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: class Profiler: r""" - Profiler context manager, user interface to manage profile process. + Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. - Parameters: - targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . - scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. - If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + Args: + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. + If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). - on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. 
+ on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). Examples: - 1. profiling range [2, 5) + 1. profiling range [2, 5). .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -278,6 +283,7 @@ class Profiler: 2. profiling range [2,4], [7, 9], [11,13] .. code-block:: python + :name: code-example2 # required: gpu import paddle.profiler as profiler @@ -292,6 +298,7 @@ class Profiler: 3. Use profiler without context manager, and use default parameters .. code-block:: python + :name: code-example3 # required: gpu import paddle.profiler as profiler @@ -373,6 +380,7 @@ def start(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -407,6 +415,7 @@ def stop(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -443,6 +452,7 @@ def step(self): Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -530,13 +540,14 @@ def export(self, path="", format="json"): r""" Exports the tracing data to file. - Parameters: + Args: path(str): file path of the output. - format(str): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf. + format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json". Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -561,14 +572,15 @@ def summary(self, r""" Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. - Parameters: - sorted_by(SortedKeys): how to rank the op table items. - op_detail(bool): expand each operator detail information. - thread_sep(bool): print op table each thread. - time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] + Args: + sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. + op_detail(bool, optional): expand each operator detail information, default value is True. + thread_sep(bool, optional): print op table each thread, default value is False. + time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'. Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 61e02c4712c917..b9b0fa5ff2a742 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -33,23 +33,17 @@ class SortedKeys(Enum): r""" - Sorted keys for printing summary table. - - CPUTotal: Sorted by CPU total time. - - CPUAvg: Sorted by CPU average time. - - CPUMax: Sorted by CPU max time. - - CPUMin: Sorted by CPU min time. - - GPUTotal: Sorted by GPU total time. - - GPUAvg: Sorted by GPU average time. - - GPUMax: Sorted by GPU max time. - - GPUMin: Sorted by GPU min time. + SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. 
+ + The meaning of each SortedKeys is as following: + - **SortedKeys.CPUTotal** : Sorted by CPU total time. + - **SortedKeys.CPUAvg** : Sorted by CPU average time. + - **SortedKeys.CPUMax** : Sorted by CPU max time. + - **SortedKeys.CPUMin** : Sorted by CPU min time. + - **SortedKeys.GPUTotal** : Sorted by GPU total time. + - **SortedKeys.GPUAvg** : Sorted by GPU average time. + - **SortedKeys.GPUMax** : Sorted by GPU max time. + - **SortedKeys.GPUMin** : Sorted by GPU min time. """ CPUTotal = 0 CPUAvg = 1 diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index b9c27efb19bff2..d2b38f78f13fc9 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -30,13 +30,15 @@ class RecordEvent(ContextDecorator): r""" - Interface for recording a time range. + Interface for recording a time range by user defined. - Parameters: + Args: name(str): Name of the record event + event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. Examples: .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator): result = data1 + data2 record_event.end() - Note: - RecordEvent will take effect only when profiler is on and at the state of RECORD. + **Note**: + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. """ def __init__(self, @@ -76,6 +78,7 @@ def begin(self): Record the time of begining. .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -100,6 +103,7 @@ def end(self): Record the time of ending. .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -118,14 +122,15 @@ def load_profiler_result(filename: str): r""" Load dumped profiler data back to memory. - Parameters: + Args: filename(str): Name of the exported protobuf file of profiler data. Returns: - ProfilerResult object. + ProfilerResult object, which stores profiling data. Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler From 3bca1d2ccddac279368bf1c8012335eef985b717 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 24 Mar 2022 15:56:07 +0000 Subject: [PATCH 04/15] fix api doc format --- python/paddle/profiler/profiler.py | 41 ++++++++++---------- python/paddle/profiler/profiler_statistic.py | 2 +- python/paddle/profiler/utils.py | 10 ++--- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 1a3680b683b410..b5cba09120247a 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -31,7 +31,7 @@ class ProfilerState(Enum): r""" ProfilerState is used to present the state of :ref:`Profiler ` . - The meaning of each ProfilerState is as following: + The meaning of each ProfilerState is as following - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts. @@ -49,7 +49,7 @@ class ProfilerTarget(Enum): r""" ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. 
- The meaning of each ProfilerState is as following: + The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. """ @@ -148,7 +148,7 @@ def export_chrome_tracing(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to chrome tracing format file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. Args: @@ -159,9 +159,10 @@ def export_chrome_tracing(dir_name: str, A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: - The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -197,7 +198,7 @@ def handle_fn(prof): def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to protobuf file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. Args: @@ -208,9 +209,9 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. Examples: - The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -257,18 +258,18 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . - scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). 
on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). Examples: 1. profiling range [2, 5). .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -283,7 +284,7 @@ class Profiler: 2. profiling range [2,4], [7, 9], [11,13] .. code-block:: python - :name: code-example2 + :name: code-example2 # required: gpu import paddle.profiler as profiler @@ -298,7 +299,7 @@ class Profiler: 3. Use profiler without context manager, and use default parameters .. code-block:: python - :name: code-example3 + :name: code-example3 # required: gpu import paddle.profiler as profiler @@ -380,7 +381,7 @@ def start(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example4 # required: gpu import paddle.profiler as profiler @@ -415,7 +416,7 @@ def stop(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example5 # required: gpu import paddle.profiler as profiler @@ -452,7 +453,7 @@ def step(self): Examples: .. code-block:: python - :name: code-example1 + :name: code-example6 # required: gpu import paddle.profiler as profiler @@ -547,7 +548,7 @@ def export(self, path="", format="json"): Examples: .. code-block:: python - :name: code-example1 + :name: code-example7 # required: gpu import paddle.profiler as profiler @@ -573,14 +574,14 @@ def summary(self, Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. Args: - sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. + sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. op_detail(bool, optional): expand each operator detail information, default value is True. thread_sep(bool, optional): print op table each thread, default value is False. time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'. Examples: .. code-block:: python - :name: code-example1 + :name: code-example8 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index b9b0fa5ff2a742..fdb1eadfae9865 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -35,7 +35,7 @@ class SortedKeys(Enum): r""" SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. - The meaning of each SortedKeys is as following: + The meaning of each SortedKeys is as following - **SortedKeys.CPUTotal** : Sorted by CPU total time. - **SortedKeys.CPUAvg** : Sorted by CPU average time. - **SortedKeys.CPUMax** : Sorted by CPU max time. diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index d2b38f78f13fc9..1acd583ee0f1a9 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -38,7 +38,7 @@ class RecordEvent(ContextDecorator): Examples: .. 
code-block:: python - :name: code-example1 + :name: code-example1 import paddle import paddle.profiler as profiler @@ -56,7 +56,7 @@ class RecordEvent(ContextDecorator): record_event.end() **Note**: - RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. """ def __init__(self, @@ -78,7 +78,7 @@ def begin(self): Record the time of begining. .. code-block:: python - :name: code-example1 + :name: code-example2 import paddle import paddle.profiler as profiler @@ -103,7 +103,7 @@ def end(self): Record the time of ending. .. code-block:: python - :name: code-example1 + :name: code-example3 import paddle import paddle.profiler as profiler @@ -130,7 +130,7 @@ def load_profiler_result(filename: str): Examples: .. code-block:: python - :name: code-example1 + :name: code-example1 # required: gpu import paddle.profiler as profiler From 00507150967c04c5e5d793782937510e11a3d69a Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 25 Mar 2022 02:09:04 +0000 Subject: [PATCH 05/15] fix api doc according to review --- python/paddle/profiler/profiler.py | 33 ++++++++++++-------- python/paddle/profiler/profiler_statistic.py | 26 +++++++++------ 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index b5cba09120247a..4fb247f712a456 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -29,15 +29,19 @@ class ProfilerState(Enum): r""" - ProfilerState is used to present the state of :ref:`Profiler ` . + ProfilerState is used to present the state of :ref:`Profiler ` . The meaning of each ProfilerState is as following - - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. - This state is used for reducing overhead influence when profiler starts. - - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. - - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. - The collected data will be returned in this state. + + **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + + **ProfilerState.READY** : The profiler is open, but the data will not be recorded. + This state is used for reducing overhead influence when profiler starts. + + **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + + **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. + The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -47,11 +51,13 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. The meaning of each ProfilerState is as following - - **ProfilerTarget.CPU** : Profile events on CPU. - - **ProfilerTarget.GPU** : Profile events on GPU. + + **ProfilerTarget.CPU** : Profile events on CPU. + + **ProfilerTarget.GPU** : Profile events on GPU. 
""" CPU = 0 GPU = 1 @@ -64,7 +70,7 @@ def make_scheduler(*, repeat: int=0, skip_first: int=0) -> Callable: r""" - Return a scheduler function, which scheduler the :ref:`state ` according to the setting. + Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: .. code-block:: text @@ -92,7 +98,7 @@ def make_scheduler(*, Assume batch 0: closed, batch 1: ready, batch [2, 5] record .. code-block:: python - :name: code-example1 + :name: code-example1 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) @@ -103,7 +109,7 @@ def make_scheduler(*, Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat .. code-block:: python - :name: code-example2 + :name: code-example2 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) @@ -210,6 +216,7 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: Examples: The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python :name: code-example1 diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index fdb1eadfae9865..9356d2cacdb48e 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -33,17 +33,25 @@ class SortedKeys(Enum): r""" - SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. + SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. The meaning of each SortedKeys is as following - - **SortedKeys.CPUTotal** : Sorted by CPU total time. - - **SortedKeys.CPUAvg** : Sorted by CPU average time. - - **SortedKeys.CPUMax** : Sorted by CPU max time. - - **SortedKeys.CPUMin** : Sorted by CPU min time. - - **SortedKeys.GPUTotal** : Sorted by GPU total time. - - **SortedKeys.GPUAvg** : Sorted by GPU average time. - - **SortedKeys.GPUMax** : Sorted by GPU max time. - - **SortedKeys.GPUMin** : Sorted by GPU min time. + + **SortedKeys.CPUTotal** : Sorted by CPU total time. + + **SortedKeys.CPUAvg** : Sorted by CPU average time. + + **SortedKeys.CPUMax** : Sorted by CPU max time. + + **SortedKeys.CPUMin** : Sorted by CPU min time. + + **SortedKeys.GPUTotal** : Sorted by GPU total time. + + **SortedKeys.GPUAvg** : Sorted by GPU average time. + + **SortedKeys.GPUMax** : Sorted by GPU max time. + + **SortedKeys.GPUMin** : Sorted by GPU min time. 
""" CPUTotal = 0 CPUAvg = 1 From 42f542c9d4a1f778d44779e45ae02979ff0c1e64 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 25 Mar 2022 10:08:37 +0000 Subject: [PATCH 06/15] fix bug and add unit test --- .../unittests/test_profiler_statistic.py | 222 ++++++++++++++++++ python/paddle/profiler/profiler.py | 14 +- python/paddle/profiler/profiler_statistic.py | 28 +-- 3 files changed, 240 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index 73b501c9c7eade..4769a81e7936c5 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -207,6 +207,228 @@ def test_statistic_case1(self): thread_sep=False, time_unit='ms')) + def test_statistic_case2(self): + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + + dataloader_node = HostPythonNode( + 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + yolonet_node = HostPythonNode( + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 100, 110, 1000, 1001) + reduce_all_launchkernel0 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, + 1000, 1001) + + nccl_reduce_all_kernel0 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120, + 0, 0, 2) + + communication_node = HostPythonNode( + 'Communication', profiler.TracerEventType.Communication, 105, 110, + 1000, 1001) + + reduce_all_op1 = HostPythonNode('cudalaunchkernel', + profiler.TracerEventType.Operator, 105, + 108, 1000, 1001) + + reduce_all_launchkernel1 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107, + 1000, 1001) + + nccl_reduce_all_kernel1 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150, + 0, 0, 2) + + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + sync_batch_norm_node = HostPythonNode('sync_batch_norm', + profiler.TracerEventType.Operator, + 60, 100, 1000, 1001) + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 30, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 30, 40, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, + 1000, 1001) + conv2d_MemCpy = HostPythonNode('AsyncMemcpy', + profiler.TracerEventType.UserDefined, 35, + 40, 1000, 1001) + conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', + profiler.TracerEventType.CudaRuntime, + 35, 40, 1000, 1001) + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode( + 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + 
sync_batch_norm_infer_shape = HostPythonNode( + 'sync_batch_norm::infer_shape', + profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) + sync_batch_norm_compute = HostPythonNode( + 'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner, + 80, 100, 1000, 1001) + sync_batch_norm_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90, + 1000, 1001) + sync_batch_norm_MemCpy = HostPythonNode( + 'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000, + 1001) + sync_batch_norm_cudaMemCpy = HostPythonNode( + 'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000, + 1001) + sync_batch_norm_kernel = DevicePythonNode( + 'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300, + 0, 0, 0) + sync_batch_norm_memcpy = DevicePythonNode( + 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, + 0, 0, 1) + + reduce_all_node2 = HostPythonNode('reduce_all', + profiler.TracerEventType.Operator, + 230, 250, 1000, 1001) + + reduce_all_launchkernel2 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240, + 1000, 1001) + + nccl_reduce_all_kernel2 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280, + 0, 0, 2) + + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, yolonet_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + yolonet_node.children_node.extend( + [sync_batch_norm_node, userdefined_node]) + userdefined_node.children_node.append(communication_node) + userdefined_node.runtime_node.append(reduce_all_launchkernel0) + reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0) + communication_node.children_node.append(reduce_all_op1) + reduce_all_op1.runtime_node.append(reduce_all_launchkernel1) + reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1) + conv2d_node.children_node.extend( + [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_cudaMemCpy.device_node.append(conv2d_memcpy) + sync_batch_norm_node.children_node.extend([ + sync_batch_norm_infer_shape, sync_batch_norm_compute, + sync_batch_norm_MemCpy + ]) + sync_batch_norm_compute.runtime_node.append( + sync_batch_norm_launchkernel) + sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) + sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) + sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) + optimization_node.children_node.append(reduce_all_node2) + reduce_all_node2.runtime_node.append(reduce_all_launchkernel2) + reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + distributed_summary = statistic_data.distributed_summary + + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.ProfileStep), 400) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Forward), 100) + self.assertEqual( + 
time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Backward), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Optimization), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Operator), 78) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.OperatorInner), 45) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.CudaRuntime), 38) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Kernel), 220) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Memcpy), 60) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.UserDefined), 25) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Communication), 5) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.cpu_communication_range), 25) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.gpu_communication_range), 65) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.communication_range), 85) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.computation_range), 220) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.overlap_range), 85) + self.assertEqual(len(event_summary.items), 4) + self.assertEqual(len(event_summary.userdefined_items), 1) + self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.memory_manipulation_items), 1) + self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) + self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual( + event_summary.model_perspective_items['Forward'].cpu_time, 100) + self.assertEqual( + event_summary.model_perspective_items['Forward'].gpu_time, 315) + self.assertEqual( + event_summary.model_perspective_items['Backward'].gpu_time, 0) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=profiler.SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms')) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 4fb247f712a456..d978e594399f40 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -33,15 +33,13 @@ class ProfilerState(Enum): The meaning of each ProfilerState is as following - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. + - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. - This state is used for reducing overhead influence when profiler starts. + - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts. - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. 
- **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. - The collected data will be returned in this state. + - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -55,9 +53,9 @@ class ProfilerTarget(Enum): The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. + - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + - **ProfilerTarget.GPU** : Profile events on GPU. """ CPU = 0 GPU = 1 diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 9356d2cacdb48e..8bb30d111c8048 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -37,21 +37,21 @@ class SortedKeys(Enum): The meaning of each SortedKeys is as following - **SortedKeys.CPUTotal** : Sorted by CPU total time. + - **SortedKeys.CPUTotal** : Sorted by CPU total time. - **SortedKeys.CPUAvg** : Sorted by CPU average time. + - **SortedKeys.CPUAvg** : Sorted by CPU average time. - **SortedKeys.CPUMax** : Sorted by CPU max time. + - **SortedKeys.CPUMax** : Sorted by CPU max time. - **SortedKeys.CPUMin** : Sorted by CPU min time. + - **SortedKeys.CPUMin** : Sorted by CPU min time. - **SortedKeys.GPUTotal** : Sorted by GPU total time. + - **SortedKeys.GPUTotal** : Sorted by GPU total time. - **SortedKeys.GPUAvg** : Sorted by GPU average time. + - **SortedKeys.GPUAvg** : Sorted by GPU average time. - **SortedKeys.GPUMax** : Sorted by GPU max time. + - **SortedKeys.GPUMax** : Sorted by GPU max time. - **SortedKeys.GPUMin** : Sorted by GPU min time. + - **SortedKeys.GPUMin** : Sorted by GPU min time. 
""" CPUTotal = 0 CPUAvg = 1 @@ -274,10 +274,8 @@ def parse(self, nodetrees): for device_node in device_nodes: if device_node.type == TracerEventType.Kernel: self.gpu_communication_range.append( - (devicenode.start_ns, devicenode.end_ns)) - else: - self.computation_range.append( - (devicenode.start_ns, devicenode.end_ns)) + (device_node.start_ns, device_node.end_ns)) + #case 2: TracerEventType is Operator but is communication op elif hostnode.type == TracerEventType.Operator and any([ name in hostnode.name.lower() @@ -289,10 +287,8 @@ def parse(self, nodetrees): for device_node in device_nodes: if device_node.type == TracerEventType.Kernel: self.gpu_communication_range.append( - (devicenode.start_ns, devicenode.end_ns)) - else: - self.computation_range.append( - (devicenode.start_ns, devicenode.end_ns)) + (device_node.start_ns, device_node.end_ns)) + #case 3: Others, filter kernels named with nccl else: for runtimenode in hostnode.runtime_node: From 2bf81d557187702f5d894d8469d335f0aebad199 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 06:03:06 +0000 Subject: [PATCH 07/15] fix record event bug --- paddle/fluid/platform/profiler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 940fc98d3b3202..5cd909b4e08f0d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -164,6 +164,7 @@ void RecordEvent::End() { #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook && is_pushed_) { dynload::nvtxRangePop(); + is_pushed_ = false; } #endif #endif From a73802ad6242e8c6925c94597abeb76b1163c08c Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:11:14 +0000 Subject: [PATCH 08/15] optimize chrome tracing display --- .../platform/profiler/chrometracing_logger.cc | 122 +++++++++++------- .../platform/profiler/chrometracing_logger.h | 1 + paddle/fluid/platform/profiler/utils.h | 11 +- python/paddle/profiler/utils.py | 44 ++++--- 4 files changed, 108 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 4061e2d4d494d5..478b601f88822c 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include #include "glog/logging.h" @@ -75,6 +76,15 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { // log all nodes except root node, root node is a helper node. 
const std::map> thread2host_event_nodes = node_trees.Traverse(true); + // find the earliest time in current timeline + start_time_ = std::numeric_limits::max(); + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + if ((*(it->second.begin() + 1))->StartNs() < start_time_) { + start_time_ = (*(it->second.begin() + 1))->StartNs(); + } + } + for (auto it = thread2host_event_nodes.begin(); it != thread2host_event_nodes.end(); ++it) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); @@ -114,38 +124,44 @@ void ChromeTracingLogger::LogHostTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(Python)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(Python)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), + host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; default: output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), + host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; } @@ -161,22 +177,24 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_running", "args": { "correlation id": %d, - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - runtime_node.Name().c_str(), runtime_node.ProcessId(), - runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), - nsToUs(runtime_node.Duration()), + runtime_node.Name().c_str(), nsToMsFloat(runtime_node.Duration()), + runtime_node.ProcessId(), runtime_node.ThreadId(), + nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId(), runtime_node.StartNs(), - runtime_node.EndNs()); + runtime_node.CorrelationId(), + nsToUsFloat(runtime_node.StartNs(), start_time_), + nsToUsFloat(runtime_node.EndNs(), start_time_)); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); output_file_stream_ << 
string_format( @@ -270,12 +288,13 @@ void ChromeTracingLogger::HandleTypeKernel( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, @@ -284,15 +303,16 @@ void ChromeTracingLogger::HandleTypeKernel( "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "theoretical achieved occupancy %%": %f + "theoretical achieved occupancy %%": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, @@ -312,22 +332,24 @@ void ChromeTracingLogger::HandleTypeMemcpy( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "stream": %d, "correlation id": %d, - "bytes": %d, "memory bandwidth (GB/s)": %f + "bytes": %d, "memory bandwidth (GB/s)": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } @@ -338,23 +360,25 @@ void ChromeTracingLogger::HandleTypeMemset( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), - device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), + device_node.DeviceId(), device_node.StreamId(), + nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - 
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 20a924a54cabdf..8977ab748c63a3 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -50,6 +50,7 @@ class ChromeTracingLogger : public BaseLogger { static const char* categary_name_[]; std::set> pid_tid_set_; std::set> deviceid_streamid_set_; + uint64_t start_time_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index b471d6b79833a1..06d1636c4617ca 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -44,7 +44,16 @@ static std::string GetStringFormatLocalTime() { return std::string(buf); } -static int64_t nsToUs(int64_t ns) { return ns / 1000; } +static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { + return (end_ns - start_ns) / 1000; +} + +static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000; +} +static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000 / 1000; +} #ifdef PADDLE_WITH_CUPTI float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 1acd583ee0f1a9..291326478e91bc 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -77,17 +77,19 @@ def begin(self): r""" Record the time of begining. - .. code-block:: python - :name: code-example2 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_sub") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 - data2 - record_event.end() + .. code-block:: python + :name: code-example2 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_sub") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 - data2 + record_event.end() """ if self.event_type not in _AllowedEventTypeList: warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ @@ -102,17 +104,19 @@ def end(self): r''' Record the time of ending. - .. code-block:: python - :name: code-example3 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_mul") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 * data2 - record_event.end() + .. 
code-block:: python + :name: code-example3 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_mul") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 * data2 + record_event.end() ''' if self.event: self.event.end() From cb536f87d446460d42ec62fd68464646d48a6534 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:19:42 +0000 Subject: [PATCH 09/15] fix bug --- paddle/fluid/platform/profiler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index b922d8f55b4a17..5cd909b4e08f0d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name, void RecordEvent::End() { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA - if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) { + if (g_enable_nvprof_hook && is_pushed_) { dynload::nvtxRangePop(); is_pushed_ = false; } From 03eeb4a086eec68e50b0d9f526ee6b277b13f7d8 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 09:24:31 +0000 Subject: [PATCH 10/15] add comment --- paddle/fluid/platform/profiler/chrometracing_logger.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 478b601f88822c..3ca3ee5ed829e2 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -120,6 +120,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( case TracerEventType::Optimization: case TracerEventType::PythonOp: case TracerEventType::PythonUserDefined: + // cname value comes from tracing.js reservedColorsByName variable output_file_stream_ << string_format( std::string( R"JSON( From 982715f94741644e3a69bf888e1c21bfa31fd725 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Mar 2022 10:47:14 +0000 Subject: [PATCH 11/15] add unit test --- .../platform/profiler/chrometracing_logger.cc | 82 +++++++++++++------ .../fluid/tests/unittests/test_newprofiler.py | 10 +++ 2 files changed, 69 insertions(+), 23 deletions(-) mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_newprofiler.py diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 3ca3ee5ed829e2..897297e0117e1e 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -112,6 +112,13 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } + std::string dur_display; + float dur = nsToMsFloat(host_node.Duration()); + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } switch (host_node.Type()) { case TracerEventType::ProfileStep: case TracerEventType::Forward: @@ -121,11 +128,12 @@ void ChromeTracingLogger::LogHostTraceEventNode( case TracerEventType::PythonOp: case TracerEventType::PythonUserDefined: // cname value comes from tracing.js reservedColorsByName variable + output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(Python)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_runnable", 
@@ -135,9 +143,9 @@ void ChromeTracingLogger::LogHostTraceEventNode( } }, )JSON"), - host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), - host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); @@ -147,7 +155,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_runnable", @@ -157,9 +165,9 @@ void ChromeTracingLogger::LogHostTraceEventNode( } }, )JSON"), - host_node.Name().c_str(), nsToMsFloat(host_node.Duration()), - host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); @@ -174,11 +182,18 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( if (!output_file_stream_) { return; } + float dur = nsToMsFloat(runtime_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": "%lld(C++)", + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "thread_state_running", @@ -189,7 +204,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( } }, )JSON"), - runtime_node.Name().c_str(), nsToMsFloat(runtime_node.Duration()), + runtime_node.Name().c_str(), dur_display.c_str(), runtime_node.ProcessId(), runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], @@ -218,6 +233,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( if (!output_file_stream_) { return; } + switch (device_node.Type()) { case TracerEventType::Kernel: HandleTypeKernel(device_node); @@ -284,12 +300,18 @@ void ChromeTracingLogger::HandleTypeKernel( kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, blocks_per_sm); #endif - + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -308,9 +330,9 @@ void ChromeTracingLogger::HandleTypeKernel( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), 
device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), @@ -329,11 +351,18 @@ void ChromeTracingLogger::HandleTypeMemcpy( if (device_node.Duration() > 0) { memory_bandwidth = memcpy_info.num_bytes * 1.0 / device_node.Duration(); } + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -345,9 +374,9 @@ void ChromeTracingLogger::HandleTypeMemcpy( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), @@ -357,11 +386,18 @@ void ChromeTracingLogger::HandleTypeMemcpy( void ChromeTracingLogger::HandleTypeMemset( const DeviceTraceEventNode& device_node) { MemsetEventInfo memset_info = device_node.MemsetInfo(); + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s[%.3f ms]", "pid": %lld, "tid": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", "cname": "rail_animation", @@ -374,9 +410,9 @@ void ChromeTracingLogger::HandleTypeMemset( } }, )JSON"), - device_node.Name().c_str(), nsToMsFloat(device_node.Duration()), - device_node.DeviceId(), device_node.StreamId(), - nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py old mode 100755 new mode 100644 index 12fb0fa61b005f..837d133b91b364 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -125,5 +125,15 @@ def my_sheduler1(num_step): result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') +class TestNvprof(unittest.TestCase): + def test_nvprof(self): + for i in range(10): + paddle.fluid.profiler._nvprof_range(i, 10, 20) + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + y = x / 2.0 + + if __name__ == 
'__main__': unittest.main() From e190ec49d97d0d61a30a52af87daab291af71f21 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:15:45 +0000 Subject: [PATCH 12/15] fix a bug --- .../platform/profiler/chrometracing_logger.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 897297e0117e1e..6f32bcc6878c10 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -80,8 +80,19 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { start_time_ = std::numeric_limits::max(); for (auto it = thread2host_event_nodes.begin(); it != thread2host_event_nodes.end(); ++it) { - if ((*(it->second.begin() + 1))->StartNs() < start_time_) { - start_time_ = (*(it->second.begin() + 1))->StartNs(); + if (it->second.begin() + 1 != it->second.end()) { + if ((*(it->second.begin() + 1))->StartNs() < start_time_) { + start_time_ = (*(it->second.begin() + 1))->StartNs(); + } else { + auto runtimenode = + (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); + if (runtimenode != + (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { + if ((*runtimenode)->StartNs() < start_time_) { + start_time_ = (*runtimenode)->StartNs(); + } + } + } } } From 5bcbbe5486fb6b717d6466f55440823181dbccbd Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:17:57 +0000 Subject: [PATCH 13/15] fix --- python/paddle/fluid/tests/unittests/test_newprofiler.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_newprofiler.py diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py old mode 100644 new mode 100755 From 9ac247f860f9fdc8c1d69268c272187c0abdb42f Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 29 Mar 2022 05:26:09 +0000 Subject: [PATCH 14/15] fix --- .../platform/profiler/chrometracing_logger.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 6f32bcc6878c10..0c5d90a9277a69 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -83,14 +83,14 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { if (it->second.begin() + 1 != it->second.end()) { if ((*(it->second.begin() + 1))->StartNs() < start_time_) { start_time_ = (*(it->second.begin() + 1))->StartNs(); - } else { - auto runtimenode = - (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); - if (runtimenode != - (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { - if ((*runtimenode)->StartNs() < start_time_) { - start_time_ = (*runtimenode)->StartNs(); - } + } + } else { + auto runtimenode = + (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); + if (runtimenode != + (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { + if ((*runtimenode)->StartNs() < start_time_) { + start_time_ = (*runtimenode)->StartNs(); } } } From b8dfe9dcdc99ec7e13f8aa4573d52eadb6ecd0c7 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 30 Mar 2022 03:49:54 +0000 Subject: [PATCH 15/15] fix format --- python/paddle/fluid/tests/unittests/test_newprofiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 1dd59f9f38c8dd..0088687b125636 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -128,7 +128,6 @@ def my_sheduler1(num_step): result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') - class TestNvprof(unittest.TestCase): def test_nvprof(self): for i in range(10): @@ -138,6 +137,7 @@ def test_nvprof(self): x_value, stop_gradient=False, place=paddle.CPUPlace()) y = x / 2.0 + class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples
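Note on the duration labels introduced in [PATCH 08/15] and refined in [PATCH 11/15]: event names written to the exported chrome tracing file now carry a readable duration suffix, shown in milliseconds when the event lasts longer than 1 ms and in microseconds otherwise. A minimal Python sketch of the equivalent formatting rule follows; the helper name format_duration_label is illustrative only and is not part of the patches:

    def format_duration_label(duration_ns):
        # Durations above 1 ms are labelled in ms, shorter ones in us,
        # both with three decimal places, mirroring the C++ dur_display logic.
        duration_ms = duration_ns / 1e6
        if duration_ms > 1.0:
            return "%.3f ms" % duration_ms
        return "%.3f us" % (duration_ms * 1000.0)

    # e.g. format_duration_label(2_500_000) -> "2.500 ms"
    #      format_duration_label(350_000)   -> "350.000 us"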