Save PyTorch profile data to CSV (#7114)

Enable to save pytorch profiling results to csv (top 5 most time consuming ops). - The --write-csv argument has been modified so that it can now take three values: **None** (do not write data to csv) **bench** (write benchmark data to csv) **prof** (write pytorch profiler data to csv) - Added new argument --export-chrome-trace, that enables to export chrome trace file (defaults to true). Example command to profile with pytorch and save results to csv: `python ./inference_benchmark.py --profile --write-csv prof --export-chrome-trace False` --------- Co-authored-by: rusty1s <matthias.fey@tu-dortmund.de>
pyg-team · Apr 18, 2023 · 9236cb7 · 9236cb7
1 parent f1f24a0
commit 9236cb7
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 32 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added an option to benchmark scripts to write PyTorch profiler results to CSV ([#7114](https://github.com/pyg-team/pytorch_geometric/pull/7114))
 - Added subgraph type sampling option with bidirectional edge support ([#7199](https://github.com/pyg-team/pytorch_geometric/pull/7199), [#7200](https://github.com/pyg-team/pytorch_geometric/pull/7200))
 - Added support for `"any"`-reductions in `scatter` ([#7198](https://github.com/pyg-team/pytorch_geometric/pull/7198))
 - Added manual sampling interface to `NodeLoader` and `LinkLoader` ([#7197](https://github.com/pyg-team/pytorch_geometric/pull/7197))

diff --git a/benchmark/inference/inference_benchmark.py b/benchmark/inference/inference_benchmark.py
@@ -1,4 +1,5 @@
 import argparse
+import warnings
 from collections import defaultdict
 from contextlib import nullcontext
 
@@ -37,6 +38,10 @@ def full_batch_inference(model, data):
 def run(args: argparse.ArgumentParser):
     csv_data = defaultdict(list)
 
+    if args.write_csv == 'prof' and not args.profile:
+        warnings.warn("Cannot write profile data to CSV because profiling is "
+                      "disabled")
+
     # cuda device is not suitable for full batch mode
     device = torch.device(
         'cuda' if not args.full_batch and torch.cuda.is_available() else 'cpu')
@@ -170,7 +175,8 @@ def run(args: argparse.ArgumentParser):
                         else:
                             cpu_affinity = nullcontext()
                         profile = torch_profile(
-                        ) if args.profile else nullcontext()
+                            args.export_chrome_trace, csv_data,
+                            args.write_csv) if args.profile else nullcontext()
                         itt = emit_itt(
                         ) if args.vtune_profile else nullcontext()
 
@@ -213,7 +219,7 @@ def run(args: argparse.ArgumentParser):
                                         print(f'Mini Batch Test Accuracy: \
                                             {test_acc:.4f}')
 
-                        if args.profile:
+                        if args.profile and args.export_chrome_trace:
                             rename_profile_file(model_name, dataset_name,
                                                 str(batch_size), str(layers),
                                                 str(hidden_channels),
@@ -228,13 +234,26 @@ def run(args: argparse.ArgumentParser):
                         print(f'Throughput: {throughput:.3f} samples/s')
                         print(f'Latency: {latency:.3f} ms')
 
-                        save_benchmark_data(csv_data, batch_size, layers,
-                                            num_neighbors, hidden_channels,
-                                            total_time, model_name,
-                                            dataset_name,
-                                            args.use_sparse_tensor)
+                        num_records = 1
+                        if args.write_csv == 'prof':
+                            # For profiling with PyTorch, we save the top-5
+                            # most time consuming operations. Therefore, the
+                            # same data should be entered for each of them.
+                            num_records = 5
+                        for _ in range(num_records):
+                            save_benchmark_data(
+                                csv_data,
+                                batch_size,
+                                layers,
+                                num_neighbors,
+                                hidden_channels,
+                                total_time,
+                                model_name,
+                                dataset_name,
+                                args.use_sparse_tensor,
+                            )
     if args.write_csv:
-        write_to_csv(csv_data)
+        write_to_csv(csv_data, args.write_csv)
 
 
 if __name__ == '__main__':
@@ -274,5 +293,8 @@ def run(args: argparse.ArgumentParser):
     add('--full-batch', action='store_true', help='Use full batch mode')
     add('--evaluate', action='store_true')
     add('--ckpt_path', type=str, help='Checkpoint path for loading a model')
-    add('--write-csv', action='store_true', help='Write benchmark data to csv')
+    add('--write-csv', choices=[None, 'bench', 'prof'], default=None,
+        help='Write benchmark or PyTorch profile data to CSV')
+    add('--export-chrome-trace', default=True, type=bool,
+        help='Export chrome trace file. Works only with PyTorch profiler')
     run(argparser.parse_args())
diff --git a/benchmark/training/training_benchmark.py b/benchmark/training/training_benchmark.py
@@ -82,6 +82,10 @@ def train_hetero(model, loader, optimizer, device, progress_bar=True, desc="",
 def run(args: argparse.ArgumentParser):
     csv_data = defaultdict(list)
 
+    if args.write_csv == 'prof' and not args.profile:
+        warnings.warn("Cannot write profile data to CSV because profiling is "
+                      "disabled")
+
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     # If we use a custom number of steps, then we need to use RandomSampler,
     # which already does shuffle.
@@ -242,15 +246,19 @@ def run(args: argparse.ArgumentParser):
                                 print(f'Test Accuracy: {test_acc:.4f}')
 
                             if args.profile:
-                                with torch_profile():
+                                profile = torch_profile(
+                                    args.export_chrome_trace, csv_data,
+                                    args.write_csv)
+                                with profile:
                                     train(model, subgraph_loader, optimizer,
                                           device, progress_bar=progress_bar,
                                           desc="Profile training")
-                                rename_profile_file(model_name, dataset_name,
-                                                    str(batch_size),
-                                                    str(layers),
-                                                    str(hidden_channels),
-                                                    str(num_neighbors))
+                                if args.export_chrome_trace:
+                                    rename_profile_file(
+                                        model_name, dataset_name,
+                                        str(batch_size), str(layers),
+                                        str(hidden_channels),
+                                        str(num_neighbors))
 
                         total_time = t.duration
                         if args.num_steps != -1:
@@ -262,13 +270,26 @@ def run(args: argparse.ArgumentParser):
                         print(f'Throughput: {throughput:.3f} samples/s')
                         print(f'Latency: {latency:.3f} ms')
 
-                        save_benchmark_data(csv_data, batch_size, layers,
-                                            num_neighbors, hidden_channels,
-                                            total_time, model_name,
-                                            dataset_name,
-                                            args.use_sparse_tensor)
+                        num_records = 1
+                        if args.write_csv == 'prof':
+                            # For profiling with PyTorch, we save the top-5
+                            # most time consuming operations. Therefore, the
+                            # same data should be entered for each of them.
+                            num_records = 5
+                        for _ in range(num_records):
+                            save_benchmark_data(
+                                csv_data,
+                                batch_size,
+                                layers,
+                                num_neighbors,
+                                hidden_channels,
+                                total_time,
+                                model_name,
+                                dataset_name,
+                                args.use_sparse_tensor,
+                            )
     if args.write_csv:
-        write_to_csv(csv_data, training=True)
+        write_to_csv(csv_data, args.write_csv, training=True)
 
 
 if __name__ == '__main__':
@@ -309,7 +330,10 @@ def run(args: argparse.ArgumentParser):
         help='Enable filter-per-worker feature of the dataloader.')
     add('--measure-load-time', action='store_true')
     add('--evaluate', action='store_true')
-    add('--write-csv', action='store_true', help='Write benchmark data to csv')
+    add('--write-csv', choices=[None, 'bench', 'prof'], default=None,
+        help='Write benchmark or PyTorch profile data to CSV')
+    add('--export-chrome-trace', default=True, type=bool,
+        help='Export chrome trace file. Works only with PyTorch profiler')
     add('--trim', action='store_true', help="Use `trim_to_layer` optimization")
     args = argparser.parse_args()
 

diff --git a/benchmark/utils/utils.py b/benchmark/utils/utils.py
@@ -141,16 +141,22 @@ def save_benchmark_data(csv_data, batch_size, layers, num_neighbors,
     csv_data['SPARSE'].append(use_sparse_tensor)
 
 
-def write_to_csv(csv_data, training=False):
+def write_to_csv(csv_data, write_csv='bench', training=False):
     import pandas as pd
     results_path = osp.join(osp.dirname(osp.realpath(__file__)), '../results/')
     os.makedirs(results_path, exist_ok=True)
 
     name = 'training' if training else 'inference'
-    csv_path = osp.join(results_path, f'TOTAL_{name}_benchmark.csv')
+    if write_csv == 'bench':
+        csv_file_name = f'TOTAL_{name}_benchmark.csv'
+    else:
+        csv_file_name = f'TOTAL_prof_{name}_benchmark.csv'
+    csv_path = osp.join(results_path, csv_file_name)
+    index_label = 'TEST_ID' if write_csv == 'bench' else 'ID'
+
     with_header = not osp.exists(csv_path)
     df = pd.DataFrame(csv_data)
-    df.to_csv(csv_path, mode='a', index_label='TEST_ID', header=with_header)
+    df.to_csv(csv_path, mode='a', index_label=index_label, header=with_header)
 
 
 @torch.no_grad()

diff --git a/torch_geometric/profile/__init__.py b/torch_geometric/profile/__init__.py
@@ -1,5 +1,10 @@
 from .profile import profileit, timeit, get_stats_summary
-from .profile import trace_handler, rename_profile_file, torch_profile
+from .profile import (
+    trace_handler,
+    print_time_total,
+    rename_profile_file,
+    torch_profile,
+)
 from .utils import count_parameters
 from .utils import get_model_size
 from .utils import get_data_size
@@ -13,6 +18,7 @@
     'timeit',
     'get_stats_summary',
     'trace_handler',
+    'print_time_total',
     'rename_profile_file',
     'torch_profile',
     'count_parameters',

diff --git a/torch_geometric/profile/profile.py b/torch_geometric/profile/profile.py
@@ -5,6 +5,7 @@
 from typing import Any, List, NamedTuple, Tuple
 
 import torch
+from torch.autograd.profiler import EventList
 from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.profile.utils import (
@@ -206,15 +207,19 @@ def read_from_memlab(line_profiler: Any) -> List[float]:  # pragma: no cover
 
 
 def trace_handler(p):
+    print_time_total(p)
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline' + '.json'
+    p.export_chrome_trace(timeline_file)
+
+
+def print_time_total(p):
     if torch.cuda.is_available():
         profile_sort = 'self_cuda_time_total'
     else:
         profile_sort = 'self_cpu_time_total'
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'timeline' + '.json'
-    p.export_chrome_trace(timeline_file)
 
 
 def rename_profile_file(*args):
@@ -227,11 +232,66 @@ def rename_profile_file(*args):
 
 
 @contextmanager
-def torch_profile():
+def torch_profile(export_chrome_trace=True, csv_data=None, write_csv=None):
+    use_cuda = torch.cuda.is_available()
+
     activities = [ProfilerActivity.CPU]
-    if torch.cuda.is_available():
+    if use_cuda:
         activities.append(ProfilerActivity.CUDA)
 
-    with profile(activities=activities, on_trace_ready=trace_handler) as p:
+    if export_chrome_trace:
+        p_trace_handler = trace_handler
+    else:
+        p_trace_handler = print_time_total
+
+    p = profile(activities=activities, on_trace_ready=p_trace_handler)
+
+    with p:
         yield
         p.step()
+
+    if csv_data is not None and write_csv == 'prof':
+        if use_cuda:
+            profile_sort = 'self_cuda_time_total'
+        else:
+            profile_sort = 'self_cpu_time_total'
+        events = EventList(
+            sorted(
+                p.key_averages(),
+                key=lambda evt: getattr(evt, profile_sort),
+                reverse=True,
+            ), use_cuda=use_cuda)
+
+        save_profile_data(csv_data, events, use_cuda)
+
+
+def format_prof_time(time):
+    # Profile time is in micro seconds, so format it appropriately:
+    return round(time / 1e6, 3)
+
+
+def save_profile_data(csv_data, events, use_cuda):
+    sum_self_cpu_time_total = sum(
+        [event.self_cpu_time_total for event in events])
+    sum_cpu_time_total = sum([event.self_cpu_time_total for event in events])
+    sum_self_cuda_time_total = sum(
+        [event.self_cuda_time_total for event in events]) if use_cuda else 0
+
+    for e in events[:5]:  # Save top 5 most time consuming operations:
+        csv_data['NAME'].append(e.key)
+        csv_data['SELF CPU %'].append(
+            round(e.self_cpu_time_total * 100.0 / sum_self_cpu_time_total, 3))
+        csv_data['SELF CPU'].append(format_prof_time(e.self_cpu_time_total))
+        csv_data['CPU TOTAL %'].append(
+            round(e.cpu_time_total * 100.0 / sum_cpu_time_total, 3))
+        csv_data['CPU TOTAL'].append(format_prof_time(e.cpu_time_total))
+        csv_data['CPU TIME AVG'].append(format_prof_time(e.cpu_time_total))
+        if use_cuda:
+            csv_data['SELF CUDA %'].append(e.self_cuda_time_total * 100.0 /
+                                           sum_self_cuda_time_total)
+            csv_data['SELF CUDA'].append(
+                format_prof_time(e.self_cuda_time_total))
+            csv_data['CUDA TOTAL'].append(format_prof_time(e.cpu_time_total))
+            csv_data['CUDA TIME AVG'].append(format_prof_time(
+                e.cpu_time_total))
+        csv_data['# OF CALLS'].append(e.count)