Clarify how traces are collected + Some Minor Documentation Updates #119

Draft · wants to merge 12 commits into main
31 changes: 30 additions & 1 deletion README.md
@@ -83,6 +83,31 @@ Learn more about the features and the API from our [documentation](https://hta.r
### Data Preparation
All traces collected from a job must reside in a unique folder.

An example of trace collection using the PyTorch Profiler is shown below:

```python
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1)
trace_handler = tensorboard_trace_handler(dir_name = "traces/", use_gzip = True)

NUM_EPOCHS = 10 # arbitrary number of epochs to profile

with profile(
    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule = tracing_schedule,
    on_trace_ready = trace_handler,
    profile_memory = True,
    record_shapes = True,
    with_stack = True
) as prof:
    for _ in range(NUM_EPOCHS):
        for step, batch_data in enumerate(data_loader):
            train(batch_data)
            prof.step()
```
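The `schedule` above determines which iterations end up in the trace. As a rough, illustrative sketch of those semantics (not the actual `torch.profiler` implementation, which returns `ProfilerAction` values), each `prof.step()` call advances through skip, wait, warmup, and active phases:

```python
# Sketch of torch.profiler.schedule semantics (illustrative only):
# the first `skip_first` steps are ignored, then each cycle is
# `wait` idle steps, `warmup` steps, and `active` recorded steps,
# repeated `repeat` times (repeat=0 would mean "cycle forever").

def phase(step, skip_first=5, wait=5, warmup=2, active=2, repeat=1):
    if step < skip_first:
        return "skip"
    step -= skip_first
    cycle = wait + warmup + active
    if repeat and step >= repeat * cycle:
        return "done"          # profiling finished after `repeat` cycles
    pos = step % cycle
    if pos < wait:
        return "wait"
    if pos < wait + warmup:
        return "warmup"
    return "record"            # these steps land in the trace

# With the settings above, steps 12 and 13 are the two recorded steps.
recorded = [s for s in range(20) if phase(s) == "record"]
print(recorded)  # [12, 13]
```

With `repeat = 1` only one two-step window is captured, which keeps the trace small regardless of `NUM_EPOCHS`.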

### Analysis in a Jupyter notebook

Activate the Conda environment and launch a Jupyter notebook.
@@ -94,7 +119,11 @@ jupyter notebook
Import HTA, and create a `TraceAnalysis` object
``` python
from hta.trace_analysis import TraceAnalysis
-analyzer = TraceAnalysis(trace_dir = "/path/to/folder/containing/the/traces")
+analyzer = TraceAnalysis(trace_dir = "traces/") # path to the trace folder
+
+# or
+
+analyzer = TraceAnalysis(trace_files={0: 'trace_0.json', 1: 'trace_1.json.gz'})
```
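When passing `trace_files` explicitly, the keys are ranks and the values are trace paths. The mapping can be built from a folder listing; a sketch, assuming a hypothetical `trace_<rank>.json(.gz)` filename convention:

```python
import os
import re
import tempfile

def rank_to_trace_dict(trace_dir):
    """Map rank -> trace path, assuming files named trace_<rank>.json(.gz)."""
    mapping = {}
    for name in os.listdir(trace_dir):
        m = re.match(r"trace_(\d+)\.json(\.gz)?$", name)
        if m:
            mapping[int(m.group(1))] = os.path.join(trace_dir, name)
    return mapping

# demo on a throwaway folder
with tempfile.TemporaryDirectory() as d:
    for fname in ("trace_0.json", "trace_1.json.gz", "notes.txt"):
        open(os.path.join(d, fname), "w").close()
    ranks = sorted(rank_to_trace_dict(d))
print(ranks)  # [0, 1]
```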

#### Basic Usage
2 changes: 1 addition & 1 deletion docs/source/features/augmented_counters.rst
@@ -25,7 +25,7 @@ API.

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
analyzer.generate_trace_with_counters()

A screenshot of the generated trace file with augmented counters.
2 changes: 1 addition & 1 deletion docs/source/features/comm_comp_overlap.rst
@@ -19,7 +19,7 @@ Communication computation overlap can be calculated as follows:

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
overlap_df = analyzer.get_comm_comp_overlap()

The function returns a dataframe containing the overlap percentage
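Conceptually, the overlap percentage is the fraction of communication time during which computation is also running. A toy sketch of that interval arithmetic (a hypothetical helper, not HTA's implementation; it assumes non-overlapping computation intervals):

```python
def overlap_pct(comm, comp):
    """comm, comp: lists of (start, end) intervals in the same time unit.
    Returns the percentage of communication time overlapped by computation."""
    total_comm = sum(e - s for s, e in comm)
    if not total_comm:
        return 0.0
    overlapped = 0
    for cs, ce in comm:
        for ps, pe in comp:
            # length of the intersection of the two intervals, if any
            overlapped += max(0, min(ce, pe) - max(cs, ps))
    return 100.0 * overlapped / total_comm

# one 10-unit communication interval, half covered by a compute kernel
print(overlap_pct([(0, 10)], [(5, 15)]))  # 50.0
```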
3 changes: 2 additions & 1 deletion docs/source/features/cuda_kernel_launch_stats.rst
@@ -12,7 +12,8 @@ CPU operator ending. The kernel launch info can be generated as follows:

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir="/path/to/trace/dir")
+analyzer = TraceAnalysis(trace_dir = "traces/")
+
kernel_info_df = analyzer.get_cuda_kernel_launch_stats()

A screenshot of the generated dataframe is given below.
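The launch delay behind these stats is, roughly, the gap between the CUDA runtime launch call finishing on the CPU and the kernel starting on the GPU. A toy sketch that matches events by correlation id (an assumption about the matching scheme, not HTA's code):

```python
def launch_delays(runtime_events, kernels):
    """Each event: dict with 'corr' (correlation id), 'ts' (start), 'dur' (duration).
    Returns corr -> delay between CPU launch end and GPU kernel start."""
    launch_end = {e["corr"]: e["ts"] + e["dur"] for e in runtime_events}
    return {
        k["corr"]: k["ts"] - launch_end[k["corr"]]
        for k in kernels
        if k["corr"] in launch_end
    }

runtime = [{"corr": 1, "ts": 100, "dur": 5}]   # cudaLaunchKernel on the CPU
gpu = [{"corr": 1, "ts": 130, "dur": 20}]      # the kernel on the GPU
print(launch_delays(runtime, gpu))  # {1: 25}
```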
24 changes: 14 additions & 10 deletions docs/source/features/cupti_counter_analysis.rst
@@ -22,18 +22,21 @@ an example.
.. code-block:: python

 with torch.profiler.profile(
-    activities=[torch.profiler.ProfilerActivity.CUDA,
-                torch.profiler.ProfilerActivity.CPU],
-    record_shapes=True,
-    on_trace_ready=trace_handler,
-    experimental_config=torch.profiler._ExperimentalConfig(
-        profiler_metrics=[
+    activities = [
+        torch.profiler.ProfilerActivity.CUDA,
+        torch.profiler.ProfilerActivity.CPU
+    ],
+    record_shapes = True,
+    on_trace_ready = trace_handler,
+    experimental_config = torch.profiler._ExperimentalConfig(
+        profiler_metrics = [
             "kineto__tensor_core_insts",
             "dram__bytes_read.sum",
-            "dram__bytes_write.sum"],
-        profiler_measure_per_kernel=True),
+            "dram__bytes_write.sum"
+        ],
+        profiler_measure_per_kernel = True),
 ) as prof:
-    res = train_batch(modeldef)
+    res = train_batch(model)
prof.step()

The generated trace contains the following additional information:
@@ -55,7 +58,8 @@ The code below runs CUPTI counter analysis on the collected trace.

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
+
gpu_kernels = analyzer.get_cupti_counter_data_with_operators(ranks=[0])[0]

It returns a list of dataframes, one per rank or trace file. Each dataframe
3 changes: 2 additions & 1 deletion docs/source/features/frequent_cuda_kernels.rst
@@ -22,7 +22,8 @@ be the same across different ranks.

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace_folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
+
cuda_sequences_df = analyzer.get_frequent_cuda_kernel_sequences(
operator_name = "aten::linear",
output_dir = "/tmp/"
2 changes: 1 addition & 1 deletion docs/source/features/idle_time_breakdown.rst
@@ -33,7 +33,7 @@ function. The idle time breakdown can be generated as follows:

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
idle_time_df = analyzer.get_idle_time_breakdown()

.. image:: ../_static/idle_time_breakdown_percentage.png
3 changes: 2 additions & 1 deletion docs/source/features/kernel_breakdown.rst
@@ -13,7 +13,8 @@ The kernel breakdown can be calculated as follows:

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
+
kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown()

The first dataframe returned by the function contains the raw values used to
3 changes: 2 additions & 1 deletion docs/source/features/temporal_breakdown.rst
@@ -25,7 +25,8 @@ The temporal breakdown can be calculated as follows:

.. code-block:: python

-analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+analyzer = TraceAnalysis(trace_dir = "traces/")
+
time_spent_df = analyzer.get_temporal_breakdown()

The function returns a dataframe containing the temporal breakdown for each rank.
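Conceptually, the temporal breakdown splits each rank's total time into compute, non-compute, and idle buckets. A toy sketch (hypothetical column names and numbers, not HTA's actual output schema):

```python
def temporal_breakdown(total_time, compute_time, non_compute_time):
    """Toy version: idle time is whatever remains of the rank's total time."""
    idle = total_time - compute_time - non_compute_time
    pct = lambda t: round(100.0 * t / total_time, 1)
    return {
        "idle_time_pctg": pct(idle),
        "compute_time_pctg": pct(compute_time),
        "non_compute_time_pctg": pct(non_compute_time),
    }

# e.g. 100 units of wall time on a rank: 60 compute, 15 communication/memory
print(temporal_breakdown(100, 60, 15))
# {'idle_time_pctg': 25.0, 'compute_time_pctg': 60.0, 'non_compute_time_pctg': 15.0}
```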
2 changes: 1 addition & 1 deletion docs/source/features/trace_diff.rst
@@ -53,7 +53,7 @@ follows:
.. code-block:: python

df = compare_traces_output.sort_values(by="diff_duration", ascending=False)
-# The duration differerence can be overshadowed by the "ProfilerStep",
+# The duration difference can be overshadowed by the "ProfilerStep",
# so we can filter it out to show the trend of other operators.
df = df.loc[~df.index.str.startswith("ProfilerStep")].head(10)
TraceDiff.visualize_duration_diff(df)
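The same filter-and-sort step can be sketched with plain Python (toy numbers and hypothetical operator names, no pandas required):

```python
# diff_duration per operator, as compare_traces might summarize it (toy data)
diffs = {"ProfilerStep#1": 5000, "aten::mm": 1200, "aten::add": -300, "ncclKernel": 800}

# drop ProfilerStep entries, then sort by duration difference, descending
filtered = sorted(
    ((op, d) for op, d in diffs.items() if not op.startswith("ProfilerStep")),
    key=lambda kv: kv[1],
    reverse=True,
)
print(filtered[:2])  # [('aten::mm', 1200), ('ncclKernel', 800)]
```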
5 changes: 4 additions & 1 deletion docs/source/intro/installation.rst
@@ -32,7 +32,10 @@ Install from source
.. code-block::

# get the source code
-git clone https://github.com/facebookresearch/HolisticTraceAnalysis.git
+git clone https://github.com/facebookresearch/HolisticTraceAnalysis
+
+# move into the cloned directory
+cd HolisticTraceAnalysis

# execute the command below from the root of the repo
pip install -e .
17 changes: 10 additions & 7 deletions docs/source/intro/trace_collection.rst
@@ -23,12 +23,14 @@ To profile, wrap the code in the ``profile`` context manager as shown below.

.. code-block:: python
:linenos:
-:emphasize-lines: 17
+:emphasize-lines: 19

from torch.profiler import profile, schedule, tensorboard_trace_handler

-tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1)
-trace_handler = tensorboard_trace_handler(dir_name=/output/folder, use_gzip=True)
+tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1)
+trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True)
+
+NUM_EPOCHS = 5 # arbitrary number of epochs to profile

with profile(
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA],
@@ -39,9 +41,10 @@ To profile, wrap the code in the ``profile`` context manager as shown below.
with_stack = True
) as prof:

-    for step, batch_data in enumerate(data_loader):
-        train(batch_data)
-        prof.step()
+    for _ in range(NUM_EPOCHS):
+        for step, batch_data in enumerate(data_loader):
+            train(batch_data)
+            prof.step()

-Line 17 in the code snippet above signals to the profiler that a training
+Line 19 in the code snippet above signals to the profiler that a training
iteration has completed.
9 changes: 4 additions & 5 deletions docs/source/intro/using_hta.rst
@@ -13,8 +13,7 @@ Trace Analysis
.. code-block:: python

from hta.trace_analysis import TraceAnalysis
-analyzer = TraceAnalysis(trace_dir = "/trace/folder/path")
-
+analyzer = TraceAnalysis(trace_dir = "traces/")

Using the features is straightforward. E.g.

@@ -48,9 +47,9 @@ Using the features is straightforward. E.g.
cuda_kernel_launch_stats = analyzer.get_cuda_kernel_launch_stats()

# Frequent CUDA kernel sequences
-frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences(operator_name="aten::linear",
-                                                                   output_dir="/output/trace/path"
-)
+frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences(
+    operator_name="aten::linear", output_dir="/output/trace/path"
+)

To learn more about the features in detail we refer the reader to the
**Features** section. The features can be tuned by various
2 changes: 1 addition & 1 deletion examples/cupti_profiler_demo.py
@@ -3,7 +3,7 @@
# LICENSE file in the root directory of this source tree.

"""
-Measuring CUPTI performanc metrics using CUPTI Profiler.
+Measuring CUPTI performance metrics using CUPTI Profiler.
This is supported on V100 and higher NVIDIA GPUs.
"""

4 changes: 2 additions & 2 deletions examples/identify_stragglers.ipynb
@@ -86,7 +86,7 @@
"outputs": [],
"source": [
"# Set path to HolisticTraceAnalysis folder\n",
-"path_to_hta = \"/path/to/HolisticTraceAnalysis\""
+"PATH_TO_TRACES = \"traces/\""
]
},
{
@@ -106,7 +107,7 @@
"%%time\n",
"from hta.trace_analysis import TraceAnalysis\n",
"\n",
-"trace_dir = path_to_hta + \"/tests/data/vision_transformer\"\n",
+"trace_dir = PATH_TO_TRACES + \"/tests/data/vision_transformer\"\n",
"print(trace_dir)\n",
"analyzer = TraceAnalysis(trace_dir = trace_dir)"
]
7 changes: 3 additions & 4 deletions examples/kernel_breakdown_demo.ipynb
@@ -245,10 +245,9 @@
}
],
"source": [
-"kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown( \n",
-"    num_kernels=5, \n",
-"    include_memory_kernels=True, \n",
-"    image_renderer=\"png\")"
+"kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown(\n",
+"    num_kernels=5, include_memory_kernels=True, image_renderer=\"png\"\n",
+")"
]
},
{
2 changes: 1 addition & 1 deletion hta/common/trace_file.py
@@ -58,7 +58,7 @@ def create_rank_to_trace_dict(trace_dir: str) -> Tuple[bool, Dict]:
)
rank_to_trace_dict[int(rank)] = file_path
else:
-logger.error(
+raise ValueError(
"If the trace file does not have the rank specified in it, then add the following snippet "
'key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple '
"traces files, then each file should have a unique rank value."
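When a trace is missing the rank, the `"distributedInfo": {"rank": 0}` snippet the error message asks for can be injected offline. A sketch (assumes the standard Chrome-trace JSON layout; the helper name is hypothetical):

```python
import gzip
import json
import os
import tempfile

def add_rank(path, rank):
    """Inject {"distributedInfo": {"rank": rank}} into a (possibly gzipped) trace file."""
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as f:
        trace = json.load(f)
    trace.setdefault("distributedInfo", {})["rank"] = rank
    with opener(path, "wt") as f:
        json.dump(trace, f)

# demo on a throwaway trace file
fd, path = tempfile.mkstemp(suffix=".json")
with os.fdopen(fd, "w") as f:
    json.dump({"traceEvents": []}, f)
add_rank(path, 0)
with open(path) as f:
    info = json.load(f)["distributedInfo"]
os.remove(path)
print(info)  # {'rank': 0}
```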