Skip to content

Commit

Permalink
Work around crash when profiling multi-process/multi-GPU application (#376)
Browse files Browse the repository at this point in the history

* Fix crash in multi-GPU scenario

Exclude -o option when invoking rocprof so that each rocprof process
writes to a different .csv file. Concatenate into a single .csv file
when finished.

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Only combine csv files when using rocprofv2

rocprofv1 does not have separate csv files

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Fix indices in combined CSV file

Use ignore_index flag to ensure there are no duplicate indices.

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Fix Dispatch_ID column and remove unnamed column

- Pandas was inserting an unnamed column (the index column)
- Overwrite the Dispatch_ID column so that every row is unique, starting at 0
- Remove fixup_rocprofv2_dispatch_ids, as it is no longer needed

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Fix code formatting

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Fix code formatting (for real this time)

Signed-off-by: benrichard-amd <ben.richard@amd.com>

---------

Signed-off-by: benrichard-amd <ben.richard@amd.com>
  • Loading branch information
benrichard-amd authored Jun 12, 2024
1 parent 6d24a07 commit 4d7d624
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 20 deletions.
8 changes: 1 addition & 7 deletions src/omniperf_profile/profiler_rocprof_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
demarcate,
console_log,
replace_timestamps,
fixup_rocprofv2_dispatch_ids,
)


Expand All @@ -48,10 +47,7 @@ def get_profiler_options(self, fname):
# v2 requires output directory argument
"-d",
self.get_args().path + "/" + "out",
# v2 does not require csv extension
"-o",
fbase,
# v2 doen not require quotes on cmd
# v2 does not require quotes on cmd
app_cmd,
]
return args
Expand Down Expand Up @@ -87,7 +83,5 @@ def post_processing(self):
if self.ready_to_profile:
# Manually join each pmc_perf*.csv output
self.join_prof()
# Correct dispatch ids
fixup_rocprofv2_dispatch_ids(self.get_args().path)
# Replace timestamp data to solve a known rocprof bug
replace_timestamps(self.get_args().path)
29 changes: 16 additions & 13 deletions src/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,22 @@ def run_prof(fname, profiler_options, workload_dir, mspec, loglevel):
console_error(output, exit=False)
console_error("Profiling execution failed.")

if rocprof_cmd.endswith("v2"):
# rocprofv2 has separate csv files for each process
results_files = glob.glob(workload_dir + "/out/pmc_1/results_*.csv")

# Combine results into single CSV file
combined_results = pd.concat(
[pd.read_csv(f) for f in results_files], ignore_index=True
)

# Overwrite column to ensure unique IDs.
combined_results["Dispatch_ID"] = range(0, len(combined_results))

combined_results.to_csv(
workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False
)

if new_env:
# flatten tcc for applicable mi300 input
f = path(workload_dir + "/out/pmc_1/results_" + fbase + ".csv")
Expand Down Expand Up @@ -655,16 +671,3 @@ def set_locale_encoding():
exit=False,
)
console_error(error)


def fixup_rocprofv2_dispatch_ids(workload_dir):
    """Shift the ``Dispatch_ID`` column down by one in the workload's CSV files.

    Workaround for rocprofv2 using 1-based dispatch indices. Each affected
    file is read, its ``Dispatch_ID`` column is decremented by 1, and the
    file is rewritten in place without a pandas index column.

    Args:
        workload_dir: Directory holding ``pmc_perf.csv`` and any
            ``*LEVEL*.csv`` files.

    Raises:
        FileNotFoundError: If ``pmc_perf.csv`` is not present in
            ``workload_dir`` (raised by ``pd.read_csv``).
        KeyError: If a processed file has no ``Dispatch_ID`` column.
    """
    # pmc_perf.csv is always processed first; the *LEVEL*.csv files are
    # whatever glob happens to find (possibly none).
    csv_files = [workload_dir + "/pmc_perf.csv"]
    csv_files.extend(glob.glob(workload_dir + "/*LEVEL*.csv"))

    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        df["Dispatch_ID"] -= 1
        # index=False keeps pandas from adding an unnamed index column.
        df.to_csv(csv_file, index=False)

0 comments on commit 4d7d624

Please sign in to comment.