Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry-pick] fix a bug when device info does not exist in JSON format (#1166) #1176

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions visualdl/component/profiler/parser/event_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,16 @@ def __init__(self, data):
def parse_json(self, json_data):
self.schema_version = json_data['schemaVersion']
self.span_idx = json_data['span_indx']
self.device_infos = {
device_info['id']: device_info
for device_info in json_data['deviceProperties']
}
try:
self.device_infos = {
device_info['id']: device_info
for device_info in json_data['deviceProperties']
}
except Exception:
print(
"paddlepaddle-gpu version is needed to get GPU device informations."
)
self.device_infos = {}
hostnodes = []
runtimenodes = []
devicenodes = []
Expand Down
2 changes: 2 additions & 0 deletions visualdl/component/profiler/profiler_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,8 @@ def get_distributed_info(self):
data = []
for profile_data in self.profile_datas:
device_infos = profile_data.device_infos
if not device_infos:
return data
gpu_id = int(next(iter(profile_data.gpu_ids)))
data.append({
'worker_name':
Expand Down
8 changes: 6 additions & 2 deletions visualdl/component/profiler/profiler_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# =======================================================================
import os
import re
from threading import Lock
from threading import Thread

import packaging.version
Expand All @@ -28,6 +29,7 @@
from visualdl.io import bfile

_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")
_lock = Lock()


def is_VDLProfiler_file(path):
Expand Down Expand Up @@ -117,8 +119,10 @@ def runs(self, update=True):
self.run_managers[run] = RunManager(run)
self.run_managers[run].set_all_filenames(filenames)
for filename in filenames:
if self.run_managers[run].has_handled(filename):
continue
with _lock: # we add this to prevent parallel requests for handling a file multiple times
if self.run_managers[run].has_handled(filename):
continue
self.run_managers[run].handled_filenames.add(filename)
self._read_data(run, filename)
return list(self.walks.keys())

Expand Down
4 changes: 4 additions & 0 deletions visualdl/component/profiler/profiler_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,17 @@ def distributed_steps(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
if distributed_profiler_data is None:
return
return distributed_profiler_data.get_distributed_steps()

@result()
def distributed_histogram(self, run, worker, span, step, time_unit='ms'):
    """Return the distributed histogram for ``step`` within ``span`` of ``run``.

    The run manager for ``run`` is fetched from the reader and asked for the
    distributed profiler data of ``span``. If that data is ``None`` the
    method returns ``None``; otherwise the call is delegated to the data's
    ``get_distributed_histogram(step, time_unit)``.

    ``worker`` is accepted but not used by this method's body.
    """
    profiler_data = self._reader.get_run_manager(
        run).get_distributed_profiler_data(span)
    if profiler_data is not None:
        return profiler_data.get_distributed_histogram(step, time_unit)
    # No distributed data for this span: nothing to report.
    return None

Expand Down
3 changes: 0 additions & 3 deletions visualdl/component/profiler/run_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,8 @@ def _parse_file(self, worker_name, result):
return

def join(self):
if self.has_join:
return
for thread in self.threads.values():
thread.join()
self.has_join = True
distributed_profiler_data = defaultdict(list)
for worker_name, span_data in self.profiler_data.items():
for span_idx, profiler_data in span_data.items():
Expand Down