Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions smdebug/core/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,12 +436,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
self.first_process = True
self.logger.info(f"Hook is writing from the hook with pid: {os.getpid()}\n")
else:
if self.first_process is None:
self.logger.warn(
f"Unsupported Distributed Training Strategy Detected. \
Sagemaker-Debugger will only write from one process. \
The process with pid: {os.getpid()} will not be writing any data. \n"
)
self.first_process = False
self.logger.warn(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this log to line 439 so that it is logged only once:
if self.first_process is None:
log
self.first_process = False

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in the latest commit.

f"Unsupported Distributed Training Strategy Detected.\n\
Sagemaker-Debugger will only write from one process.\n\
The process with pid: {os.getpid()} will not be writing any data. \n"
)
Comment on lines -440 to -444
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we move this log to some other location where it is called less often, instead of removing it completely?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed this in the commit I just pushed.

return

if self.save_all_workers is False:
Expand Down Expand Up @@ -546,6 +547,13 @@ def set_mode(self, mode):

def export_collections(self):
num_workers = self._get_num_workers()
if num_workers == 1 and self.first_process is False:
self.logger.warn(
f"Unsupported Distributed Training Strategy Detected. \
Sagemaker-Debugger will only write from one process. \
The process with pid: {os.getpid()} will not be writing any data. \n"
)
return
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a similar log statement here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in the latest commit.

if self.save_all_workers is False:
if self.chief_worker != self.worker:
return
Expand Down