From 43fe681da6436a80c9497a83843a3e6a32f01e11 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 08:11:59 +0200 Subject: [PATCH 01/11] wip --- .../common/sg_loggers/deci_platform_sg_logger.py | 9 +++++++++ .../training_hyperparams/default_train_params.yaml | 2 +- src/super_gradients/training/sg_trainer/sg_trainer.py | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index 4612b373c6..2ffbd19419 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -76,6 +76,15 @@ def upload(self): self._upload_latest_file_starting_with(start_with=TENSORBOARD_EVENTS_PREFIX) self._upload_latest_file_starting_with(start_with=LOGS_PREFIX) + print(">> START UPLOAD BIG FILE") + self.platform_client.save_experiment_file( + file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/deci_lab_export_test_model/average_model.pth" + ) + self.platform_client.save_experiment_file( + file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/deci_lab_export_test_model/average_model2.pth" + ) + print("<<") + @multi_process_safe def _upload_latest_file_starting_with(self, start_with: str): """ diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index 58a59798aa..061f3fc5dc 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -106,7 +106,7 @@ max_train_batches: # For debug- when not None- will break out of inner train lo max_valid_batches: # For debug- when not None- will break out of inner valid loop # (i.e iterating over valid_loader) when reaching this number of batches. -sg_logger: base_sg_logger +sg_logger: deci_platform_sg_logger sg_logger_params: tb_files_user_prompt: False # Asks User for Tensorboard Deletion Prompt launch_tensorboard: False diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 675a4b0222..ae3f601c5f 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1796,6 +1796,7 @@ def evaluate( pbar_start_msg = f"Validation epoch {epoch}" if evaluation_type == EvaluationType.VALIDATION else "Test" progress_bar_data_loader.set_description(pbar_start_msg) + raise RuntimeError("STOPPP IT ALL") with torch.no_grad(): for batch_idx, batch_items in enumerate(progress_bar_data_loader): batch_items = core_utils.tensor_container_to_device(batch_items, device_config.device, non_blocking=True) @@ -1808,6 +1809,7 @@ def evaluate( else: self.phase_callback_handler.on_test_batch_start(context) + raise RuntimeError("STOPPP IT ALL") output = self.net(inputs) context.update_context(preds=output) @@ -1832,7 +1834,6 @@ def evaluate( if evaluation_type == EvaluationType.VALIDATION and self.max_valid_batches is not None and self.max_valid_batches - 1 <= batch_idx: break - # NEED TO COMPUTE METRICS FOR THE FIRST TIME IF PROGRESS VERBOSITY IS NOT SET if not metrics_progress_verbose: # COMPUTE THE RUNNING USER METRICS AND LOSS RUNNING ITEMS. RESULT TUPLE IS THEIR CONCATENATION. From 8155c38c127415bc8fcc928a1b07e282536d37b6 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 10:15:30 +0200 Subject: [PATCH 02/11] wip --- .../common/sg_loggers/deci_platform_sg_logger.py | 4 +--- src/super_gradients/training/sg_trainer/sg_trainer.py | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index 2ffbd19419..2c7544d868 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -80,9 +80,7 @@ def upload(self): self.platform_client.save_experiment_file( file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/deci_lab_export_test_model/average_model.pth" ) - self.platform_client.save_experiment_file( - file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/deci_lab_export_test_model/average_model2.pth" - ) + self.platform_client.save_experiment_file(file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/bigzipfile.zip") print("<<") @multi_process_safe diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index ae3f601c5f..f339c20f1f 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -470,6 +470,8 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple: if not self.ddp_silent_mode: self.sg_logger.upload() + raise ValueError("Upload value is bad") + raise ValueError("Upload value is very bad") self.train_monitored_values = sg_trainer_utils.update_monitored_values_dict( monitored_values_dict=self.train_monitored_values, new_values_dict=pbar_message_dict From 6c53d16a7b6f52f4a712f479064d57fc878c462e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 11:53:42 +0200 Subject: [PATCH 03/11] wip --- src/super_gradients/common/sg_loggers/base_sg_logger.py | 6 ++++++ .../common/sg_loggers/deci_platform_sg_logger.py | 7 ------- src/super_gradients/training/sg_trainer/sg_trainer.py | 6 +----- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/super_gradients/common/sg_loggers/base_sg_logger.py b/src/super_gradients/common/sg_loggers/base_sg_logger.py index f9e1335e62..4846bf6555 100644 --- a/src/super_gradients/common/sg_loggers/base_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/base_sg_logger.py @@ -249,6 +249,9 @@ def add_file(self, file_name: str = None): @multi_process_safe def upload(self): + """Upload the local tensorboard and log files to remote system.""" + self.flush() + if self.save_tensorboard_remote: self.model_checkpoints_data_interface.save_remote_tensorboard_event_files(self.experiment_name, self._local_dir) @@ -259,9 +262,12 @@ def upload(self): @multi_process_safe def flush(self): self.tensorboard_writer.flush() + ConsoleSink.flush() @multi_process_safe def close(self): + self.upload() + if self.system_monitor is not None: self.system_monitor.close() logger.info("[CLEANUP] - Successfully stopped system monitoring process") diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index a11c4b9847..c7729704e4 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -78,13 +78,6 @@ def upload(self): self._upload_latest_file_starting_with(start_with=CONSOLE_LOGS_PREFIX) self._upload_folder_files(folder_name=".hydra") - print(">> START UPLOAD BIG FILE") - self.platform_client.save_experiment_file( - file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/deci_lab_export_test_model/average_model.pth" - ) - self.platform_client.save_experiment_file(file_path="/home/louis.dupont/PycharmProjects/super-gradients/checkpoints/bigzipfile.zip") - print("<<") - @multi_process_safe def _upload_latest_file_starting_with(self, start_with: str): """ diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index f339c20f1f..1c6083598d 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -468,11 +468,6 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple: ): break - if not self.ddp_silent_mode: - self.sg_logger.upload() - raise ValueError("Upload value is bad") - raise ValueError("Upload value is very bad") - self.train_monitored_values = sg_trainer_utils.update_monitored_values_dict( monitored_values_dict=self.train_monitored_values, new_values_dict=pbar_message_dict ) @@ -1317,6 +1312,7 @@ def forward(self, inputs, targets): if not self.ddp_silent_mode: # SAVING AND LOGGING OCCURS ONLY IN THE MAIN PROCESS (IN CASES THERE ARE SEVERAL PROCESSES - DDP) self._write_to_disk_operations(train_metrics_tuple, validation_results_tuple, inf_time, epoch, context) + self.sg_logger.upload() # Evaluating the average model and removing snapshot averaging file if training is completed if self.training_params.average_best_models: From 3f0d294c75374a19ccbfc4805117d331acce5786 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 11:55:19 +0200 Subject: [PATCH 04/11] remove raise to debug --- src/super_gradients/training/sg_trainer/sg_trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 1c6083598d..e1c4139b2c 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1794,7 +1794,6 @@ def evaluate( pbar_start_msg = f"Validation epoch {epoch}" if evaluation_type == EvaluationType.VALIDATION else "Test" progress_bar_data_loader.set_description(pbar_start_msg) - raise RuntimeError("STOPPP IT ALL") with torch.no_grad(): for batch_idx, batch_items in enumerate(progress_bar_data_loader): batch_items = core_utils.tensor_container_to_device(batch_items, device_config.device, non_blocking=True) @@ -1807,7 +1806,6 @@ def evaluate( else: self.phase_callback_handler.on_test_batch_start(context) - raise RuntimeError("STOPPP IT ALL") output = self.net(inputs) context.update_context(preds=output) @@ -1832,6 +1830,7 @@ def evaluate( if evaluation_type == EvaluationType.VALIDATION and self.max_valid_batches is not None and self.max_valid_batches - 1 <= batch_idx: break + # NEED TO COMPUTE METRICS FOR THE FIRST TIME IF PROGRESS VERBOSITY IS NOT SET if not metrics_progress_verbose: # COMPUTE THE RUNNING USER METRICS AND LOSS RUNNING ITEMS. RESULT TUPLE IS THEIR CONCATENATION. From b269f1fd9f940290bc9ffcd03f650916bfd0187f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 12:00:09 +0200 Subject: [PATCH 05/11] fix --- src/super_gradients/common/sg_loggers/base_sg_logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/super_gradients/common/sg_loggers/base_sg_logger.py b/src/super_gradients/common/sg_loggers/base_sg_logger.py index 4846bf6555..ad3fc92c88 100644 --- a/src/super_gradients/common/sg_loggers/base_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/base_sg_logger.py @@ -271,6 +271,7 @@ def close(self): if self.system_monitor is not None: self.system_monitor.close() logger.info("[CLEANUP] - Successfully stopped system monitoring process") + self.tensorboard_writer.close() if self.tensor_board_process is not None: try: From 9a64cc17c92ecc39901cef15a37a30b6e256c797 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 12:01:00 +0200 Subject: [PATCH 06/11] undo unwanted change --- .../recipes/training_hyperparams/default_train_params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index 061f3fc5dc..58a59798aa 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -106,7 +106,7 @@ max_train_batches: # For debug- when not None- will break out of inner train lo max_valid_batches: # For debug- when not None- will break out of inner valid loop # (i.e iterating over valid_loader) when reaching this number of batches. -sg_logger: deci_platform_sg_logger +sg_logger: base_sg_logger sg_logger_params: tb_files_user_prompt: False # Asks User for Tensorboard Deletion Prompt launch_tensorboard: False From 230cde011c5ffeae70fed684564a3784db513a74 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 12:18:12 +0200 Subject: [PATCH 07/11] improve display of experiment_log epoch index --- src/super_gradients/common/sg_loggers/base_sg_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/common/sg_loggers/base_sg_logger.py b/src/super_gradients/common/sg_loggers/base_sg_logger.py index ad3fc92c88..b00b64f32e 100644 --- a/src/super_gradients/common/sg_loggers/base_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/base_sg_logger.py @@ -169,7 +169,7 @@ def add_scalars(self, tag_scalar_dict: dict, global_step: int = None): self.tensorboard_writer.flush() # WRITE THE EPOCH RESULTS TO LOG FILE - log_line = f"\nEpoch ({global_step}/{self.max_global_steps}) - " + log_line = f"\nEpoch {global_step} ({global_step+1}/{self.max_global_steps}) - " for tag, value in tag_scalar_dict.items(): if isinstance(value, torch.Tensor): value = value.item() From 53d66eb1ec293d7ebd1fbf8b93470d6e6ee4097f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 12:27:18 +0200 Subject: [PATCH 08/11] fix --- .../common/sg_loggers/deci_platform_sg_logger.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index c7729704e4..8a8259af92 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -5,6 +5,7 @@ from super_gradients.common.sg_loggers.base_sg_logger import BaseSGLogger, EXPERIMENT_LOGS_PREFIX, LOGGER_LOGS_PREFIX, CONSOLE_LOGS_PREFIX from super_gradients.common.environment.ddp_utils import multi_process_safe from super_gradients.common.plugins.deci_client import DeciClient +from contextlib import redirect_stdout logger = get_logger(__name__) @@ -91,8 +92,7 @@ def _upload_latest_file_starting_with(self, start_with: str): ] most_recent_file_path = max(files_path, key=os.path.getctime) - self.platform_client.save_experiment_file(file_path=most_recent_file_path) - logger.info(f"File saved to Deci platform: {most_recent_file_path}") + self._save_save_experiment_file(file_path=most_recent_file_path) @multi_process_safe def _upload_folder_files(self, folder_name: str): @@ -107,5 +107,9 @@ def _upload_folder_files(self, folder_name: str): return for file in os.listdir(folder_path): - self.platform_client.save_experiment_file(file_path=f"{folder_path}/{file}") - logger.info(f"File saved to Deci platform: {folder_path}/{file}") + self._save_save_experiment_file(file_path=f"{folder_path}/{file}") + + def _save_save_experiment_file(self, file_path: str): + with redirect_stdout(None): # Workaround until platform_client removes prints from save_experiment_file. + self.platform_client.save_experiment_file(file_path=file_path) + logger.info(f"File saved to Deci platform: {file_path}") From ee20dc15a6e12021edc8d5ab2f279248df32364e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 13:38:16 +0200 Subject: [PATCH 09/11] redirect to DEBUG --- .../sg_loggers/deci_platform_sg_logger.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index 8a8259af92..35edd5515b 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -1,4 +1,6 @@ import os +import io +from contextlib import contextmanager from typing import Optional from super_gradients.common.abstractions.abstract_logger import get_logger @@ -110,6 +112,20 @@ def _upload_folder_files(self, folder_name: str): self._save_save_experiment_file(file_path=f"{folder_path}/{file}") def _save_save_experiment_file(self, file_path: str): - with redirect_stdout(None): # Workaround until platform_client removes prints from save_experiment_file. + + with log_stdout(): # Workaround until platform_client removes prints from save_experiment_file. self.platform_client.save_experiment_file(file_path=file_path) + logger.info(f"File saved to Deci platform: {file_path}") + + +@contextmanager +def log_stdout(): + """Redirect stdout to DEBUG.""" + buffer = io.StringIO() + with redirect_stdout(buffer): + yield + + redirected_str = buffer.getvalue() + if redirected_str: + logger.debug(msg=redirected_str) From 5b453ff82eb02ef89d3afce4194476dfa9787d9c Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 13:42:06 +0200 Subject: [PATCH 10/11] update comment --- .../common/sg_loggers/deci_platform_sg_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index 35edd5515b..12a9b90be7 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -113,7 +113,7 @@ def _upload_folder_files(self, folder_name: str): def _save_save_experiment_file(self, file_path: str): - with log_stdout(): # Workaround until platform_client removes prints from save_experiment_file. + with log_stdout(): # TODO: remove when platform_client remove prints from save_experiment_file self.platform_client.save_experiment_file(file_path=file_path) logger.info(f"File saved to Deci platform: {file_path}") From 83b3116fa2e4abe4e0468b6533e07b4b617d3ce3 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 31 Jan 2023 14:25:06 +0200 Subject: [PATCH 11/11] fix typo in name --- .../common/sg_loggers/deci_platform_sg_logger.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py index 12a9b90be7..ebd9c99f18 100644 --- a/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/deci_platform_sg_logger.py @@ -94,7 +94,7 @@ def _upload_latest_file_starting_with(self, start_with: str): ] most_recent_file_path = max(files_path, key=os.path.getctime) - self._save_save_experiment_file(file_path=most_recent_file_path) + self._save_experiment_file(file_path=most_recent_file_path) @multi_process_safe def _upload_folder_files(self, folder_name: str): @@ -109,13 +109,11 @@ def _upload_folder_files(self, folder_name: str): return for file in os.listdir(folder_path): - self._save_save_experiment_file(file_path=f"{folder_path}/{file}") - - def _save_save_experiment_file(self, file_path: str): + self._save_experiment_file(file_path=f"{folder_path}/{file}") + def _save_experiment_file(self, file_path: str): with log_stdout(): # TODO: remove when platform_client remove prints from save_experiment_file self.platform_client.save_experiment_file(file_path=file_path) - logger.info(f"File saved to Deci platform: {file_path}")