openvinotoolkit · eunwoosh · Oct 11, 2023 · Oct 5, 2023 · Oct 5, 2023 · Oct 6, 2023
@@ -6,6 +6,7 @@
 from typing import Callable, Dict, Tuple
 
 import torch
+import torch.distributed as dist
 
 from otx.algorithms.common.utils.logger import get_logger
 
@@ -51,7 +52,31 @@ def _try_batch_size(self, bs: int) -> Tuple[bool, int]:
             else:
                 raise e
 
-        max_memory_allocated = torch.cuda.max_memory_allocated(device=None)
+        max_memory_allocated = torch.cuda.max_memory_reserved(device=None)
+
+        if dist.is_initialized():  # Aggregate all results and broadcast to all processes
+            rank = dist.get_rank()
+            try_result = torch.tensor([int(cuda_oom), max_memory_allocated], dtype=torch.int64).cuda()
+
+            if rank == 0:
+                try_result_arr = [torch.empty(2, dtype=torch.int64).cuda() for _ in range(dist.get_world_size())]
+                dist.gather(try_result, gather_list=try_result_arr, dst=0)
+            else:
+                dist.gather(try_result, dst=0)
+
+            if rank == 0:
+                try_result_arr = torch.stack(try_result_arr)
+                cuda_oom = torch.any(try_result_arr[:, 0])  # type: ignore
+                max_memory_allocated = torch.max(try_result_arr[:, 1])  # type: ignore
+                total_try_result = torch.tensor([cuda_oom, max_memory_allocated], dtype=torch.int64).cuda()
+            else:
+                total_try_result = torch.empty(2, dtype=torch.int64).cuda()
+
+            dist.broadcast(total_try_result, src=0)
+
+            cuda_oom = total_try_result[0].bool().item()
+            max_memory_allocated = total_try_result[1].item()
+
         if not cuda_oom:
             # Because heapq only supports min heap, use negatized batch size
             self._bs_try_history[bs] = max_memory_allocated

@@ -35,7 +35,7 @@ def mock_train_func(batch_size):
             else:
                 mem_usage = 8500 * batch_size / max_runnable_bs
 
-            self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
+            self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
             return mem_usage
 
         return mock_train_func
@@ -91,7 +91,7 @@ def mock_train_func(batch_size):
                 mem_usage = 9000
             else:
                 mem_usage = 1000
-            self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
+            self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
             return mem_usage
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)
@@ -108,7 +108,7 @@ def mock_train_func(batch_size):
                 mem_usage = 9000
             else:
                 mem_usage = 1000 + batch_size / 1000
-            self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
+            self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
             return mem_usage
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)