Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug where auto batch size doesn't consider distributed training #2533

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Dict, Tuple

import torch
import torch.distributed as dist

from otx.algorithms.common.utils.logger import get_logger

Expand Down Expand Up @@ -51,7 +52,31 @@ def _try_batch_size(self, bs: int) -> Tuple[bool, int]:
else:
raise e

max_memory_allocated = torch.cuda.max_memory_allocated(device=None)
max_memory_allocated = torch.cuda.max_memory_reserved(device=None)
sungmanc marked this conversation as resolved.
Show resolved Hide resolved

if dist.is_initialized(): # Aggregate all results and broadcast to all processes
rank = dist.get_rank()
try_result = torch.tensor([int(cuda_oom), max_memory_allocated], dtype=torch.int64).cuda()

if rank == 0:
try_result_arr = [torch.empty(2, dtype=torch.int64).cuda() for _ in range(dist.get_world_size())]
dist.gather(try_result, gather_list=try_result_arr, dst=0)
else:
dist.gather(try_result, dst=0)

if rank == 0:
try_result_arr = torch.stack(try_result_arr)
cuda_oom = torch.any(try_result_arr[:, 0]) # type: ignore
max_memory_allocated = torch.max(try_result_arr[:, 1]) # type: ignore
total_try_result = torch.tensor([cuda_oom, max_memory_allocated], dtype=torch.int64).cuda()
else:
total_try_result = torch.empty(2, dtype=torch.int64).cuda()

dist.broadcast(total_try_result, src=0)

cuda_oom = total_try_result[0].bool().item()
max_memory_allocated = total_try_result[1].item()

if not cuda_oom:
# Because heapq only supports a min-heap, use the negated batch size
self._bs_try_history[bs] = max_memory_allocated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def mock_train_func(batch_size):
else:
mem_usage = 8500 * batch_size / max_runnable_bs

self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
return mem_usage

return mock_train_func
Expand Down Expand Up @@ -91,7 +91,7 @@ def mock_train_func(batch_size):
mem_usage = 9000
else:
mem_usage = 1000
self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
return mem_usage

bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)
Expand All @@ -108,7 +108,7 @@ def mock_train_func(batch_size):
mem_usage = 9000
else:
mem_usage = 1000 + batch_size / 1000
self.mock_torch.cuda.max_memory_allocated.return_value = mem_usage
self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
return mem_usage

bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)
Expand Down