diff --git a/vedacore/hooks/__init__.py b/vedacore/hooks/__init__.py
index c5af217..d11ed90 100644
--- a/vedacore/hooks/__init__.py
+++ b/vedacore/hooks/__init__.py
@@ -5,8 +5,10 @@
 from .lr_scheduler import FixedLrSchedulerHook
 from .optimizer import OptimizerHook
 from .snapshot import SnapshotHook
+from .sampler_seed import DistSamplerSeedHook
+from .worker_init import WorkerInitHook

 __all__ = [
     'BaseHook', 'EvalHook', 'HookPool', 'LoggerHook', 'FixedLrSchedulerHook',
-    'OptimizerHook', 'SnapshotHook'
+    'OptimizerHook', 'SnapshotHook', 'DistSamplerSeedHook', 'WorkerInitHook'
 ]
diff --git a/vedacore/hooks/sampler_seed.py b/vedacore/hooks/sampler_seed.py
new file mode 100644
index 0000000..bdfc081
--- /dev/null
+++ b/vedacore/hooks/sampler_seed.py
@@ -0,0 +1,24 @@
+# Copyright (c) Open-MMLab. All rights reserved.
+from vedacore.misc import registry
+from .base_hook import BaseHook
+
+
+@registry.register_module('hook')
+class DistSamplerSeedHook(BaseHook):
+    """Hook that re-seeds the data-loading sampler at every epoch.
+    It is only useful for distributed training in conjunction with
+    :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same
+    purpose with :obj:`IterLoader`.
+    """
+
+    def before_train_epoch(self, looper):
+        if hasattr(looper.train_dataloader.sampler, 'set_epoch'):
+            # samplers such as `DistributedSampler` expose `set_epoch` directly
+            looper.train_dataloader.sampler.set_epoch(looper.epoch)
+        elif hasattr(looper.train_dataloader.batch_sampler.sampler, 'set_epoch'):
+            # PyTorch's `BatchSampler` wraps the underlying sampler as an attribute
+            looper.train_dataloader.batch_sampler.sampler.set_epoch(looper.epoch)
+
+    @property
+    def modes(self):
+        return ['train']
diff --git a/vedacore/hooks/worker_init.py b/vedacore/hooks/worker_init.py
new file mode 100644
index 0000000..5156796
--- /dev/null
+++ b/vedacore/hooks/worker_init.py
@@ -0,0 +1,17 @@
+from vedacore.misc import registry
+from .base_hook import BaseHook
+
+
+@registry.register_module('hook')
+class WorkerInitHook(BaseHook):
+    """Hook that forwards the current epoch to the worker init function.
+    """
+
+    def before_train_epoch(self, looper):
+        worker_init_fn = looper.train_dataloader.worker_init_fn
+        if worker_init_fn is not None and hasattr(worker_init_fn, 'set_epoch'):
+            worker_init_fn.set_epoch(looper.epoch)
+
+    @property
+    def modes(self):
+        return ['train']
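Aside (not part of the patch): both hooks exist because the shuffle order of the distributed samplers and the per-worker seeds produced by `WorkerInit` only change when something calls `set_epoch()` before each epoch; otherwise every epoch replays the same ordering. A minimal standalone sketch of that behaviour, using only `torch.utils.data.DistributedSampler` and a toy dataset (the looper/hook plumbing is assumed, not shown):

```python
from torch.utils.data import DistributedSampler

dataset = list(range(8))  # stand-in dataset; the sampler only needs len()
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True,
                             seed=42)

epoch0_a = list(sampler)  # index order for epoch 0
epoch0_b = list(sampler)  # identical: the epoch was never advanced
sampler.set_epoch(1)      # the call DistSamplerSeedHook issues every epoch
epoch1 = list(sampler)    # re-shuffled for epoch 1

assert epoch0_a == epoch0_b
print(epoch0_a, epoch1)   # almost surely two different orderings
```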
""" logger = logging.getLogger(name) + logger.propagate = False if name in logger_initialized: return logger # handle hierarchical names @@ -36,11 +37,6 @@ def get_logger(name, log_file=None, log_level=logging.INFO): if name.startswith(logger_name): return logger - if logger.parent is not None: - logger.parent.handlers.clear() - else: - logger.handlers.clear() - stream_handler = logging.StreamHandler() handlers = [stream_handler] diff --git a/vedadet/assembler/trainval.py b/vedadet/assembler/trainval.py index 5f4df7c..1ce516a 100644 --- a/vedadet/assembler/trainval.py +++ b/vedadet/assembler/trainval.py @@ -64,6 +64,13 @@ def trainval(cfg, distributed, logger): looper = EpochBasedLooper(cfg.modes, dataloaders, engines, hook_pool, logger, cfg.workdir) + + if isinstance(looper, EpochBasedLooper): + looper.hook_pool.register_hook(dict(typename='WorkerInitHook')) + if distributed: + looper.hook_pool.register_hook( + dict(typename='DistSamplerSeedHook')) + if 'weights' in cfg: looper.load_weights(**cfg.weights) if 'train' in cfg.modes: diff --git a/vedadet/datasets/builder.py b/vedadet/datasets/builder.py index e69bb09..8991a52 100644 --- a/vedadet/datasets/builder.py +++ b/vedadet/datasets/builder.py @@ -93,10 +93,10 @@ def build_dataloader(dataset, # that images on each GPU are in the same group if shuffle: sampler = DistributedGroupSampler(dataset, samples_per_gpu, - world_size, rank) + world_size, rank, seed=seed) else: sampler = DistributedSampler( - dataset, world_size, rank, shuffle=False) + dataset, world_size, rank, shuffle=False, seed=seed) batch_size = samples_per_gpu num_workers = workers_per_gpu else: @@ -104,9 +104,8 @@ def build_dataloader(dataset, batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu - init_fn = partial( - worker_init_fn, num_workers=num_workers, rank=rank, - seed=seed) if seed is not None else None + init_fn = WorkerInit(num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None data_loader = DataLoader( dataset, @@ -121,9 +120,20 @@ def build_dataloader(dataset, return data_loader -def worker_init_fn(worker_id, num_workers, rank, seed): - # The seed of each worker equals to - # num_worker * rank + worker_id + user_seed - worker_seed = num_workers * rank + worker_id + seed - np.random.seed(worker_seed) - random.seed(worker_seed) +class WorkerInit: + def __init__(self, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + self.num_workers = num_workers + self.rank = rank + self.seed = seed if seed is not None else 0 + self.epoch = 0 + + def set_epoch(self, epoch): + self.epoch = epoch + + def __call__(self, worker_id): + worker_seed = (self.num_workers * self.rank + worker_id + self.seed + + self.epoch) + np.random.seed(worker_seed) + random.seed(worker_seed) diff --git a/vedadet/datasets/samplers/distributed_sampler.py b/vedadet/datasets/samplers/distributed_sampler.py index c0f0cc1..2b5bdca 100644 --- a/vedadet/datasets/samplers/distributed_sampler.py +++ b/vedadet/datasets/samplers/distributed_sampler.py @@ -6,15 +6,17 @@ class DistributedSampler(_DistributedSampler): - def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, + seed=0): super().__init__(dataset, num_replicas=num_replicas, rank=rank) self.shuffle = shuffle + self.seed = seed if seed is not None else 0 def __iter__(self): # deterministically shuffle based on epoch if self.shuffle: g = torch.Generator() 
diff --git a/vedadet/datasets/samplers/group_sampler.py b/vedadet/datasets/samplers/group_sampler.py
index ce1ec60..f84f2c6 100644
--- a/vedadet/datasets/samplers/group_sampler.py
+++ b/vedadet/datasets/samplers/group_sampler.py
@@ -73,7 +73,8 @@ def __init__(self,
                  dataset,
                  samples_per_gpu=1,
                  num_replicas=None,
-                 rank=None):
+                 rank=None,
+                 seed=0):
         _rank, _num_replicas = get_dist_info()
         if num_replicas is None:
             num_replicas = _num_replicas
@@ -84,6 +85,7 @@ def __init__(self,
         self.num_replicas = num_replicas
         self.rank = rank
         self.epoch = 0
+        self.seed = seed if seed is not None else 0

         assert hasattr(self.dataset, 'flag')
         self.flag = self.dataset.flag
@@ -99,7 +101,7 @@ def __init__(self,
     def __iter__(self):
         # deterministically shuffle based on epoch
         g = torch.Generator()
-        g.manual_seed(self.epoch)
+        g.manual_seed(self.epoch + self.seed)

         indices = []
         for i, size in enumerate(self.group_sizes):
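Closing aside (not part of the patch): with `WorkerInitHook` advancing `WorkerInit.epoch`, every dataloader worker re-seeds `numpy` and `random` from `num_workers * rank + worker_id + seed + epoch`, so augmentation randomness differs across workers, ranks and epochs yet stays reproducible for a fixed user seed. Because `worker_id` and `epoch` enter the sum symmetrically, worker `w` at epoch `e` shares a seed with worker `w - 1` at epoch `e + 1`; that is likely harmless, but worth knowing. A tiny worked example of the formula (the helper name is illustrative, not from the repo):

```python
def worker_seed(num_workers, rank, worker_id, seed, epoch):
    # mirrors WorkerInit.__call__ in vedadet/datasets/builder.py
    return num_workers * rank + worker_id + seed + epoch


# 2 workers per process, rank 1, first worker, user seed 42, epoch index 2:
assert worker_seed(num_workers=2, rank=1, worker_id=0, seed=42, epoch=2) == 46
```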