From 2de81f569d41d41f28a8cca290ec71d8848e7169 Mon Sep 17 00:00:00 2001
From: ZouJiu1 <1069679911@qq.com>
Date: Fri, 18 Jun 2021 09:19:49 +0800
Subject: [PATCH 1/3] EMA changes for pre-model's batch_size

---
 train.py             | 2 +-
 utils/torch_utils.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/train.py b/train.py
index 3eb866345d47..058dde3471ee 100644
--- a/train.py
+++ b/train.py
@@ -158,7 +158,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     # plot_lr_scheduler(optimizer, scheduler, epochs)

     # EMA
-    ema = ModelEMA(model) if rank in [-1, 0] else None
+    ema = ModelEMA(model, batch_size=batch_size) if rank in [-1, 0] else None

     # Resume
     start_epoch, best_fitness = 0, 0.0
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b690dbe96700..9efd541a05a8 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -283,13 +283,16 @@ class ModelEMA:
     GPU assignment and distributed training wrappers.
     """

-    def __init__(self, model, decay=0.9999, updates=0):
+    # when change batch_size, the decay exponential ramp should multiply by max(64/batch_size, 1)
+    # if not, the weights will update so slowly, maybe don't update anymore
+    # the batch_size of pre-trained model is 64
+    def __init__(self, model, batch_size=64, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
         # if next(model.parameters()).device.type != 'cpu':
         #     self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
-        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
+        self.decay = lambda x: decay * (1 - math.exp(-x / (2000*max(64/batch_size, 1))))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
             p.requires_grad_(False)


From d344501a67e73d138fb0dd693cd3f516104bcb70 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Fri, 18 Jun 2021 16:25:50 +0200
Subject: [PATCH 2/3] Update train.py

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 058dde3471ee..3eb866345d47 100644
--- a/train.py
+++ b/train.py
@@ -158,7 +158,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     # plot_lr_scheduler(optimizer, scheduler, epochs)

     # EMA
-    ema = ModelEMA(model, batch_size=batch_size) if rank in [-1, 0] else None
+    ema = ModelEMA(model) if rank in [-1, 0] else None

     # Resume
     start_epoch, best_fitness = 0, 0.0

From 06c6a3b61f05a8d9781705508562f7271d1bb0e2 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Fri, 18 Jun 2021 16:26:31 +0200
Subject: [PATCH 3/3] Update torch_utils.py

---
 utils/torch_utils.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 9efd541a05a8..b690dbe96700 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -283,16 +283,13 @@ class ModelEMA:
     GPU assignment and distributed training wrappers.
     """

-    # when change batch_size, the decay exponential ramp should multiply by max(64/batch_size, 1)
-    # if not, the weights will update so slowly, maybe don't update anymore
-    # the batch_size of pre-trained model is 64
-    def __init__(self, model, batch_size=64, decay=0.9999, updates=0):
+    def __init__(self, model, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
         # if next(model.parameters()).device.type != 'cpu':
         #     self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
-        self.decay = lambda x: decay * (1 - math.exp(-x / (2000*max(64/batch_size, 1))))  # decay exponential ramp (to help early epochs)
+        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
             p.requires_grad_(False)
