From 2de81f569d41d41f28a8cca290ec71d8848e7169 Mon Sep 17 00:00:00 2001
From: ZouJiu1 <1069679911@qq.com>
Date: Fri, 18 Jun 2021 09:19:49 +0800
Subject: [PATCH 1/3] EMA changes for pre-model's batch_size

---
 train.py             | 2 +-
 utils/torch_utils.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/train.py b/train.py
index 3eb866345d47..058dde3471ee 100644
--- a/train.py
+++ b/train.py
@@ -158,7 +158,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     # plot_lr_scheduler(optimizer, scheduler, epochs)

     # EMA
-    ema = ModelEMA(model) if rank in [-1, 0] else None
+    ema = ModelEMA(model, batch_size=batch_size) if rank in [-1, 0] else None

     # Resume
     start_epoch, best_fitness = 0, 0.0
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b690dbe96700..9efd541a05a8 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -283,13 +283,16 @@ class ModelEMA:
     GPU assignment and distributed training wrappers.
     """

-    def __init__(self, model, decay=0.9999, updates=0):
+    # when change batch_size, the decay exponential ramp should multiply by max(64/batch_size, 1)
+    # if not, the weights will update so slowly, maybe don't update anymore
+    # the batch_size of pre-trained model is 64
+    def __init__(self, model, batch_size=64, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
         # if next(model.parameters()).device.type != 'cpu':
         #     self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
-        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
+        self.decay = lambda x: decay * (1 - math.exp(-x / (2000*max(64/batch_size, 1))))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
             p.requires_grad_(False)


From d344501a67e73d138fb0dd693cd3f516104bcb70 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Fri, 18 Jun 2021 16:25:50 +0200
Subject: [PATCH 2/3] Update train.py

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 058dde3471ee..3eb866345d47 100644
--- a/train.py
+++ b/train.py
@@ -158,7 +158,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     # plot_lr_scheduler(optimizer, scheduler, epochs)

     # EMA
-    ema = ModelEMA(model, batch_size=batch_size) if rank in [-1, 0] else None
+    ema = ModelEMA(model) if rank in [-1, 0] else None

     # Resume
     start_epoch, best_fitness = 0, 0.0

From 06c6a3b61f05a8d9781705508562f7271d1bb0e2 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Fri, 18 Jun 2021 16:26:31 +0200
Subject: [PATCH 3/3] Update torch_utils.py

---
 utils/torch_utils.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 9efd541a05a8..b690dbe96700 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -283,16 +283,13 @@ class ModelEMA:
     GPU assignment and distributed training wrappers.
     """

-    # when change batch_size, the decay exponential ramp should multiply by max(64/batch_size, 1)
-    # if not, the weights will update so slowly, maybe don't update anymore
-    # the batch_size of pre-trained model is 64
-    def __init__(self, model, batch_size=64, decay=0.9999, updates=0):
+    def __init__(self, model, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
         # if next(model.parameters()).device.type != 'cpu':
         #     self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
-        self.decay = lambda x: decay * (1 - math.exp(-x / (2000*max(64/batch_size, 1))))  # decay exponential ramp (to help early epochs)
+        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
             p.requires_grad_(False)
