 diff --git a/megatron/arguments.py b/megatron/arguments.py
-index b35af1d..2a55699 100644
+index b35af1df..4c36694f 100644
 --- a/megatron/arguments.py
 +++ b/megatron/arguments.py
 @@ -20,6 +20,9 @@ import os
@@ -22,9 +22,21 @@ index b35af1d..2a55699 100644
      # Custom arguments.
      if extra_args_provider is not None:
          parser = extra_args_provider(parser)
-
+@@ -232,7 +238,11 @@ def parse_args(extra_args_provider=None, defaults={},
+         assert args.checkpoint_activations, \
+             'for distribute-checkpointed-activations to work you '\
+             'need to enable checkpoint-activations'
+-
++    # if fmoe_num_experts is not specified,
++    # we are using lower version of megatron,
++    # copy num_experts to fmoe_num_experts
++    if not hasattr(args, 'fmoe_num_experts'):
++        args.fmoe_num_experts = args.num_experts
+     _print_args(args)
+     return args
+
 diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py
-index 1251066..32afb2f 100644
+index 12510662..32afb2fa 100644
 --- a/megatron/data/indexed_dataset.py
 +++ b/megatron/data/indexed_dataset.py
 @@ -95,7 +95,7 @@ dtypes = {
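# --- Illustration (not part of the patch) ----------------------------------
# The hunk above extends parse_args() in the patched arguments.py so that, on
# older Megatron versions where only num_experts is registered and
# fmoe_num_experts is never set, the value is copied across before
# _print_args(args). A minimal, self-contained sketch of that fallback;
# everything except the num_experts / fmoe_num_experts names is an
# illustrative assumption rather than code from the patch.
import argparse

def parse_args_sketch(argv=None):
    parser = argparse.ArgumentParser(description='fmoe_num_experts fallback sketch')
    parser.add_argument('--num-experts', type=int, default=None)
    args = parser.parse_args(argv)
    # Older Megatron: fmoe_num_experts was never defined, so reuse num_experts
    # and let downstream FastMoE code rely on args.fmoe_num_experts unconditionally.
    if not hasattr(args, 'fmoe_num_experts'):
        args.fmoe_num_experts = args.num_experts
    return args

# Usage: parse_args_sketch(['--num-experts', '4']).fmoe_num_experts == 4
# ----------------------------------------------------------------------------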
@@ -46,7 +58,7 @@ index 1251066..32afb2f 100644
  }

 diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
-index 823a51f..32f4b2e 100644
+index 823a51f4..32f4b2e1 100644
 --- a/megatron/optimizer/__init__.py
 +++ b/megatron/optimizer/__init__.py
 @@ -69,8 +69,10 @@ def get_megatron_optimizer(model):
@@ -63,7 +75,7 @@ index 823a51f..32f4b2e 100644
      if args.fp16 or args.bf16:

 diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py
-index 036a1d4..81d5bd9 100644
+index 036a1d4c..81d5bd96 100644
 --- a/megatron/optimizer/clip_grads.py
 +++ b/megatron/optimizer/clip_grads.py
 @@ -54,17 +54,23 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
@@ -123,7 +135,7 @@ index 036a1d4..81d5bd9 100644
              grad_norm = torch.norm(grad, norm_type)
              total_norm += grad_norm ** norm_type
 diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
-index 368f587..080b06f 100644
+index 368f5875..080b06f0 100644
 --- a/megatron/optimizer/optimizer.py
 +++ b/megatron/optimizer/optimizer.py
 @@ -250,6 +250,9 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
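# --- Illustration (not part of the patch) ----------------------------------
# The clip_grads.py hunk above lands inside clip_grad_norm_fp32, whose generic
# path accumulates grad_norm ** norm_type per gradient and takes the
# norm_type-th root at the end: total_norm = (sum_i ||g_i||_p ** p) ** (1/p).
# A minimal sketch of that accumulation pattern, assuming plain local tensors
# and omitting Megatron's model-parallel all-reduce and its special-cased
# 2-norm and inf-norm paths.
import torch

def global_grad_norm_sketch(grads, norm_type=2.0):
    total_norm = 0.0
    for grad in grads:
        grad_norm = torch.norm(grad, norm_type)   # local p-norm of one tensor
        total_norm += grad_norm.item() ** norm_type
    return total_norm ** (1.0 / norm_type)

# Usage: gradients with L2 norms 3 and 4 give a global norm of 5.
# ----------------------------------------------------------------------------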
@@ -169,7 +181,7 @@ index 368f587..080b06f 100644
          num_zeros_in_grad = self.count_zeros() if \
              self.log_num_zeros_in_grad else None
 diff --git a/megatron/schedules.py b/megatron/schedules.py
-index d346c30..8eef46c 100644
+index d346c30d..8eef46c8 100644
 --- a/megatron/schedules.py
 +++ b/megatron/schedules.py
 @@ -23,7 +23,11 @@ from megatron import get_timers
@@ -318,7 +330,7 @@ index d346c30..8eef46c 100644
          p2p_communication.send_backward(input_tensor_grad, timers=timers)

 diff --git a/megatron/training.py b/megatron/training.py
-index 1ab57e9..fbe2fe8 100644
+index 1ab57e9c..fbe2fe8e 100644
 --- a/megatron/training.py
 +++ b/megatron/training.py
 @@ -35,14 +35,23 @@ from megatron import update_num_microbatches