Skip to content

Commit 658538e

Browse files
committed
fix argument issue for megatron v2.5
1 parent 8fbcc38 commit 658538e

File tree

1 file changed

+20
-8
lines changed

1 file changed

+20
-8
lines changed

examples/megatron/v2.5.patch

+20-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
diff --git a/megatron/arguments.py b/megatron/arguments.py
2-
index b35af1d..2a55699 100644
2+
index b35af1df..4c36694f 100644
33
--- a/megatron/arguments.py
44
+++ b/megatron/arguments.py
55
@@ -20,6 +20,9 @@ import os
@@ -22,9 +22,21 @@ index b35af1d..2a55699 100644
2222
# Custom arguments.
2323
if extra_args_provider is not None:
2424
parser = extra_args_provider(parser)
25-
25+
@@ -232,7 +238,11 @@ def parse_args(extra_args_provider=None, defaults={},
26+
assert args.checkpoint_activations, \
27+
'for distribute-checkpointed-activations to work you '\
28+
'need to enable checkpoint-activations'
29+
-
30+
+ # if fmoe_num_experts is not specified,
31+
+ # we are using lower version of megatron,
32+
+ # copy num_experts to fmoe_num_experts
33+
+ if not hasattr(args, 'fmoe_num_experts'):
34+
+ args.fmoe_num_experts = args.num_experts
35+
_print_args(args)
36+
return args
37+
2638
diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py
27-
index 1251066..32afb2f 100644
39+
index 12510662..32afb2fa 100644
2840
--- a/megatron/data/indexed_dataset.py
2941
+++ b/megatron/data/indexed_dataset.py
3042
@@ -95,7 +95,7 @@ dtypes = {
@@ -46,7 +58,7 @@ index 1251066..32afb2f 100644
4658
}
4759

4860
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
49-
index 823a51f..32f4b2e 100644
61+
index 823a51f4..32f4b2e1 100644
5062
--- a/megatron/optimizer/__init__.py
5163
+++ b/megatron/optimizer/__init__.py
5264
@@ -69,8 +69,10 @@ def get_megatron_optimizer(model):
@@ -63,7 +75,7 @@ index 823a51f..32f4b2e 100644
6375
if args.fp16 or args.bf16:
6476

6577
diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py
66-
index 036a1d4..81d5bd9 100644
78+
index 036a1d4c..81d5bd96 100644
6779
--- a/megatron/optimizer/clip_grads.py
6880
+++ b/megatron/optimizer/clip_grads.py
6981
@@ -54,17 +54,23 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
@@ -123,7 +135,7 @@ index 036a1d4..81d5bd9 100644
123135
grad_norm = torch.norm(grad, norm_type)
124136
total_norm += grad_norm ** norm_type
125137
diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
126-
index 368f587..080b06f 100644
138+
index 368f5875..080b06f0 100644
127139
--- a/megatron/optimizer/optimizer.py
128140
+++ b/megatron/optimizer/optimizer.py
129141
@@ -250,6 +250,9 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
@@ -169,7 +181,7 @@ index 368f587..080b06f 100644
169181
num_zeros_in_grad = self.count_zeros() if \
170182
self.log_num_zeros_in_grad else None
171183
diff --git a/megatron/schedules.py b/megatron/schedules.py
172-
index d346c30..8eef46c 100644
184+
index d346c30d..8eef46c8 100644
173185
--- a/megatron/schedules.py
174186
+++ b/megatron/schedules.py
175187
@@ -23,7 +23,11 @@ from megatron import get_timers
@@ -318,7 +330,7 @@ index d346c30..8eef46c 100644
318330
p2p_communication.send_backward(input_tensor_grad, timers=timers)
319331

320332
diff --git a/megatron/training.py b/megatron/training.py
321-
index 1ab57e9..fbe2fe8 100644
333+
index 1ab57e9c..fbe2fe8e 100644
322334
--- a/megatron/training.py
323335
+++ b/megatron/training.py
324336
@@ -35,14 +35,23 @@ from megatron import update_num_microbatches

0 commit comments

Comments
 (0)