Fix the hanging problem of init and other problems

aoyulong · Aug 18, 2024 · 66c1d3c · 66c1d3c
1 parent e46aa71
commit 66c1d3c
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 6 deletions.
diff --git a/megatron/megatron/core/parallel_state.py b/megatron/megatron/core/parallel_state.py
@@ -768,7 +768,7 @@ def generator_wrapper(group_type, **kwargs):
             _POSITION_EMBEDDING_GROUP = group
             _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
 
-    _LAST_RANK_WHEN_USING_PIPELINE = generator_wrapper('pp')[-1][-1] 
+    _LAST_RANK_WHEN_USING_PIPELINE = decoder_rank_generator.get_ranks('pp')[-1][-1] 
 
     # Build the tensor + data parallel groups.
     global _TENSOR_AND_DATA_PARALLEL_GROUP

diff --git a/megatron/megatron/core/transformer/custom_layers/transformer_engine.py b/megatron/megatron/core/transformer/custom_layers/transformer_engine.py
@@ -29,7 +29,7 @@
 def get_te_version():
     def get_te_version_str():
         if hasattr(te, '__version__'):
-            return str(te.__version__)
+            return str(te.__version__).split('+')[0]
         else:
             return version("transformer-engine")
 
@@ -895,7 +895,7 @@ def te_checkpoint(
     def get_cpu_offload_context(
         enabled, num_layers, model_layers, activation_offloading, weight_offloading
     ):
-        if _te_version > packaging.version.Version("1.8.0"):
+        if _te_version > packaging.version.Version("1.9.0"):
             context, sync_func = _get_cpu_offload_context(
                 enabled, num_layers, model_layers, activation_offloading, weight_offloading
             )

diff --git a/megatron/megatron/training/initialize.py b/megatron/megatron/training/initialize.py
@@ -263,8 +263,9 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
             'rank': args.rank,
             'timeout': timedelta(minutes=args.distributed_timeout_minutes),
         }
-        if packaging.version.Version(torch.__version__) >= packaging.version.Version("2.3.0"):
-            init_process_group_kwargs['device_id'] = device_id
+        # TODO: @aoyulong the init_process_group will be hanging if the device_id is set 
+        # if packaging.version.Version(torch.__version__) >= packaging.version.Version("2.3.0"):
+        #     init_process_group_kwargs['device_id'] = device_id
 
         torch.distributed.init_process_group(**init_process_group_kwargs)
 

diff --git a/tests/scripts/unit_test_megatron.sh b/tests/scripts/unit_test_megatron.sh
@@ -14,7 +14,13 @@ export PYTHONPATH=..:$PYTHONPATH
 run_pytest() {
   local test_path=$1
   echo "Running $test_path"
-  torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/$code_id/cov-report-megatron --cov=megatron/core -q -x -p no:warnings $test_path
+
+  if [ "$test_path" == "tests/unit_tests/models" ]; then
+    torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/$code_id/cov-report-megatron --cov=megatron/core -q -x -p no:warnings --ignore=tests/unit_tests/models/test_mamba_model.py $test_path
+  else
+    torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/$code_id/cov-report-megatron --cov=megatron/core -q -x -p no:warnings $test_path
+  fi
+
   if [ $? -ne 0 ]; then
     echo "Pytest failed for $test_path"
     exit 1