
Commit

Revert "Add distopt support for FP8 params and BF16 optimizer state (NVIDIA#7909)"

This reverts commit 6082d76.
Jaemin Choi committed Jan 17, 2024
1 parent 8bdf533 commit 2dddde4
Showing 8 changed files with 76 additions and 497 deletions.
9 changes: 4 additions & 5 deletions Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.11-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.10-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
@@ -69,16 +69,15 @@ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
git checkout e122536b7645edcb7ebf099b5c92a443f7dbf8e7 && \
pip install .

# Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
# Distributed Adam support for multiple dtypes
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \
git checkout 52e18c894223800cb611682dce27d88050edf1de && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Transformer Engine 1.2.0
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
git fetch origin 8eae4ce2b8fdfbbe525fc8bfecb0df5498cc9687 && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
17 changes: 3 additions & 14 deletions Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
agent {
docker {
image 'nvcr.io/nvidia/pytorch:23.11-py3'
image 'nvcr.io/nvidia/pytorch:23.10-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1'
}
}
@@ -57,28 +57,17 @@ pipeline {
}
}

// Transformer Engine 1.2.0
stage('Transformer Engine installation') {
steps {
sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \
git fetch origin cf6fc898286e4ad347ff88925c88663324e2b87d && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
}
}

// Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
stage('Apex installation') {
steps {
sh 'git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \
cp -R apex /usr/local/lib/python3.10/dist-packages'
}
}

// pip package should be working with main, if not we can update the commit here
// until the pip package is updated
// stage('Megatron Core installation') {
@@ -3246,7 +3235,7 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results"
sh "rm -rf examples/nlp/language_modeling/bert_index_mappings"
}
}
}
stage('L2: Megatron RETRO Pretraining and Resume Training') {
when {
anyOf {
8 changes: 4 additions & 4 deletions README.rst
@@ -48,8 +48,8 @@ Latest News
:alt: H200-NeMo-performance
:width: 600

NeMo Framework has been updated with state-of-the-art features,
such as FSDP, Mixture-of-Experts, and RLHF with TensorRT-LLM to provide speedups up to 4.2x for Llama-2 pre-training on H200.
NeMo Framework has been updated with state-of-the-art features,
such as FSDP, Mixture-of-Experts, and RLHF with TensorRT-LLM to provide speedups up to 4.2x for Llama-2 pre-training on H200.
**All of these features will be available in an upcoming release.**


@@ -326,7 +326,7 @@ To install Apex, run
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
It is highly recommended to use the NVIDIA PyTorch or NeMo container if you are having issues installing Apex or any other dependencies.
@@ -366,7 +366,7 @@ Transformer Engine requires PyTorch to be built with CUDA 11.8.

Flash Attention
~~~~~~~~~~~~~~~~~~~~
Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models, please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_. If you want to use Flash Attention with attention bias (introduced by position encoding, e.g. ALiBi), please also install the pinned Triton version following the `implementation <https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3>`_.
Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models, please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_. If you want to use Flash Attention with attention bias (introduced by position encoding, e.g. ALiBi), please also install the pinned Triton version following the `implementation <https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3>`_.

.. code-block:: bash
nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -41,7 +41,7 @@
from nemo.collections.nlp.parts.nlp_overrides import NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, GradScaler
from nemo.collections.nlp.parts.utils_funcs import activation_to_func
from nemo.core.optim import MainParamsOptimizerWrapper, prepare_lr_scheduler
from nemo.utils import AppState, logging, str_to_dtype
from nemo.utils import AppState, logging
from nemo.utils.get_rank import is_global_rank_zero

try:
@@ -656,38 +656,18 @@ def setup_optimization(
self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None,
):
optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()

def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any:
"""Get keyword argument from config"""
val = None
if val is None and optim_kwargs:
val = optim_kwargs.get(key, None)
if val is None and optim_config:
val = optim_config.get(key, None)
if val is None and self._cfg.optim:
val = self._cfg.optim.get(key, None)
if val is None:
val = default_value
return val

if self.with_distributed_adam:
# Allocate contiguous grad buffer to avoid extra copies
optim_kwargs['contiguous_grad_buffer'] = get_config_arg('contiguous_grad_buffer', True)
if self.megatron_amp_O2 and not optim_kwargs['contiguous_grad_buffer']:
raise ValueError(
"Distributed Adam optimizer requires contiguous param buffer for O2. "
"Either enable contiguous_grad_buffer or disable megatron_amp_O2."
)
# Allocate contiguous buffer to avoid extra copies
optim_kwargs['contiguous_grad_buffer'] = True

# Optimizer dtype
optim_dtype = str_to_dtype(get_config_arg('dtype', torch.float32))
# Make sure optimizer state is in FP32
optim_dtype = torch.float32
optim_kwargs['dtype'] = optim_dtype

# Make sure embedding grad reductions are in FP32
if optim_dtype == torch.float32:
for name, param in self.named_parameters():
if 'word_embedding' in name or 'position_embedding' in name or 'output_layer' in name:
param._with_fp32_optimizer = True
for name, param in self.named_parameters():
if 'word_embedding' in name or 'position_embedding' in name or 'output_layer' in name:
param._with_fp32_optimizer = True

# Match param allgather with model dtype
model_dtype = torch.float32
@@ -696,9 +676,7 @@ def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any:
optim_kwargs['param_sync_dtype'] = model_dtype

# Determine whether to store master params in optimizer
if self.cfg.get('fp8_params', False):
optim_kwargs['store_params'] = True
elif optim_dtype == model_dtype:
if optim_dtype == model_dtype:
optim_kwargs['store_params'] = False
elif optim_dtype == torch.float32 and model_dtype == torch.bfloat16:
optim_kwargs['store_params'] = False
@@ -765,11 +743,9 @@ def configure_optimizers(self):
if self.with_distributed_adam:

# Initialize param buckets if explicitly provided
if getattr(self, 'distributed_adam_buckets', None):
if hasattr(self, 'distributed_adam_buckets'):
for bucket in self.distributed_adam_buckets:
self._optimizer.init_params_bucket(bucket)
self._optimizer.init_params_bucket(self.parameters())
if hasattr(self, 'distributed_adam_buckets'):
del self.distributed_adam_buckets

# Make sure all params are initialized so main grads are
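The setup_optimization lines removed above resolved optimizer settings through a small precedence helper: explicit optim_kwargs win over the optim_config passed in, which wins over the model's cfg.optim section, with a final fallback default. A minimal, self-contained sketch of that lookup pattern follows; plain dicts stand in for the OmegaConf objects, and the signature here is illustrative rather than NeMo's actual method.

from typing import Any, Dict, Optional

import torch


def get_config_arg(
    key: str,
    optim_kwargs: Optional[Dict[str, Any]] = None,
    optim_config: Optional[Dict[str, Any]] = None,
    cfg_optim: Optional[Dict[str, Any]] = None,
    default_value: Optional[Any] = None,
) -> Any:
    """Return the first non-None value for `key`, checking explicit kwargs first,
    then the optimizer config, then the model's cfg.optim section, then a default."""
    for source in (optim_kwargs, optim_config, cfg_optim):
        if source:
            val = source.get(key, None)
            if val is not None:
                return val
    return default_value


# Example: with no dtype configured anywhere, the optimizer state dtype falls
# back to FP32, which is the behavior the revert hard-codes.
optim_dtype = get_config_arg(
    "dtype", optim_kwargs={}, optim_config=None, cfg_optim={"lr": 1e-4}, default_value=torch.float32
)
assert optim_dtype is torch.float32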
23 changes: 12 additions & 11 deletions nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -16,7 +16,6 @@
import os
import queue
import warnings
from contextlib import nullcontext
from dataclasses import fields
from functools import partial
from typing import Any, Dict, Iterator, List, Optional, Union
@@ -246,16 +245,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
)
else:
build_model_context = nullcontext
if HAVE_TE and self.cfg.get('fp8', False) and self.cfg.get('fp8_params', False):
build_model_context = transformer_engine.pytorch.fp8_model_init
with build_model_context():
self.model = build_model(
model_provider_func=self.model_provider_func,
wrap_with_ddp=False,
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
on_cpu=cfg.get('fsdp', False) and cfg.get('use_cpu_initialization', False),
)
self.model = build_model(
model_provider_func=self.model_provider_func,
wrap_with_ddp=False,
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
on_cpu=cfg.get('fsdp', False) and cfg.get('use_cpu_initialization', False),
)

# if we're not using interleaved, then self.model is a module.
if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None:
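The removed build_model lines above selected a context manager up front: Transformer Engine's fp8_model_init when both fp8 and fp8_params were enabled, otherwise contextlib.nullcontext, and then built the model inside it. A minimal sketch of that select-a-context pattern, with the config values and model builder stubbed out as assumptions:

from contextlib import nullcontext

try:
    import transformer_engine.pytorch as te  # optional dependency
    HAVE_TE = True
except ImportError:
    HAVE_TE = False

cfg = {"fp8": True, "fp8_params": True}  # illustrative flags, not NeMo defaults


def build_model():
    """Stand-in for the real model-building call."""
    return object()


# Default to a no-op context; only request FP8 parameter storage when
# Transformer Engine is importable and both flags are enabled.
build_model_context = nullcontext
if HAVE_TE and cfg.get("fp8", False) and cfg.get("fp8_params", False):
    build_model_context = te.fp8_model_init

with build_model_context():
    model = build_model()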
@@ -478,6 +473,12 @@ def configure_optimizers(self):
[p for p in layer.parameters() if not getattr(p, '_disable_overlap_grad_sync', False)]
)
buckets.reverse()
used_params = set()
for bucket in buckets:
used_params.update(bucket)
remaining_params = [p for p in self.parameters() if p not in used_params]
if remaining_params:
buckets.append(remaining_params)
self.distributed_adam_buckets = buckets

return super().configure_optimizers()
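The lines added to configure_optimizers above make sure every parameter lands in exactly one distributed-Adam bucket: parameters already placed in a per-layer overlap bucket are tracked in a set, and whatever is left over goes into a single trailing bucket. A standalone sketch of that bookkeeping, with strings standing in for parameter tensors:

from typing import List, Sequence


def complete_buckets(all_params: Sequence[str], buckets: List[List[str]]) -> List[List[str]]:
    """Append one extra bucket holding any parameter not already bucketed."""
    used_params = set()
    for bucket in buckets:
        used_params.update(bucket)
    remaining_params = [p for p in all_params if p not in used_params]
    if remaining_params:
        buckets.append(remaining_params)
    return buckets


params = ["embedding.weight", "layers.0.weight", "layers.1.weight", "output_layer.weight"]
per_layer_buckets = [["layers.1.weight"], ["layers.0.weight"]]  # reversed per-layer order, as in the diff
print(complete_buckets(params, per_layer_buckets))
# [['layers.1.weight'], ['layers.0.weight'], ['embedding.weight', 'output_layer.weight']]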