diff --git a/Dockerfile.ci b/Dockerfile.ci
index 10ea68f2c247..d51f79e1b3af 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.19.0
-ARG MCORE_TAG=213c8a23fa9fe95d19eff0932a1e6e71767f0962
+ARG MCORE_TAG=441cb9250101cf2cc406f0439b802f34f923f251
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 
 RUN \
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
index 67a4802d83f6..0d75ab7cc706 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
@@ -206,6 +206,8 @@ def forward(
         context=None,
         context_mask=None,
         rotary_pos_emb=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
         inference_params=None,
         packed_seq_params=None,
     ):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py
index 1783d5f5f3fd..131f154d6709 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py
@@ -55,7 +55,7 @@ class FalconTransformerLayer(TransformerLayer):
 
     Transformer layer takes input with size [s, b, h] and returns an
     output of the same size.
-    
+
     """
 
     def __init__(
@@ -106,6 +106,8 @@ def forward(
         context=None,
         context_mask=None,
         rotary_pos_emb=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
         inference_params=None,
         packed_seq_params=None,
     ):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
index e9fb1833fc08..d1945139dee9 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
@@ -250,6 +250,8 @@ def forward(
         context=None,
         context_mask=None,
         rotary_pos_emb=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
         inference_params=None,
         packed_seq_params=None,  # TODO: handle this
     ):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py
index 7a327a3a35cb..e0e3a2339ca1 100755
--- a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py
@@ -160,7 +160,9 @@ def forward(
         rotary_pos_emb = None
         self.decoder.input_tensor = None
         if self.position_embedding_type == 'rope':
-            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(None, self.decoder, hidden_states, self.config)
+            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
+                None, self.decoder, hidden_states, self.config, None
+            )
             rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
 
         hidden_states = self.decoder(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
index 5128b4ca6b16..da9c98fd94ea 100644
--- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
+++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
@@ -80,11 +80,21 @@ def forward(
         context: Tensor = None,
         context_mask: Tensor = None,
         rotary_pos_emb: Tensor = None,
+        rotary_pos_cos: Tensor = None,
+        rotary_pos_sin: Tensor = None,
         inference_params: InferenceParams = None,
         packed_seq_params: PackedSeqParams = None,
     ):
         hidden_states = super().forward(
-            hidden_states, attention_mask, context, context_mask, rotary_pos_emb, inference_params, packed_seq_params
+            hidden_states,
+            attention_mask,
+            context,
+            context_mask,
+            rotary_pos_emb,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            inference_params,
+            packed_seq_params,
         )
 
         mlp_head_adapter = self.get_adapter_module(AdapterName.MLP_HEAD_ADAPTER)
@@ -220,6 +230,8 @@ def forward(
         inference_params=None,
         rotary_pos_emb=None,
         packed_seq_params=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
     ):
         # hidden_states: [sq, b, h]
 
@@ -237,8 +249,8 @@ def forward(
         # ===================================================
         # Adjust key, value, and rotary_pos_emb for inference
         # ===================================================
-        key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference(
-            inference_params, key, value, rotary_pos_emb
+        query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference(
+            inference_params, query, key, value, rotary_pos_emb
         )
 
         if packed_seq_params is not None: