New mcore transformer block spec #9035

Merged
@@ -728,8 +728,7 @@ def dummy():
 config=self.transformer_config,
 transformer_layer_spec=get_specs(
     self.spec_name,
-    self.transformer_config.num_moe_experts,
-    self.transformer_config.moe_grouped_gemm,
+    self.transformer_config,
     self.transformer_engine,
 ),
 vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
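For orientation, a hedged sketch of the calling-convention change that this hunk and the later call sites apply (the get_specs change itself appears further down): callers now hand get_specs the whole TransformerConfig, and the MoE fields are derived inside the function. The import path below is assumed from this PR's context rather than stated in the diff.

    # Sketch only, assuming a NeMo install where get_specs lives at this path.
    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import get_specs

    def build_layer_spec(spec_name, transformer_config, use_te=True):
        # Pre-PR convention: MoE fields were unpacked at every call site, e.g.
        #     get_specs(spec_name, transformer_config.num_moe_experts,
        #               transformer_config.moe_grouped_gemm, use_te)
        # Post-PR convention: pass the whole TransformerConfig; get_specs reads
        # num_moe_experts / moe_grouped_gemm itself and can forward the config
        # to specs that need it (e.g. the full-TE autocast block spec below).
        return get_specs(spec_name, transformer_config, use_te)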
@@ -501,8 +501,7 @@ def __init__(
 add_class_token = True
 vision_layer_spec = get_specs(
     model_cfg.text.get('name', ''),
-    vision_transformer_config.num_moe_experts,
-    vision_transformer_config.moe_grouped_gemm,
+    vision_transformer_config,
     model_cfg.get('transformer_engine', True),
 )
 vision_layer_spec.submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask
@@ -527,8 +526,7 @@ def __init__(
 config=text_transformer_config,
 transformer_layer_spec=get_specs(
     model_cfg.text.get('name', ''),
-    text_transformer_config.num_moe_experts,
-    text_transformer_config.moe_grouped_gemm,
+    text_transformer_config,
     model_cfg.get('transformer_engine', True),
 ),
 vocab_size=model_cfg.text.get('override_vocab_size', padded_vocab_size),
@@ -35,7 +35,9 @@

 try:
     from megatron.core import parallel_state, tensor_parallel
+    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
     from megatron.core.transformer.spec_utils import ModuleSpec
+    from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
     from megatron.core.transformer.transformer_layer import BaseTransformerLayer
     from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint

@@ -322,8 +324,10 @@


 # Use this spec to use the full Transformer layer from Transformer Engine
-def get_gpt_full_te_layer_autocast_spec() -> ModuleSpec:
+def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
     if not HAVE_MEGATRON_CORE or not HAVE_TE:
         raise ImportError(IMPORT_ERROR)

-    return ModuleSpec(module=TETransformerLayerAutocast)
+    num_layers = get_num_layers_to_build(transformer_config)
+    return TransformerBlockSubmodules(
+        layer_specs=[ModuleSpec(module=TETransformerLayerAutocast)] * num_layers, layer_norm=FusedLayerNorm
+    )
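A minimal sketch of what the updated spec function now returns (assuming megatron-core and Transformer Engine are installed): block-level submodules carrying one layer spec per layer to build on the local pipeline stage plus the block's final fused layer norm, instead of a single layer-level ModuleSpec. Here layer_module and num_layers_to_build are stand-ins for TETransformerLayerAutocast and get_num_layers_to_build(transformer_config), which sizes the block for the local pipeline stage and therefore runs after megatron parallel state is set up, as it is inside NeMo's model construction.

    # Illustrative sketch; mirrors the shape of the return value in the hunk above.
    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
    from megatron.core.transformer.spec_utils import ModuleSpec
    from megatron.core.transformer.transformer_block import TransformerBlockSubmodules

    def autocast_block_submodules(layer_module, num_layers_to_build):
        # One ModuleSpec per transformer layer on this pipeline stage, plus the
        # layer-norm implementation applied at the end of the block.
        return TransformerBlockSubmodules(
            layer_specs=[ModuleSpec(module=layer_module)] * num_layers_to_build,
            layer_norm=FusedLayerNorm,
        )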
@@ -138,7 +138,11 @@ def mcore_supports_moe() -> bool:


 ## TODO: This function will not work if TE is not installed
-def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True, hyena_cfg: Dict = None):
+def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None):
+    # else cases for backwards compatibility with neva
+    num_experts = transformer_config.num_moe_experts if transformer_config else None
+    moe_grouped_gemm = transformer_config.moe_grouped_gemm if transformer_config else False
+
     if num_experts is not None:
         assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE"

@@ -148,7 +152,7 @@ def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True,
"": get_gpt_layer_local_spec(num_experts, moe_grouped_gemm),
"te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm),
"megatron_falcon_gpt": get_falcon_layer_spec(),
"megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(),
"megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(transformer_config),
"modelopt": get_gpt_layer_modelopt_spec(num_experts),
"te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg),
}
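A hedged sketch of how the mapping above reaches megatron-core's GPT model: the "megatron_gpt_full_te_layer_autocast" entry now resolves to a TransformerBlockSubmodules rather than a single ModuleSpec, and megatron-core's TransformerBlock accepts either form through transformer_layer_spec. The import paths and the constructor arguments beyond those visible in this diff are assumptions.

    # Sketch only; mirrors the call sites touched by this PR.
    from megatron.core.models.gpt import GPTModel as MCoreGPTModel  # assumed import path
    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import get_specs  # assumed path

    def build_model(transformer_config, spec_name, vocab_size, max_sequence_length, use_te=True):
        return MCoreGPTModel(
            config=transformer_config,
            # Either a per-layer ModuleSpec or, for the full-TE autocast name,
            # a TransformerBlockSubmodules built for the local pipeline stage.
            transformer_layer_spec=get_specs(spec_name, transformer_config, use_te),
            vocab_size=vocab_size,
            max_sequence_length=max_sequence_length,
        )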
@@ -415,8 +419,7 @@ def model_provider_func(self, pre_process, post_process):
 config=self.transformer_config,
 transformer_layer_spec=get_specs(
     self.spec_name,
-    self.transformer_config.num_moe_experts,
-    self.transformer_config.moe_grouped_gemm,
+    self.transformer_config,
     self.transformer_engine,
     self.cfg.get('hyena', None),
 ),