
Commit e7f69c0

Nave Assaf authored and Mu Huai committed
[Model] Update support for NemotronNAS models (vllm-project#15008)
Signed-off-by: Nave Assaf <nassaf@nvidia.com>
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
1 parent: 7fdc7b0

8 files changed (+524, −133 lines)


docs/source/models/supported_models.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -224,7 +224,7 @@ See [this page](#generative-models) for more information on how to use generative models.
   * ✅︎
 - * `DeciLMForCausalLM`
   * DeciLM
-  * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
+  * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.
   *
   * ✅︎
 - * `DeepseekForCausalLM`
```
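
The table row now points `DeciLMForCausalLM` at the NemotronNAS checkpoint. As a quick sanity check, here is a minimal sketch of loading it through vLLM's offline `LLM` API; the `tensor_parallel_size` value is illustrative only and assumes enough GPU memory for a 49B-parameter model:

```python
# Minimal sketch: serving the newly listed checkpoint with vLLM's
# offline LLM API. The parallelism setting is an illustrative value,
# not something prescribed by this commit.
from vllm import LLM, SamplingParams

llm = LLM(
    model="nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,   # the nemotron-nas config class lives in the checkpoint repo
    tensor_parallel_size=4,   # illustrative; size to your GPUs
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```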

tests/models/registry.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -112,7 +112,7 @@ def check_available_online(
     "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024",  # noqa: E501
                                           trust_remote_code=True),
     "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
-    "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct",
+    "DeciLMForCausalLM": _HfExamplesInfo("nvidia/Llama-3_3-Nemotron-Super-49B-v1",  # noqa: E501
                                          trust_remote_code=True),
     "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
     "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat",  # noqa: E501
```

vllm/config.py

Lines changed: 20 additions & 1 deletion
```diff
@@ -411,6 +411,7 @@ def __init__(
 
         self.is_attention_free = self._init_attention_free()
         self.is_hybrid = self._init_is_hybrid()
+        self.has_noops = self._init_has_noops()
         self.has_inner_state = self._init_has_inner_state()
 
         if current_platform.is_neuron():
@@ -510,6 +511,10 @@ def _init_attention_free(self) -> bool:
     def _init_is_hybrid(self) -> bool:
         return self.registry.is_hybrid_model(self.architectures)
 
+    def _init_has_noops(self) -> bool:
+        architectures = getattr(self.hf_config, "architectures", [])
+        return self.registry.is_noops_model(architectures)
+
     def _init_has_inner_state(self) -> bool:
         return self.registry.model_has_inner_state(self.architectures)
 
```
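`_init_has_noops` mirrors `_init_is_hybrid` just above it: it asks the model registry whether any of the requested architectures declares no-op layers. A rough, illustrative sketch of that kind of lookup follows; it is not the actual `ModelRegistry.is_noops_model` implementation, and `_NOOPS_MODELS` is a hypothetical stand-in:

```python
# Illustrative sketch of a registry-style capability lookup; the real
# vLLM ModelRegistry implementation differs. `_NOOPS_MODELS` is a
# hypothetical mapping from architecture name to the "has no-op
# layers" capability.
_NOOPS_MODELS = {"DeciLMForCausalLM"}

def is_noops_model(architectures: list[str]) -> bool:
    # True if any requested architecture declares no-op layers.
    return any(arch in _NOOPS_MODELS for arch in architectures)

print(is_noops_model(["DeciLMForCausalLM"]))  # True
print(is_noops_model(["LlamaForCausalLM"]))   # False
```
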
```diff
@@ -872,6 +877,14 @@ def get_total_num_kv_heads(self) -> int:
             return getattr(self.hf_config.attn_config, "kv_n_heads",
                            self.hf_config.num_attention_heads)
 
+        if self.hf_config.model_type == "nemotron-nas":
+            for block in self.hf_config.block_configs:
+                if not block.attention.no_op:
+                    return self.hf_config.num_attention_heads \
+                        // block.attention.n_heads_in_group
+
+            raise RuntimeError("Couldn't determine number of kv heads")
+
         if self.is_attention_free:
             return 0
 
```
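The nemotron-nas branch derives the KV-head count from the first block whose attention is not a no-op: with grouped-query attention, `num_attention_heads // n_heads_in_group` gives the number of KV heads. A worked example with stand-in dataclasses and illustrative numbers (not the real checkpoint config):

```python
# Worked example of the kv-head derivation above; the dataclasses and
# numbers are illustrative stand-ins, not the real nemotron-nas config.
from dataclasses import dataclass
from typing import Optional

@dataclass
class AttentionConfig:
    no_op: bool
    n_heads_in_group: Optional[int] = None

@dataclass
class BlockConfig:
    attention: AttentionConfig

num_attention_heads = 64
block_configs = [
    BlockConfig(AttentionConfig(no_op=True)),                       # skipped: no-op layer
    BlockConfig(AttentionConfig(no_op=False, n_heads_in_group=8)),  # first real attention block
]

for block in block_configs:
    if not block.attention.no_op:
        # 64 query heads grouped 8-to-1 -> 8 kv heads
        print(num_attention_heads // block.attention.n_heads_in_group)  # 8
        break
```
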
```diff
@@ -940,7 +953,9 @@ def get_num_layers_by_block_type(
         # This function relies on 'layers_block_type' in hf_config,
         # for w/o this attribute, we will need to have workarounds like so
         attn_block_type = block_type == LayerBlockType.attention
-        is_transformer = not self.is_hybrid and not self.is_attention_free
+        is_transformer = not self.is_hybrid and \
+            not self.has_noops and \
+            not self.is_attention_free
         start, end = self.get_layers_start_end_indices(parallel_config)
 
         if is_transformer:
@@ -951,6 +966,10 @@ def get_num_layers_by_block_type(
             # Note that this code assumes there
             # is only one type of attention-free block type.
             return 0 if attn_block_type else end - start
+        elif self.has_noops:
+            block_configs = self.hf_config.block_configs
+            return sum(not bc.attention.no_op
+                       for bc in block_configs[start:end])
         else:
             # Hybrid model
             layers_block_type_value = getattr(self.hf_config,
```
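
With no-op blocks in the picture, `get_num_layers_by_block_type` can no longer assume one attention layer per block; it counts the non-no-op blocks in this rank's slice instead. A small self-contained sketch of that counting, again with stand-in types and illustrative values:

```python
# Sketch of the no-op-aware layer count; a real nemotron-nas config
# carries one block config per decoder layer.
from dataclasses import dataclass

@dataclass
class AttentionConfig:
    no_op: bool

@dataclass
class BlockConfig:
    attention: AttentionConfig

block_configs = [BlockConfig(AttentionConfig(no_op=False)),
                 BlockConfig(AttentionConfig(no_op=True)),
                 BlockConfig(AttentionConfig(no_op=False))]
start, end = 0, 3  # this rank's slice of the layer range

# Summing booleans counts the attention (non-no-op) blocks: 2 here.
print(sum(not bc.attention.no_op for bc in block_configs[start:end]))
```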

vllm/model_executor/models/decilm.py

Lines changed: 0 additions & 124 deletions
This file was deleted.

vllm/model_executor/models/interfaces.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -411,6 +411,35 @@ def is_hybrid(
     return isinstance(model, IsHybrid)
 
 
+@runtime_checkable
+class HasNoOps(Protocol):
+    has_noops: ClassVar[Literal[True]] = True
+
+
+@runtime_checkable
+class _HasNoOpsType(Protocol):
+    has_noops: ClassVar[Literal[True]]
+
+
+@overload
+def has_noops(model: object) -> TypeIs[HasNoOps]:
+    ...
+
+
+@overload
+def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]:
+    ...
+
+
+def has_noops(
+    model: Union[Type[object], object]
+) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]:
+    if isinstance(model, type):
+        return isinstance(model, _HasNoOpsType)
+
+    return isinstance(model, HasNoOps)
+
+
 @runtime_checkable
 class SupportsCrossEncoding(Protocol):
     """The interface required for all models that support cross encoding."""
```
