
Commit e7f69c0

Nave Assaf authored and Mu Huai committed
[Model] Update support for NemotronNAS models (vllm-project#15008)
Signed-off-by: Nave Assaf <nassaf@nvidia.com>
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
1 parent: 7fdc7b0

8 files changed (+524, −133 lines)


docs/source/models/supported_models.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -224,7 +224,7 @@ See [this page](#generative-models) for more information on how to use generative models.
   * ✅︎
 - * `DeciLMForCausalLM`
   * DeciLM
-  * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
+  * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.
   *
   * ✅︎
 - * `DeepseekForCausalLM`
```
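
The table row now points `DeciLMForCausalLM` at the NemotronNAS checkpoint. As a quick sanity check, here is a minimal sketch of loading it through vLLM's offline `LLM` API; the `tensor_parallel_size` value is illustrative only and assumes enough GPU memory for a 49B-parameter model:

```python
# Minimal sketch: serving the newly listed checkpoint with vLLM's
# offline LLM API. The parallelism setting is an illustrative value,
# not something prescribed by this commit.
from vllm import LLM, SamplingParams

llm = LLM(
    model="nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,   # the nemotron-nas config class lives in the checkpoint repo
    tensor_parallel_size=4,   # illustrative; size to your GPUs
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```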

tests/models/registry.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -112,7 +112,7 @@ def check_available_online(
     "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024",  # noqa: E501
                                           trust_remote_code=True),
     "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
-    "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct",
+    "DeciLMForCausalLM": _HfExamplesInfo("nvidia/Llama-3_3-Nemotron-Super-49B-v1",  # noqa: E501
                                          trust_remote_code=True),
     "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"),
     "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat",  # noqa: E501
```

vllm/config.py

Lines changed: 20 additions & 1 deletion
```diff
@@ -411,6 +411,7 @@ def __init__(
 
         self.is_attention_free = self._init_attention_free()
         self.is_hybrid = self._init_is_hybrid()
+        self.has_noops = self._init_has_noops()
         self.has_inner_state = self._init_has_inner_state()
 
         if current_platform.is_neuron():
@@ -510,6 +511,10 @@ def _init_attention_free(self) -> bool:
     def _init_is_hybrid(self) -> bool:
         return self.registry.is_hybrid_model(self.architectures)
 
+    def _init_has_noops(self) -> bool:
+        architectures = getattr(self.hf_config, "architectures", [])
+        return self.registry.is_noops_model(architectures)
+
     def _init_has_inner_state(self) -> bool:
         return self.registry.model_has_inner_state(self.architectures)
 
```
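`_init_has_noops` mirrors `_init_is_hybrid` just above it: it asks the model registry whether any of the requested architectures declares no-op layers. A rough, illustrative sketch of that kind of lookup follows; it is not the actual `ModelRegistry.is_noops_model` implementation, and `_NOOPS_MODELS` is a hypothetical stand-in:

```python
# Illustrative sketch of a registry-style capability lookup; the real
# vLLM ModelRegistry implementation differs. `_NOOPS_MODELS` is a
# hypothetical mapping from architecture name to the "has no-op
# layers" capability.
_NOOPS_MODELS = {"DeciLMForCausalLM"}

def is_noops_model(architectures: list[str]) -> bool:
    # True if any requested architecture declares no-op layers.
    return any(arch in _NOOPS_MODELS for arch in architectures)

print(is_noops_model(["DeciLMForCausalLM"]))  # True
print(is_noops_model(["LlamaForCausalLM"]))   # False
```
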
```diff
@@ -872,6 +877,14 @@ def get_total_num_kv_heads(self) -> int:
             return getattr(self.hf_config.attn_config, "kv_n_heads",
                            self.hf_config.num_attention_heads)
 
+        if self.hf_config.model_type == "nemotron-nas":
+            for block in self.hf_config.block_configs:
+                if not block.attention.no_op:
+                    return self.hf_config.num_attention_heads \
+                        // block.attention.n_heads_in_group
+
+            raise RuntimeError("Couldn't determine number of kv heads")
+
         if self.is_attention_free:
             return 0
 
```
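The nemotron-nas branch derives the KV-head count from the first block whose attention is not a no-op: with grouped-query attention, `num_attention_heads // n_heads_in_group` gives the number of KV heads. A worked example with stand-in dataclasses and illustrative numbers (not the real checkpoint config):

```python
# Worked example of the kv-head derivation above; the dataclasses and
# numbers are illustrative stand-ins, not the real nemotron-nas config.
from dataclasses import dataclass
from typing import Optional

@dataclass
class AttentionConfig:
    no_op: bool
    n_heads_in_group: Optional[int] = None

@dataclass
class BlockConfig:
    attention: AttentionConfig

num_attention_heads = 64
block_configs = [
    BlockConfig(AttentionConfig(no_op=True)),                       # skipped: no-op layer
    BlockConfig(AttentionConfig(no_op=False, n_heads_in_group=8)),  # first real attention block
]

for block in block_configs:
    if not block.attention.no_op:
        # 64 query heads grouped 8-to-1 -> 8 kv heads
        print(num_attention_heads // block.attention.n_heads_in_group)  # 8
        break
```
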
```diff
@@ -940,7 +953,9 @@ def get_num_layers_by_block_type(
         # This function relies on 'layers_block_type' in hf_config,
         # for w/o this attribute, we will need to have workarounds like so
         attn_block_type = block_type == LayerBlockType.attention
-        is_transformer = not self.is_hybrid and not self.is_attention_free
+        is_transformer = not self.is_hybrid and \
+            not self.has_noops and \
+            not self.is_attention_free
         start, end = self.get_layers_start_end_indices(parallel_config)
 
         if is_transformer:
@@ -951,6 +966,10 @@ def get_num_layers_by_block_type(
             # Note that this code assumes there
             # is only one type of attention-free block type.
             return 0 if attn_block_type else end - start
+        elif self.has_noops:
+            block_configs = self.hf_config.block_configs
+            return sum(not bc.attention.no_op
+                       for bc in block_configs[start:end])
         else:
             # Hybrid model
             layers_block_type_value = getattr(self.hf_config,
```
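
With no-op blocks in the picture, `get_num_layers_by_block_type` can no longer assume one attention layer per block; it counts the non-no-op blocks in this rank's slice instead. A small self-contained sketch of that counting, again with stand-in types and illustrative values:

```python
# Sketch of the no-op-aware layer count; a real nemotron-nas config
# carries one block config per decoder layer.
from dataclasses import dataclass

@dataclass
class AttentionConfig:
    no_op: bool

@dataclass
class BlockConfig:
    attention: AttentionConfig

block_configs = [BlockConfig(AttentionConfig(no_op=False)),
                 BlockConfig(AttentionConfig(no_op=True)),
                 BlockConfig(AttentionConfig(no_op=False))]
start, end = 0, 3  # this rank's slice of the layer range

# Summing booleans counts the attention (non-no-op) blocks: 2 here.
print(sum(not bc.attention.no_op for bc in block_configs[start:end]))
```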

vllm/model_executor/models/decilm.py

Lines changed: 0 additions & 124 deletions
This file was deleted.

vllm/model_executor/models/interfaces.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -411,6 +411,35 @@ def is_hybrid(
     return isinstance(model, IsHybrid)
 
 
+@runtime_checkable
+class HasNoOps(Protocol):
+    has_noops: ClassVar[Literal[True]] = True
+
+
+@runtime_checkable
+class _HasNoOpsType(Protocol):
+    has_noops: ClassVar[Literal[True]]
+
+
+@overload
+def has_noops(model: object) -> TypeIs[HasNoOps]:
+    ...
+
+
+@overload
+def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]:
+    ...
+
+
+def has_noops(
+    model: Union[Type[object], object]
+) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]:
+    if isinstance(model, type):
+        return isinstance(model, _HasNoOpsType)
+
+    return isinstance(model, HasNoOps)
+
+
 @runtime_checkable
 class SupportsCrossEncoding(Protocol):
     """The interface required for all models that support cross encoding."""
```
