
Commit 0b5e300

move variable to additional config
Signed-off-by: chenwaner <861645847@qq.com>
Parent: 8740191

4 files changed: +20 -20 lines changed

docs/source/user_guide/additional_config.md

Lines changed: 17 additions & 15 deletions
@@ -24,28 +24,29 @@ LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})
 
 The following table lists the additional configuration options available in vLLM Ascend:
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
-| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `1` | Expert tensor parallel size the model to use. |
+| Name                          | Type | Default | Description                                    |
+| ----------------------------- | ---- | ------- | ---------------------------------------------- |
+| `torchair_graph_config`       | dict | `{}`    | The config options for torchair graph mode     |
+| `ascend_scheduler_config`     | dict | `{}`    | The config options for ascend scheduler        |
+| `expert_tensor_parallel_size` | str  | `1`     | Expert tensor parallel size the model to use.  |
 
 The details of each config option are as follows:
 
 **torchair_graph_config**
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `enabled` | bool | `False` | Whether to enable torchair graph mode |
-| `use_cached_graph` | bool | `False` | Whether to use cached graph |
-| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
-| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| Name                     | Type      | Default | Description                                                        |
+| ------------------------ | --------- | ------- | ------------------------------------------------------------------ |
+| `enabled`                | bool      | `False` | Whether to enable torchair graph mode                              |
+| `use_cached_graph`       | bool      | `False` | Whether to use cached graph                                        |
+| `graph_batch_sizes`      | list[int] | `[]`    | The batch size for torchair graph cache                            |
+| `graph_batch_sizes_init` | bool      | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty  |
+| `enable_kv_nz`           | bool      | `False` | Whether to enable kvcache NZ layout                                |
 
 **ascend_scheduler_config**
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
+| Name      | Type | Default | Description                                      |
+| --------- | ---- | ------- | ------------------------------------------------ |
+| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine |
 
 ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well.
 
@@ -59,7 +60,8 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "use_cached_graph": true,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": true
+        "graph_batch_sizes_init": true,
+        "enable_kv_nz": false
     },
     "ascend_scheduler_config": {
         "enabled": true,

vllm_ascend/ascend_config.py

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,8 @@ def __init__(self, torchair_graph_config):
             "graph_batch_sizes_init", False)
         self.enable_multistream_shared_expert = torchair_graph_config.get(
             "enable_multistream_shared_expert", False)
+        self.enable_kv_nz = torchair_graph_config.get(
+            "enable_kv_nz", False)
 
         if not isinstance(self.graph_batch_sizes, list):
            raise TypeError("graph_batch_sizes must be list[int]")
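The added lines are plain dict lookup with a safe default, so an absent "enable_kv_nz" key silently falls back to False. A self-contained sketch of the same pattern; the class name here is hypothetical, only the .get() calls mirror the diff:

    class TorchairGraphConfigSketch:
        """Hypothetical stand-in for the config object built in ascend_config.py."""

        def __init__(self, torchair_graph_config: dict):
            # Missing keys fall back to a conservative default.
            self.enabled = torchair_graph_config.get("enabled", False)
            self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False)

    cfg = TorchairGraphConfigSketch({"enable_kv_nz": True})
    assert cfg.enable_kv_nz and not cfg.enabled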

vllm_ascend/attention/mla_v1.py

Lines changed: 1 addition & 2 deletions
@@ -13,7 +13,6 @@
1313

1414
from vllm_ascend.ascend_config import get_ascend_config
1515
from vllm_ascend.attention.attention_v1 import AscendAttentionState
16-
import vllm_ascend.envs as envs_ascend
1716
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
1817

1918
if TYPE_CHECKING:
@@ -444,9 +443,9 @@ def __init__(
444443
self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None)
445444
self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None)
446445

447-
self.enable_kv_nz = envs_ascend.VLLM_ENABLE_KV_NZ
448446
ascend_config = get_ascend_config()
449447
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
448+
self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
450449

451450
def _v_up_proj_and_o_proj(self, x):
452451
# Convert from (B, N, L) to (N, B, L)
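Net effect of this hunk: the switch is read per engine from the Ascend config rather than once per process from the environment. A hedged before/after sketch; get_ascend_config is the accessor shown in the diff, and the env read is how the removed path worked:

    import os

    # Before: a process-wide environment variable, parsed when envs.py was read.
    enable_kv_nz = bool(int(os.getenv("VLLM_ENABLE_KV_NZ", "0")))

    # After: a per-engine setting carried by additional_config, e.g.
    #   ascend_config = get_ascend_config()
    #   enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz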

vllm_ascend/envs.py

Lines changed: 0 additions & 3 deletions
@@ -55,9 +55,6 @@
5555
# Find more detail here: https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html
5656
"VLLM_ENABLE_MC2":
5757
lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
58-
# Whether to enable the kvcache nz optimization, the default value is False.
59-
"VLLM_ENABLE_KV_NZ":
60-
lambda: bool(int(os.getenv("VLLM_ENABLE_KV_NZ", '0'))),
6158
# Whether to enable the topk optimization. It's disabled by default for experimental support
6259
# We'll make it enabled by default in the future.
6360
"VLLM_ASCEND_ENABLE_TOPK_OPTIMZE":
