Move VLLM_ENABLE_KV_NZ env variable to additional config (enable_kv_nz)

chenwaner · chenwaner · commit aa5b9e412e78 · 2025-06-09T17:18:23.000+08:00
Signed-off-by: chenwaner <861645847@qq.com>
diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md
@@ -40,6 +40,7 @@ The details of each config option are as follows:
 | `use_cached_graph` | bool | `False` | Whether to use cached graph |
 | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
 | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| `enable_kv_nz` | bool | `False` | Whether to enable kvcache NZ layout |
 
 **ascend_scheduler_config**
 
@@ -59,12 +60,14 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "use_cached_graph": true,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": true
+        "graph_batch_sizes_init": true,
+        "enable_kv_nz": false
     },
     "ascend_scheduler_config": {
         "enabled": true,
         "chunked_prefill_enabled": true,
     },
-    "expert_tensor_parallel_size": 1
+    "expert_tensor_parallel_size": 1,
+    "refresh": false
 }
 ```
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
@@ -55,6 +55,7 @@ def __init__(self, torchair_graph_config):
             "graph_batch_sizes_init", False)
         self.enable_multistream_shared_expert = torchair_graph_config.get(
             "enable_multistream_shared_expert", False)
+        self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False)
 
         if not isinstance(self.graph_batch_sizes, list):
             raise TypeError("graph_batch_sizes must be list[int]")
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
@@ -55,9 +55,6 @@
     # Find more detail here: https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
-    # Whether to enable the kvcache nz optimization, the default value is False.
-    "VLLM_ENABLE_KV_NZ":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_KV_NZ", '0'))),
     # Whether to enable the topk optimization. It's disabled by default for experimental support
     # We'll make it enabled by default in the future.
     "VLLM_ASCEND_ENABLE_TOPK_OPTIMZE":