
Commit fccaf6d

DarkLight1337 authored and xuebwang-amd committed
[V1] Support LLM.apply_model (vllm-project#18465)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent dfda7b9 commit fccaf6d

File tree

17 files changed: +194 −169 lines


tests/conftest.py

Lines changed: 1 addition & 11 deletions
@@ -987,17 +987,7 @@ def score(
         return [req_output.outputs.score for req_output in req_outputs]
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        if hasattr(self.llm.llm_engine, "model_executor"):
-            # This works either in V0 or in V1 with
-            # VLLM_ENABLE_V1_MULTIPROCESSING=0
-            executor = self.llm.llm_engine.model_executor
-            return executor.apply_model(func)
-
-        # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
-        def _apply_model(self):
-            return func(self.get_model())
-
-        return self.llm.llm_engine.collective_rpc(_apply_model)
+        return self.llm.apply_model(func)
 
     def get_llm(self) -> LLM:
         return self.llm
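
For reference, a minimal sketch of the API this simplified helper now delegates to: `LLM.apply_model`, called directly on an `LLM` instance. The model name below is only illustrative, and the environment variable mirrors the fixtures added elsewhere in this diff, since per the removed comment the V1 path pickles the callable before dispatching it to workers.

```python
import os

# Assumption mirrored from the fixtures in this commit: the V1 engine pickles
# the callable it ships to worker processes, so pickle-based (insecure)
# serialization has to be allowed explicitly.
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

from vllm import LLM


def check_model(model):
    # Receives the loaded torch.nn.Module; whatever it returns is collected
    # into the list returned by apply_model (one entry per worker).
    return type(model).__name__


llm = LLM(model="facebook/opt-125m")  # illustrative model only
print(llm.apply_model(check_model))
```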

tests/kernels/moe/test_mxfp4_moe.py

Lines changed: 22 additions & 15 deletions
@@ -1,21 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import importlib
 import importlib.metadata
 from dataclasses import dataclass
+from importlib.util import find_spec
 from typing import Optional
 
 import pytest
 import torch
 from packaging import version
 
+from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
+    QuarkLinearMethod, QuarkW4A4MXFP4)
+from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
+    QuarkW4A4MXFp4MoEMethod)
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
 
-QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
-    "quark") is not None and version.parse(
-        importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
+QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
+    importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
 
 TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
 ) and current_platform.is_device_capability(100)
@@ -39,6 +42,12 @@ class ModelCase:
     tp: int
 
 
+@pytest.fixture(scope="function", autouse=True)
+def enable_pickle(monkeypatch):
+    """`LLM.apply_model` requires pickling a function."""
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+
+
 @pytest.mark.parametrize('model_case', [
     ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
     ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
@@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
                     tensor_parallel_size=model_case.tp,
                     load_format="dummy") as llm:
 
-        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
-        # Re-enable this later.
-        # def check_model(model):
-        #     layer = model.model.layers[0]
+        def check_model(model):
+            layer = model.model.layers[0]
 
-        #     qkv_proj = layer.self_attn.qkv_proj
+            qkv_proj = layer.self_attn.qkv_proj
 
-        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
-        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
+            assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+            assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
 
-        #     assert isinstance(layer.mlp.experts.quant_method,
-        #                       QuarkW4A4MXFp4MoEMethod)
+            assert isinstance(layer.mlp.experts.quant_method,
+                              QuarkW4A4MXFp4MoEMethod)
 
-        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
-        #     llm.apply_model(check_model)
+        if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
+            llm.apply_model(check_model)
 
         output = llm.generate_greedy("Today I am in the French Alps and",
                                      max_tokens=20)
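
The `enable_pickle` fixture above is repeated verbatim in several of the test modules touched by this commit. A possible consolidation (hypothetical, not part of this change) would be to host it once in a shared `conftest.py` so every module under that directory picks it up automatically:

```python
# conftest.py (hypothetical shared location, not part of this commit)
import pytest


@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # Same effect as the per-module fixtures: allow the V1 engine to pickle
    # the callable it ships to worker processes.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
```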

tests/models/multimodal/generation/test_qwen2_vl.py

Lines changed: 23 additions & 23 deletions
@@ -10,18 +10,17 @@
 
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
+from vllm.utils import set_default_torch_num_threads
 
 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                           PromptVideoInput, VllmRunner)
 from ...utils import check_logprobs_close
 
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    V1 Test: batch_make_xxxxx_embeddings calls a V0 internal
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+def enable_pickle(monkeypatch):
+    """`LLM.apply_model` requires pickling a function."""
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
 models = ["Qwen/Qwen2-VL-2B-Instruct"]
@@ -126,9 +125,8 @@ def get_image_embeds(model):
         image_grid_thw_on_device = image_grid_thw.to(visual.device,
                                                      dtype=torch.int64)
         return visual(pixel_values_on_device,
-                      grid_thw=image_grid_thw_on_device)
+                      grid_thw=image_grid_thw_on_device).cpu()
 
-    # V1 Test: this calls a V0 internal.
     image_embeds = torch.concat(llm.apply_model(get_image_embeds))
 
     # split into original batches
@@ -210,7 +208,7 @@ def get_image_embeds(model):
         video_grid_thw_on_device = video_grid_thw.to(visual.device,
                                                      dtype=torch.int64)
         return visual(pixel_values_on_device,
-                      grid_thw=video_grid_thw_on_device)
+                      grid_thw=video_grid_thw_on_device).cpu()
 
     # V1 Test: this calls a V0 internal.
     video_embeds = torch.concat(llm.apply_model(get_image_embeds))
@@ -266,19 +264,22 @@ def run_embedding_input_test(
     processor = AutoProcessor.from_pretrained(model)
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     runner="generate",
-                     max_model_len=4000,
-                     max_num_seqs=3,
-                     dtype=dtype,
-                     limit_mm_per_prompt={
-                         "image": mm_limit,
-                         "video": mm_limit
-                     },
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
+    with set_default_torch_num_threads(1):
+        vllm_model = vllm_runner(
+            model,
+            runner="generate",
+            max_model_len=4000,
+            max_num_seqs=3,
+            dtype=dtype,
+            limit_mm_per_prompt={
+                "image": mm_limit,
+                "video": mm_limit
+            },
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+        )
 
+    with vllm_model:
         outputs_per_case_for_original_input = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -329,9 +330,8 @@ def run_embedding_input_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
-                                         size_factors, dtype: str,
-                                         max_tokens: int,
-                                         num_logprobs: int) -> None:
+                                         size_factors, dtype, max_tokens,
+                                         num_logprobs, monkeypatch) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_case: list[tuple[

tests/models/quantization/test_awq.py

Lines changed: 1 addition & 1 deletion
@@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
                     monkeypatch) -> None:
 
     # Test V1: this test hangs during setup on single-scale input.
-    # TODO: fixure out why and re-enable this on V1.
+    # TODO: figure out why and re-enable this on V1.
    monkeypatch.setenv("VLLM_USE_V1", "0")
     run_awq_test(
         vllm_runner,

tests/quantization/test_compressed_tensors.py

Lines changed: 8 additions & 10 deletions
@@ -43,12 +43,9 @@
 
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This module relies on V0 internals, so set VLLM_USE_V1=0.
-    """
-    if not current_platform.is_cpu():
-        monkeypatch.setenv('VLLM_USE_V1', '0')
+def enable_pickle(monkeypatch):
+    """`LLM.apply_model` requires pickling a function."""
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
 @pytest.mark.parametrize(
@@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs(
 
     dtype = "bfloat16"
 
-    # skip language translation prompt for the static per tensor asym model
-    if (model_path ==
-            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
-        ):  # noqa: E501
+    # skip language translation prompt for the static per tensor models
+    if model_path in (
+            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    ):
         example_prompts = example_prompts[0:-1]
 
     with hf_runner(model_path, dtype=dtype) as hf_model:

tests/quantization/test_fp8.py

Lines changed: 4 additions & 4 deletions
@@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    # vllm_runner.apply_model() relies on V0 internals.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
+    # `LLM.apply_model` requires pickling a function.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
 
         def check_model(model):
@@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    # vllm_runner.apply_model() relies on V0 internals.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
+    # `LLM.apply_model` requires pickling a function.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

tests/quantization/test_gptq_dynamic.py

Lines changed: 38 additions & 33 deletions
@@ -31,41 +31,46 @@
 @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
 def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
                            monkeypatch):
-    # vllm_runner.apply_model() relies on V0 internals.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)
+    # `LLM.apply_model` requires pickling a function.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
     linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
         GPTQLinearMethod)
 
-    for name, submodule in (vllm_model.llm.llm_engine.model_executor.
-                            driver_worker.model_runner.model.named_modules()):
-        if name == "lm_head":
-            assert isinstance(submodule.quant_method, linear_method_cls)
-        elif name == 'model.layers.0.self_attn.qkv_proj':
-            # The first layer is quantized using bits=4, group_size=128
-            # desc_act=True
-            assert isinstance(submodule.quant_method, linear_method_cls)
-            config = submodule.quant_method.quant_config
-            assert config.weight_bits == 4
-            assert config.group_size == 128
-            assert config.desc_act
-        elif name == 'model.layers.1.self_attn.qkv_proj':
-            # The second layer is quantized using bits=8, group_size=32
-            # desc_act=False
-            assert isinstance(submodule.quant_method, linear_method_cls)
-            config = submodule.quant_method.quant_config
-            assert get_dynamic_override(config, layer_name=name,
-                                        key="bits") == 8
-            assert get_dynamic_override(config,
-                                        layer_name=name,
-                                        key="group_size") == 32
-            assert not get_dynamic_override(
-                config, layer_name=name, key="desc_act")
-        elif (name == 'model.layers.2.self_attn.qkv_proj'
-              or name == 'model.layers.2.mlp.gate_up_proj'):
-            # All other layers (layer index >= 2) are not quantized
-            assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
+    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
+
+        def check_model(model):
+            for name, submodule in model.named_modules():
+                if name == "lm_head":
+                    assert isinstance(submodule.quant_method,
+                                      linear_method_cls)
+                elif name == 'model.layers.0.self_attn.qkv_proj':
+                    # The first layer is quantized using bits=4, group_size=128
+                    # desc_act=True
+                    assert isinstance(submodule.quant_method,
+                                      linear_method_cls)
+                    config = submodule.quant_method.quant_config
+                    assert config.weight_bits == 4
+                    assert config.group_size == 128
+                    assert config.desc_act
+                elif name == 'model.layers.1.self_attn.qkv_proj':
+                    # The second layer is quantized using bits=8, group_size=32
+                    # desc_act=False
+                    assert isinstance(submodule.quant_method,
+                                      linear_method_cls)
+                    config = submodule.quant_method.quant_config
+                    assert get_dynamic_override(config,
+                                                layer_name=name,
+                                                key="bits") == 8
+                    assert get_dynamic_override(config,
+                                                layer_name=name,
+                                                key="group_size") == 32
+                    assert not get_dynamic_override(
+                        config, layer_name=name, key="desc_act")
+                elif (name == 'model.layers.2.self_attn.qkv_proj'
+                      or name == 'model.layers.2.mlp.gate_up_proj'):
+                    # All other layers (layer index >= 2) are not quantized
+                    assert isinstance(submodule.quant_method,
+                                      UnquantizedLinearMethod)
 
-    del vllm_model
+        llm.apply_model(check_model)
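
Worth noting about the refactor above: the old assertions walked `vllm_model.llm.llm_engine.model_executor.driver_worker.model_runner.model`, a path that, per the removed conftest comment, only works in V0 or in V1 with multiprocessing disabled. Routing the same checks through `llm.apply_model(check_model)` presumably keeps the test valid regardless of where the V1 engine places the model, at the cost of requiring the pickle-based serialization enabled at the top of the test.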

tests/quantization/test_lm_head.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ def test_lm_head(
     lm_head_quantized: bool,
     monkeypatch,
 ) -> None:
-    # vllm_runner.apply_model() relies on V0 internals.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
+    # `LLM.apply_model` requires pickling a function.
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_id, dtype=torch.float16,
                      max_model_len=2048) as vllm_model:
 
tests/quantization/test_modelopt.py

Lines changed: 3 additions & 7 deletions
@@ -11,16 +11,12 @@
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
-from vllm.platforms import current_platform
 
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This module relies on V0 internals, so set VLLM_USE_V1=0.
-    """
-    if not current_platform.is_cpu():
-        monkeypatch.setenv('VLLM_USE_V1', '0')
+def enable_pickle(monkeypatch):
+    """`LLM.apply_model` requires pickling a function."""
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
 
 @pytest.mark.skipif(not is_quant_method_supported("modelopt"),
