| 31 | 31 | @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) | 
| 32 | 32 | def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, | 
| 33 | 33 |                            monkeypatch): | 
| 34 |  | -    # vllm_runner.apply_model() relies on V0 internals. | 
| 35 |  | -    monkeypatch.setenv("VLLM_USE_V1", "0") | 
| 36 |  | - | 
| 37 |  | -    vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) | 
|  | 34 | +    # `LLM.apply_model` requires pickling a function. | 
|  | 35 | +    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") | 
| 38 | 36 | 
| 39 | 37 |     linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( | 
| 40 | 38 |         GPTQLinearMethod) | 
| 41 | 39 | 
| 42 |  | -    for name, submodule in (vllm_model.llm.llm_engine.model_executor. | 
| 43 |  | -                            driver_worker.model_runner.model.named_modules()): | 
| 44 |  | -        if name == "lm_head": | 
| 45 |  | -            assert isinstance(submodule.quant_method, linear_method_cls) | 
| 46 |  | -        elif name == 'model.layers.0.self_attn.qkv_proj': | 
| 47 |  | -            # The first layer is quantized using bits=4, group_size=128 | 
| 48 |  | -            # desc_act=True | 
| 49 |  | -            assert isinstance(submodule.quant_method, linear_method_cls) | 
| 50 |  | -            config = submodule.quant_method.quant_config | 
| 51 |  | -            assert config.weight_bits == 4 | 
| 52 |  | -            assert config.group_size == 128 | 
| 53 |  | -            assert config.desc_act | 
| 54 |  | -        elif name == 'model.layers.1.self_attn.qkv_proj': | 
| 55 |  | -            # The second layer is quantized using bits=8, group_size=32 | 
| 56 |  | -            # desc_act=False | 
| 57 |  | -            assert isinstance(submodule.quant_method, linear_method_cls) | 
| 58 |  | -            config = submodule.quant_method.quant_config | 
| 59 |  | -            assert get_dynamic_override(config, layer_name=name, | 
| 60 |  | -                                        key="bits") == 8 | 
| 61 |  | -            assert get_dynamic_override(config, | 
| 62 |  | -                                        layer_name=name, | 
| 63 |  | -                                        key="group_size") == 32 | 
| 64 |  | -            assert not get_dynamic_override( | 
| 65 |  | -                config, layer_name=name, key="desc_act") | 
| 66 |  | -        elif (name == 'model.layers.2.self_attn.qkv_proj' | 
| 67 |  | -              or name == 'model.layers.2.mlp.gate_up_proj'): | 
| 68 |  | -            # All other layers (layer index >= 2) are not quantized | 
| 69 |  | -            assert isinstance(submodule.quant_method, UnquantizedLinearMethod) | 
|  | 40 | +    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: | 
|  | 41 | + | 
|  | 42 | +        def check_model(model): | 
|  | 43 | +            for name, submodule in model.named_modules(): | 
|  | 44 | +                if name == "lm_head": | 
|  | 45 | +                    assert isinstance(submodule.quant_method, | 
|  | 46 | +                                      linear_method_cls) | 
|  | 47 | +                elif name == 'model.layers.0.self_attn.qkv_proj': | 
|  | 48 | +                    # The first layer is quantized using bits=4, group_size=128 | 
|  | 49 | +                    # desc_act=True | 
|  | 50 | +                    assert isinstance(submodule.quant_method, | 
|  | 51 | +                                      linear_method_cls) | 
|  | 52 | +                    config = submodule.quant_method.quant_config | 
|  | 53 | +                    assert config.weight_bits == 4 | 
|  | 54 | +                    assert config.group_size == 128 | 
|  | 55 | +                    assert config.desc_act | 
|  | 56 | +                elif name == 'model.layers.1.self_attn.qkv_proj': | 
|  | 57 | +                    # The second layer is quantized using bits=8, group_size=32 | 
|  | 58 | +                    # desc_act=False | 
|  | 59 | +                    assert isinstance(submodule.quant_method, | 
|  | 60 | +                                      linear_method_cls) | 
|  | 61 | +                    config = submodule.quant_method.quant_config | 
|  | 62 | +                    assert get_dynamic_override(config, | 
|  | 63 | +                                                layer_name=name, | 
|  | 64 | +                                                key="bits") == 8 | 
|  | 65 | +                    assert get_dynamic_override(config, | 
|  | 66 | +                                                layer_name=name, | 
|  | 67 | +                                                key="group_size") == 32 | 
|  | 68 | +                    assert not get_dynamic_override( | 
|  | 69 | +                        config, layer_name=name, key="desc_act") | 
|  | 70 | +                elif (name == 'model.layers.2.self_attn.qkv_proj' | 
|  | 71 | +                      or name == 'model.layers.2.mlp.gate_up_proj'): | 
|  | 72 | +                    # All other layers (layer index >= 2) are not quantized | 
|  | 73 | +                    assert isinstance(submodule.quant_method, | 
|  | 74 | +                                      UnquantizedLinearMethod) | 
| 70 | 75 | 
| 71 |  | -    del vllm_model | 
|  | 76 | +        llm.apply_model(check_model) | 
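
For context on the change above: `LLM.apply_model` ships the callback to the worker process that actually owns the model (hence the pickling noted in the new comment), so the test opts into `VLLM_ALLOW_INSECURE_SERIALIZATION` instead of reaching into V0 engine internals as the deleted lines did. A minimal standalone sketch of the same pattern, assuming a small placeholder model id rather than the one this test parametrizes over:

```python
import os

# `apply_model` pickles the callback, which vLLM gates behind this flag.
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

from vllm import LLM


def list_quant_methods(model):
    # Runs in the worker that owns the model; any module attribute can be
    # inspected here, e.g. the per-layer `quant_method` this test asserts on.
    return {
        name: type(module.quant_method).__name__
        for name, module in model.named_modules()
        if getattr(module, "quant_method", None) is not None
    }


llm = LLM(model="facebook/opt-125m", dtype="float16", max_model_len=2048)
print(llm.apply_model(list_quant_methods))  # one result per worker
```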
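The per-layer assertions mirror GPTQModel-style `dynamic` overrides, which `get_dynamic_override(config, layer_name, key)` resolves by matching the layer name against regex keys in the quantization config. A hedged sketch of a `dynamic` map that could produce the layout asserted above; the patterns are illustrative, not copied from the test checkpoint, and they assume GPTQModel's convention that a `+:` prefix applies an override while `-:` excludes a module from quantization:

```python
# Base config: bits=4, group_size=128, desc_act=True (what layer 0 keeps).
dynamic = {
    # Layer 1: override to 8-bit, group_size 32, desc_act off.
    r"+:.*\.layers\.1\..*": {"bits": 8, "group_size": 32, "desc_act": False},
    # Layers 2 and up: skip quantization (falls back to
    # UnquantizedLinearMethod, as the test checks for layer index >= 2).
    r"-:.*\.layers\.([2-9]|[1-9][0-9]+)\..*": {},
}
```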