Commit f35263f

Merge pull request vllm-project#25 from slyalin/disable_int8
Disable weight compression on optimum-intel conversion path
2 parents f73cfd2 + 02a108a

2 files changed: 11 additions, 1 deletion

use_with_openvino.md

Lines changed: 9 additions & 1 deletion
````diff
@@ -52,7 +52,7 @@ python3 benchmark_serving.py --backend openai --endpoint /v1/completions --port
 ```
 
 
-## Use vLLM offline
+## Use vLLM offline
 
 _All below steps assume you are in `vllm` root directory._
 
@@ -82,3 +82,11 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach
 # --num-prompts <number of requests to send> (default: 1000)
 # --swap-space <GiB for KV cache> (default: 50)
 ```
+
+## Use Int-8 Weights Compression
+
+Int-8 weight compression is disabled by default. For better performance and lower memory consumption, weight compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
+To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` commands in the examples above.
+
+The variable enables the weight compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
+Hence, even when the variable is set, compression is applied only to models above a certain size; smaller models are left uncompressed to avoid a significant accuracy drop.
````
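For context on how this doc change is meant to be used offline: below is a minimal sketch, assuming a vLLM build with the OpenVINO backend as described in `use_with_openvino.md`; the model id and generation settings are illustrative and not part of this commit.

```python
import os

# The variable must be set before the model is loaded, because the
# compression decision is made during optimum-intel model conversion.
os.environ["VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"] = "1"

from vllm import LLM, SamplingParams  # assumes the OpenVINO-enabled build

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # illustrative model id
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```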

vllm/model_executor/openvino_model_loader.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -599,10 +599,12 @@ def get_model(model_config: ModelConfig,
     else:
         print(f'[ INFO ] OpenVINO IR is available for provided model id {model_config.model}. '
               'This IR will be used for inference as-is, all possible options that may affect model conversion are ignored.')
+    load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False
     pt_model = OVModelForCausalLM.from_pretrained(
         model_config.model,
         export=export,
         compile=False,
+        load_in_8bit=load_in_8bit,
         trust_remote_code=model_config.trust_remote_code
     )
     patch_stateful_model(pt_model.model, kv_cache_dtype, device_config.device.type == "cpu")
```
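Note the tri-state here: `load_in_8bit=None` defers to optimum-intel's own default, which compresses only models above its internal size threshold, while `False` disables compression unconditionally; that is why the enabled variable maps to `None` rather than `True`. A standalone sketch of the equivalent optimum-intel call, with a placeholder model id:

```python
from optimum.intel import OVModelForCausalLM

# None  -> optimum-intel decides: int-8 weight compression is applied
#          only to models above its internal size threshold.
# False -> compression is disabled outright (the default in this commit).
model = OVModelForCausalLM.from_pretrained(
    "gpt2",             # placeholder model id
    export=True,        # convert from the original checkpoint
    compile=False,
    load_in_8bit=None,  # as when VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1
)
```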
