From c1478588850c0cc68ce25c07aaab8946b7788a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=91=A3=E6=99=93=E9=BE=99?=
Date: Fri, 22 Sep 2023 11:05:47 +0800
Subject: [PATCH 1/2] add vllm awq quantization

---
 fastchat/serve/vllm_worker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 8e255b79c..2fe8e6304 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -210,6 +210,8 @@ async def api_model_details(request: Request):
         args.model = args.model_path
     if args.num_gpus > 1:
         args.tensor_parallel_size = args.num_gpus
+    if args.quantization:
+        args.quantization = args.quantization
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)

From 5baeb38bb75357a43222a323e2ee33abf2bcf62b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=91=A3=E6=99=93=E9=BE=99?=
Date: Fri, 22 Sep 2023 11:15:54 +0800
Subject: [PATCH 2/2] vllm docs update

---
 docs/vllm_integration.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/vllm_integration.md b/docs/vllm_integration.md
index 1886b1009..021fc3853 100644
--- a/docs/vllm_integration.md
+++ b/docs/vllm_integration.md
@@ -18,3 +18,8 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup
 ```
 python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer
 ```
+
+If you use an AWQ quantized model, try
+```
+python3 -m fastchat.serve.vllm_worker --model-path TheBloke/vicuna-7B-v1.5-AWQ --quantization awq
+```