From 68b168f297f0849f96bc03647c738b4f8d4f72ad Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Sat, 16 Nov 2024 11:59:41 +0800 Subject: [PATCH 01/10] mod vllm generate --- gptqmodel/models/base.py | 10 +++++-- gptqmodel/utils/vllm.py | 56 +++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index dd19f40c3..6c5ee0347 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -600,13 +600,19 @@ def tmp(_, inp, out): @property def device(self): if not self.hf_device_map: - return self.model.device + if hasattr(self.model, "device"): + return self.model.device + else: + return torch.device("cuda" if torch.cuda.is_available() else "cpu") else: device = [d for d in self.hf_device_map.values() if d not in {"disk"}][0] return torch.device(device) def to(self, device: Union[str, torch.device]): - self.model.to(device) + if hasattr(self.model, "to"): + self.model.to(device) + else: + logger.warning(f"{self.model.__class__.__name__} does not support the to() method") return self def forward(self, *args, **kwargs): diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index 8cad116d1..2f8710e3f 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -4,6 +4,7 @@ try: from vllm import LLM, SamplingParams + VLLM_AVAILABLE = True except ImportError: VLLM_AVAILABLE = False @@ -22,15 +23,16 @@ def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): 'temperature': hf_params.get('temperature', 1.0), 'top_k': hf_params.get('top_k', -1), 'top_p': hf_params.get('top_p', 1.0), - 'max_tokens': hf_params.get('max_length', 16), + 'max_tokens': hf_params.get('max_length', 2048), 'min_tokens': hf_params.get('min_length', 0), 'stop_token_ids': [hf_params.get('eos_token_id'), None], } return SamplingParams(**params) + def load_model_by_vllm( - model, - **kwargs, + model, + **kwargs, ): if not VLLM_AVAILABLE: raise ValueError(VLLM_INSTALL_HINT) @@ -42,23 +44,41 @@ def load_model_by_vllm( return model -@torch.inference_mode -def vllm_generate( - model, - **kwargs, -): + +@torch.inference_mode() +def vllm_generate(model, **kwargs): if not VLLM_AVAILABLE: raise ValueError(VLLM_INSTALL_HINT) - prompts = kwargs.pop("prompts", None) - sampling_params = kwargs.pop("sampling_params", None) + # Extract and validate prompts + prompts = kwargs.pop("prompts", None) or kwargs.pop("input_ids", None) + if prompts is None: + raise ValueError("Either prompts or input_ids must be provided") + sampling_params = kwargs.get("sampling_params") if not isinstance(sampling_params, SamplingParams): - hf_params = {key: kwargs[key] for key in [ - 'num_return_sequences', 'repetition_penalty', 'temperature', - 'top_k', 'top_p', 'max_length', 'min_length', 'eos_token_id' - ] if key in kwargs} - sampling_params = convert_hf_params_to_vllm(hf_params) - - outputs = model.generate(prompts, sampling_params) - return outputs + hf_params = { + key: kwargs.get(key) for key in [ + 'num_return_sequences', 'repetition_penalty', 'temperature', + 'top_k', 'top_p', 'max_length', 'min_length', 'eos_token_id' + ] + } + sampling_params = convert_hf_params_to_vllm({k: v for k, v in hf_params.items() if v is not None}) + + # Convert prompts to vLLM format + if isinstance(prompts, (list, torch.Tensor)): + token_ids = prompts.tolist() if isinstance(prompts, torch.Tensor) else prompts + req_results = model.generate(prompt_token_ids=token_ids, sampling_params=sampling_params) + else: + req_results = 
model.generate(prompts=prompts, sampling_params=sampling_params) + + outputs = [] + for result in req_results: + combined_token_ids = result.prompt_token_ids + list(result.outputs[0].token_ids) + outputs.append(combined_token_ids) + + pad_token_id = model.get_tokenizer().pad_token_id + max_length = max(len(sublist) for sublist in outputs) + padded_list = [sublist + [pad_token_id] * (max_length - len(sublist)) for sublist in outputs] + + return torch.Tensor(padded_list).to(torch.uint32) From e3668f345037b0c6bc607aa529ef9ac70a09a891 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 14:18:55 +0800 Subject: [PATCH 02/10] fix vllm get device --- gptqmodel/models/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 6c9a9c46a..cf31d7fda 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -705,6 +705,8 @@ def device(self): if not self.hf_device_map: if hasattr(self.model, "device"): return self.model.device + elif hasattr(self.model, "llm_engine"): + return self.model.llm_engine.device_config.device_type else: return torch.device("cuda" if torch.cuda.is_available() else "cpu") else: From 7bd70b5b1fa4ddbf6bdda7b5cdf463dfb157a7d6 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 15:20:40 +0800 Subject: [PATCH 03/10] fix vllm/gptq_marlin --- gptqmodel/integration/src/vllm/gptq_marlin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/integration/src/vllm/gptq_marlin.py b/gptqmodel/integration/src/vllm/gptq_marlin.py index dfbaa8f95..06e2c7eda 100644 --- a/gptqmodel/integration/src/vllm/gptq_marlin.py +++ b/gptqmodel/integration/src/vllm/gptq_marlin.py @@ -72,7 +72,7 @@ def __init__( def update_config(self, prefix: str): bits = self.weight_bits # check for variable/dynamic config - if len(self.dynamic) > 0 and prefix: + if self.dynamic and len(self.dynamic) > 0 and prefix: bits = self.dynamic_get(prefix, "bits", bits) self.group_size = self.dynamic_get(prefix, "group_size", self.group_size) self.desc_act = self.dynamic_get(prefix, "actorder", self.desc_act) @@ -156,7 +156,7 @@ def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int def get_quant_method(self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: - if self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 return UnquantizedLinearMethod() if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) From 6046de7d74f82e1a6205243e0913a6335f7c538a Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:28:29 +0800 Subject: [PATCH 04/10] fix vllm generate --- gptqmodel/utils/vllm.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index 2f8710e3f..da2669a23 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -25,7 +25,7 @@ def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): 'top_p': hf_params.get('top_p', 1.0), 'max_tokens': hf_params.get('max_length', 2048), 'min_tokens': hf_params.get('min_length', 0), - 'stop_token_ids': [hf_params.get('eos_token_id'), None], + 'stop_token_ids': [hf_params.get('eos_token_id'), [None, None]], } return SamplingParams(**params) @@ -66,11 +66,17 @@ def vllm_generate(model, **kwargs): sampling_params = convert_hf_params_to_vllm({k: v for k, 
v in hf_params.items() if v is not None}) # Convert prompts to vLLM format - if isinstance(prompts, (list, torch.Tensor)): - token_ids = prompts.tolist() if isinstance(prompts, torch.Tensor) else prompts - req_results = model.generate(prompt_token_ids=token_ids, sampling_params=sampling_params) - else: + if isinstance(prompts, torch.Tensor): + req_results = model.generate(prompt_token_ids=prompts.tolist(), sampling_params=sampling_params) + elif isinstance(prompts, list): + if isinstance(prompts[0], list) or isinstance(prompts[0], int): + req_results = model.generate(prompt_token_ids=prompts, sampling_params=sampling_params) + else: + req_results = model.generate(prompts=prompts, sampling_params=sampling_params) + elif isinstance(prompts, str): req_results = model.generate(prompts=prompts, sampling_params=sampling_params) + else: + raise ValueError(f"Invalid input type for vllm_generate, type is {type(prompts)}") outputs = [] for result in req_results: @@ -78,6 +84,8 @@ def vllm_generate(model, **kwargs): outputs.append(combined_token_ids) pad_token_id = model.get_tokenizer().pad_token_id + if pad_token_id is None: + pad_token_id = model.get_tokenizer().eos_token_id max_length = max(len(sublist) for sublist in outputs) padded_list = [sublist + [pad_token_id] * (max_length - len(sublist)) for sublist in outputs] From 85a1d13810a25f20b77b89c00156b6bace2b20f5 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:35:37 +0800 Subject: [PATCH 05/10] fix test_vllm --- gptqmodel/utils/vllm.py | 3 +-- tests/test_vllm.py | 48 +++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index da2669a23..21c90b29f 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -11,7 +11,6 @@ VLLM_INSTALL_HINT = "vLLM not installed. Please install via `pip install -U vllm`." 
- # returns SamplingParams but we can't use this typehint since vLLM is optional depend def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): if not VLLM_AVAILABLE: @@ -25,7 +24,7 @@ def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): 'top_p': hf_params.get('top_p', 1.0), 'max_tokens': hf_params.get('max_length', 2048), 'min_tokens': hf_params.get('min_length', 0), - 'stop_token_ids': [hf_params.get('eos_token_id'), [None, None]], + 'stop_token_ids': [hf_params.get('eos_token_id'), []], } return SamplingParams(**params) diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 1ca51acd8..84521783a 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -52,25 +52,27 @@ def test_load_vllm(self): backend=BACKEND.VLLM, gpu_memory_utilization=0.2, ) + + tokenizer = model.get_tokenizer() + outputs = model.generate( prompts=self.prompts, sampling_params=self.sampling_params, ) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, " Paris, which is also the capital of France.") - outputs_param = model.generate( + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] + print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") + self.assertEquals(generated_text, " Paris, which is also the capital of France.") + + outputs = model.generate( prompts=self.prompts, temperature=0.8, top_p=0.95, ) - for output in outputs_param: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, " ___________.\n6. City Name: Paris, France\n7. C") + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] + print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") + self.assertEquals(generated_text, " ___________.\n6. 
City Name: Paris, Franc[4467 chars]ial?") del model self.release_vllm_model() @@ -82,17 +84,16 @@ def test_load_shared_vllm(self): backend=BACKEND.VLLM, gpu_memory_utilization=0.2, ) + tokenizer = model.get_tokenizer() outputs = model.generate( prompts=self.prompts, temperature=0.8, top_p=0.95, ) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, - " Paris, which is also known as the city of love.") + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] + print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") + self.assertEquals(generated_text, " Paris, which is also known as the city of love.") del model self.release_vllm_model() @@ -140,6 +141,8 @@ def test_dynamic(self): gpu_memory_utilization=0.2, ) + tokenizer = model.get_tokenizer() + for name, submodule in model.named_modules(): if name == 'model.model.layers.0.self_attn.q_proj' and isinstance(submodule, BaseQuantLinear): # module 0 was skipped @@ -150,12 +153,11 @@ def test_dynamic(self): temperature=0.8, top_p=0.95, ) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, - " Paris, which is also the country's largest city.") + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] + print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") + self.assertEquals(generated_text, + " Paris, which is also the country's largest city.") del model self.release_vllm_model() From d6b2d84886bb1a74650b601329c0bce10086842c Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:40:08 +0800 Subject: [PATCH 06/10] cleanup --- gptqmodel/utils/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index 21c90b29f..79cdab0d7 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -24,7 +24,7 @@ def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): 'top_p': hf_params.get('top_p', 1.0), 'max_tokens': hf_params.get('max_length', 2048), 'min_tokens': hf_params.get('min_length', 0), - 'stop_token_ids': [hf_params.get('eos_token_id'), []], + 'stop_token_ids': [hf_params.get('eos_token_id'), None], } return SamplingParams(**params) From afb7debee2bb2496585978b89fdfda6f032438b6 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:50:03 +0800 Subject: [PATCH 07/10] set max_tokens --- tests/test_vllm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 84521783a..cd4090418 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -36,7 +36,7 @@ def setUpClass(self): self.prompts = [ "The capital of France is", ] - self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16) def release_vllm_model(self): from vllm.distributed.parallel_state import destroy_model_parallel # noqa: E402 @@ -68,11 +68,12 @@ def test_load_vllm(self): prompts=self.prompts, temperature=0.8, top_p=0.95, + max_tokens=16, ) generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") - 
self.assertEquals(generated_text, " ___________.\n6. City Name: Paris, Franc[4467 chars]ial?") + self.assertEquals(generated_text, " ___________.\n6. City Name: Paris, France\n7. C") del model self.release_vllm_model() From 7ef8ebc1b90fdb2fdd23b6b3bab3ed69f509e108 Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:51:51 +0800 Subject: [PATCH 08/10] cleanup --- tests/test_vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_vllm.py b/tests/test_vllm.py index cd4090418..9aeb4d740 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -68,7 +68,7 @@ def test_load_vllm(self): prompts=self.prompts, temperature=0.8, top_p=0.95, - max_tokens=16, + max_length=16, ) generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] From 1533f7724114aeb89e7e943642b5db17403f7a1f Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 16:55:07 +0800 Subject: [PATCH 09/10] fix vllm samplingparams --- gptqmodel/utils/vllm.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index 79cdab0d7..9d0d47d75 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -15,18 +15,33 @@ def convert_hf_params_to_vllm(hf_params: Dict[str, Any]): if not VLLM_AVAILABLE: raise ValueError(VLLM_INSTALL_HINT) + sampling_params = SamplingParams() - params = { - 'n': hf_params.get('num_return_sequences', 1), - 'repetition_penalty': hf_params.get('repetition_penalty', 1.0), - 'temperature': hf_params.get('temperature', 1.0), - 'top_k': hf_params.get('top_k', -1), - 'top_p': hf_params.get('top_p', 1.0), - 'max_tokens': hf_params.get('max_length', 2048), - 'min_tokens': hf_params.get('min_length', 0), - 'stop_token_ids': [hf_params.get('eos_token_id'), None], - } - return SamplingParams(**params) + if hf_params.get('num_return_sequences', None): + sampling_params.n = hf_params.get('num_return_sequences') + + if hf_params.get('repetition_penalty', None): + sampling_params.repetition_penalty = hf_params.get('repetition_penalty') + + if hf_params.get('temperature', None): + sampling_params.temperature = hf_params.get('temperature') + + if hf_params.get('top_k', None): + sampling_params.top_k = hf_params.get('top_k') + + if hf_params.get('top_p', None): + sampling_params.top_p = hf_params.get('top_p') + + if hf_params.get('max_length', None): + sampling_params.max_tokens = hf_params.get('max_length') + + if hf_params.get('min_length', None): + sampling_params.min_tokens = hf_params.get('min_length') + + if hf_params.get('eos_token_id', None): + sampling_params.stop_token_ids = [hf_params.get('eos_token_id'), None] + + return sampling_params def load_model_by_vllm( From 78e9243db3d21c6ad2ef800cdb4174256e57d27f Mon Sep 17 00:00:00 2001 From: LRL-ModelCloud Date: Fri, 13 Dec 2024 17:09:59 +0800 Subject: [PATCH 10/10] use top_k=1 --- tests/test_vllm.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 9aeb4d740..271a29c8b 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -25,7 +25,8 @@ class TestLoadVLLM(unittest.TestCase): @classmethod def setUpClass(self): if importlib.util.find_spec("flashinfer") is None: - subprocess.check_call([sys.executable, "-m", "pip", "install", "flashinfer", "-i", f"https://flashinfer.ai/whl/cu{torch.version.cuda.replace('.', '')}/torch{'.'.join(torch.__version__.split('.')[:2])}"]) + 
subprocess.check_call([sys.executable, "-m", "pip", "install", "flashinfer", "-i", + f"https://flashinfer.ai/whl/cu{torch.version.cuda.replace('.', '')}/torch{'.'.join(torch.__version__.split('.')[:2])}"]) if importlib.util.find_spec("vllm") is None: subprocess.check_call([sys.executable, "-m", "pip", "install", "vllm>=0.6.2"]) @@ -36,7 +37,7 @@ def setUpClass(self): self.prompts = [ "The capital of France is", ] - self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16) + self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16, top_k=1) def release_vllm_model(self): from vllm.distributed.parallel_state import destroy_model_parallel # noqa: E402 @@ -62,18 +63,19 @@ def test_load_vllm(self): generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, " Paris, which is also the capital of France.") + self.assertEquals(generated_text, " Paris.\n\n2. The capital of the United States is Washington, D") outputs = model.generate( prompts=self.prompts, temperature=0.8, top_p=0.95, max_length=16, + top_k=1, ) - + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(self.prompts[0]):] print(f"Prompt: {self.prompts!r}, Generated text: {generated_text!r}") - self.assertEquals(generated_text, " ___________.\n6. City Name: Paris, France\n7. C") + self.assertEquals(generated_text, " Paris.\n\n2. The capital of the United States is Washington, D") del model self.release_vllm_model()
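
Reviewer note: after this series, `vllm_generate` returns a padded tensor of token ids (prompt + completion) instead of vLLM `RequestOutput` objects, so callers decode with the model's tokenizer and strip the prompt prefix, as the updated `tests/test_vllm.py` does. A minimal usage sketch under that contract — the checkpoint id below is a placeholder, and the call shape assumes the package's top-level `GPTQModel` / `BACKEND` exports:

```python
from gptqmodel import BACKEND, GPTQModel

# Placeholder checkpoint id; substitute any GPTQ-quantized model supported by vLLM.
model = GPTQModel.load(
    "ModelCloud/placeholder-gptq-model",
    backend=BACKEND.VLLM,
    gpu_memory_utilization=0.2,
)
tokenizer = model.get_tokenizer()

prompts = ["The capital of France is"]

# HF-style kwargs are converted to vLLM SamplingParams internally (PATCH 09);
# top_k=1 makes decoding greedy, matching the deterministic asserts in PATCH 10.
outputs = model.generate(
    prompts=prompts,
    temperature=0.8,
    top_p=0.95,
    max_length=16,
    top_k=1,
)

# outputs is a padded token-id tensor; row 0 holds prompt + generated tokens.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(prompts[0]):]
print(text)
```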
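
Also worth noting for review: with PATCH 09, `convert_hf_params_to_vllm` only copies the fields that are actually present in the HF kwargs onto `SamplingParams`, so anything unspecified keeps vLLM's own defaults. A quick sketch of that behavior (requires vLLM installed; the `eos_token_id` value is illustrative):

```python
from gptqmodel.utils.vllm import convert_hf_params_to_vllm

# Only the keys supplied here override SamplingParams; unspecified HF params
# (e.g. top_k, repetition_penalty) keep vLLM's defaults.
sp = convert_hf_params_to_vllm({
    "temperature": 0.8,
    "top_p": 0.95,
    "max_length": 16,   # mapped to SamplingParams.max_tokens
    "eos_token_id": 2,  # illustrative id; mapped into stop_token_ids
})
print(sp.temperature, sp.top_p, sp.max_tokens, sp.stop_token_ids)
```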