From 28201225ce7b183268aec82f4ee9e64d60891163 Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 10:26:45 +0800
Subject: [PATCH 1/7] ENH: make deepseek_vl support streaming output

---
 xinference/model/llm/pytorch/deepseek_vl.py | 113 +++++++++++++++-----
 1 file changed, 84 insertions(+), 29 deletions(-)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index 0dd9bb5e79..20eda4e719 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -27,9 +27,11 @@
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -149,10 +151,11 @@ def chat(
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -185,6 +188,7 @@ def chat(
         deepseek_history.extend(prompt_messages)
 
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
+        from ....thirdparty.deepseek_vl.serve.inference import generate
 
         # load images and prepare for inputs
         pil_images = load_pil_images(deepseek_history)
@@ -192,37 +196,48 @@ def chat(
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]
 
-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words
         )
 
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str)->Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
+                CompletionChoice(
                     index=0,
-                    message={"role": "assistant", "content": answer},
+                    text=generated_text,
                     finish_reason="stop",
                 )
             ],
@@ -230,3 +245,43 @@ def chat(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str)-> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
\ No newline at end of file

From fd0eb1c6ea089dc961381699a60f86809e7e9932 Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 10:43:37 +0800
Subject: [PATCH 2/7] correct format

---
 xinference/model/llm/pytorch/deepseek_vl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index 20eda4e719..bd6d4670d1 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -223,7 +223,7 @@ def chat(
             c = self._generate(streamer, stop_str)
             return self._to_chat_completion(c)
 
-    def _generate(self, streamer, stop_str)->Completion:
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             if new_text.endswith(stop_str):
@@ -247,7 +247,7 @@ def _generate(self, streamer, stop_str) -> Completion:
         )
         return c
 
-    def _generate_stream(self, streamer, stop_str)-> Iterator[CompletionChunk]:
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for i, new_text in enumerate(streamer):
             if new_text.endswith(stop_str):

From 32a64d6b3d2690e7bef28afed9a03ce0a8f22103 Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 10:46:28 +0800
Subject: [PATCH 3/7] black reformat

---
 xinference/model/llm/pytorch/deepseek_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index bd6d4670d1..8eb25834a8 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -213,7 +213,7 @@ def chat(
             temperature=temperature,
             repetition_penalty=repetition_penalty,
             top_p=top_p,
-            stop_words=stop_words
+            stop_words=stop_words,
         )
 
         if stream:

From 54a88c2d90f5a862cb8e784c94eaae8118b5cdc3 Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 10:51:57 +0800
Subject: [PATCH 4/7] isort reformat

---
 xinference/model/llm/pytorch/deepseek_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index 8eb25834a8..c3fd015696 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -187,8 +187,8 @@ def chat(
 
         deepseek_history.extend(prompt_messages)
 
-        from ....thirdparty.deepseek_vl.utils.io import load_pil_images
         from ....thirdparty.deepseek_vl.serve.inference import generate
+        from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs
         pil_images = load_pil_images(deepseek_history)

From 0d8405e06088645d3ebcb976ae40f7d9c2ce555e Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 11:04:01 +0800
Subject: [PATCH 5/7] fix missing key

---
 xinference/model/llm/pytorch/deepseek_vl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index c3fd015696..d0e79f6c41 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -232,6 +232,7 @@ def _generate(self, streamer, stop_str) -> Completion:
 
         c = Completion(
             id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
@@ -257,6 +258,7 @@ def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
             )
             chunk = CompletionChunk(
                 id=completion_id,
+                object="text_completion",
                 created=int(time.time()),
                 model=self.model_uid,
                 choices=[completion_choice],
@@ -274,6 +276,7 @@ def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         )
         chunk = CompletionChunk(
             id=completion_id,
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[completion_choice],

From eb55b54e2290883e27b25b5662a556fd7820a05d Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 11:06:25 +0800
Subject: [PATCH 6/7] fix missing key

---
 xinference/model/llm/pytorch/deepseek_vl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index d0e79f6c41..b918fb68b5 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -240,6 +240,7 @@ def _generate(self, streamer, stop_str) -> Completion:
                     index=0,
                     text=generated_text,
                     finish_reason="stop",
+                    logprobs=None
                 )
             ],
             usage=CompletionUsage(

From 13134d8d6f56b131b6112d836a0c087b8bb90b2f Mon Sep 17 00:00:00 2001
From: Minamiyama
Date: Wed, 8 May 2024 11:08:06 +0800
Subject: [PATCH 7/7] black reformat

---
 xinference/model/llm/pytorch/deepseek_vl.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/xinference/model/llm/pytorch/deepseek_vl.py b/xinference/model/llm/pytorch/deepseek_vl.py
index b918fb68b5..bbe98a97e0 100644
--- a/xinference/model/llm/pytorch/deepseek_vl.py
+++ b/xinference/model/llm/pytorch/deepseek_vl.py
@@ -237,10 +237,7 @@ def _generate(self, streamer, stop_str) -> Completion:
             model=self.model_uid,
             choices=[
                 CompletionChoice(
-                    index=0,
-                    text=generated_text,
-                    finish_reason="stop",
-                    logprobs=None
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(