[Frontend] API support for beam search for MQLLMEngine #9117

Merged (13 commits) on Oct 8, 2024
43 changes: 19 additions & 24 deletions tests/entrypoints/openai/test_completion.py
@@ -495,30 +495,25 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text

try:
# test n = 2
batch = await client.completions.create(
model=model_name,
prompt=prompts,
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but
# not necessary for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"
except BadRequestError as e:
# the only allowed exception is when beam search is not supported
# in the default mqllmengine
assert "--disable-frontend-multiprocessing" in str(e)
# test n = 2
batch = await client.completions.create(
model=model_name,
prompt=prompts,
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but
# not necessary for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"

# test streaming
batch = await client.completions.create(
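With this PR, beam search no longer fails under the default multiprocessing frontend, so the test above drops the BadRequestError fallback and asserts on the beam-search output directly. For reference, a minimal standalone sketch of the same request against a running server (the base_url, api_key, and model name are placeholder assumptions, not values from this PR):

# --- illustrative sketch, not part of the PR ---
import asyncio

import openai

client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",  # assumed server address
                            api_key="EMPTY")


async def main() -> None:
    batch = await client.completions.create(
        model="my-model",  # hypothetical model name
        prompt=["Hello, my name is", "Hello, my name is"],
        n=2,
        max_tokens=5,
        temperature=0.0,
        # vLLM-specific extra field, exercised by the test above.
        extra_body=dict(use_beam_search=True),
    )
    # Two prompts x n=2 beams -> four choices, in the order the test asserts.
    for i, choice in enumerate(batch.choices):
        print(i, repr(choice.text))


asyncio.run(main())
# --- end sketch ---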
116 changes: 110 additions & 6 deletions vllm/engine/multiprocessing/client.py
@@ -2,8 +2,8 @@
import copy
import pickle
from contextlib import contextmanager, suppress
from typing import (Any, AsyncGenerator, Dict, Iterator, Mapping, Optional,
Union, overload)
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
Optional, Union, overload)

import cloudpickle
import zmq
@@ -26,15 +26,18 @@
RPCStartupRequest, RPCStartupResponse,
RPCUProfileRequest)
# yapf: enable
from vllm.entrypoints.llm import BeamSearchSequence
from vllm.envs import VLLM_RPC_TIMEOUT
from vllm.inputs import PromptType
from vllm.inputs import PromptType, TokensPrompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.outputs import (CompletionOutput, EmbeddingRequestOutput,
RequestOutput)
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import deprecate_kwargs
from vllm.utils import (collect_from_async_generator, deprecate_kwargs,
get_beam_search_score, random_uuid)

logger = init_logger(__name__)

@@ -441,6 +444,107 @@ def generate(
lora_request, trace_headers,
prompt_adapter_request, priority)

async def beam_search(
self,
prompt: Union[PromptType, List[int]],
request_id: str,
params: BeamSearchParams,
lora_request: Optional[LoRARequest] = None
) -> AsyncGenerator[RequestOutput, None]:

beam_width = params.beam_width
max_tokens = params.max_tokens
ignore_eos = params.ignore_eos
temperature = params.temperature
length_penalty = params.length_penalty

def sort_beams_key(x: BeamSearchSequence) -> float:
return get_beam_search_score(x.tokens, x.cum_logprob,
tokenizer.eos_token_id,
length_penalty)

tokenizer = await self.get_tokenizer(lora_request)
tokenizedPrompt = prompt if isinstance(
prompt, list) else tokenizer.encode(prompt)
tokenizedLength = len(tokenizedPrompt)

beam_search_params = SamplingParams(logprobs=2 * beam_width,
max_tokens=1,
temperature=temperature)
all_beams = [BeamSearchSequence(tokens=tokenizedPrompt, cum_logprob=0)]
completed = []

for _ in range(max_tokens):
prompts_batch = [
TokensPrompt(prompt_token_ids=beam.tokens)
for beam in all_beams
]

tasks = []

request_id = f"beam_search-{random_uuid()}"
for i, individual_prompt in enumerate(prompts_batch):
request_id_item = f"{request_id}-{i}"
task = asyncio.create_task(
collect_from_async_generator(
self.generate(individual_prompt, beam_search_params,
request_id_item)))
tasks.append(task)

output = await asyncio.gather(*tasks)

output = [x[0] for x in output]

logger.info(output)

new_beams = []
for i, current_beam in enumerate(all_beams):
result = output[i]

if result.outputs[0].logprobs is not None:
logprobs = result.outputs[0].logprobs[0]
for token_id, logprob_obj in logprobs.items():
new_beam = BeamSearchSequence(
tokens=current_beam.tokens + [token_id],
cum_logprob=current_beam.cum_logprob +
logprob_obj.logprob)

if token_id == tokenizer.eos_token_id and \
not ignore_eos:
completed.append(new_beam)
else:
new_beams.append(new_beam)

sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True)
all_beams = sorted_beams[:beam_width]

completed.extend(all_beams)
sorted_completed = sorted(completed, key=sort_beams_key, reverse=True)
best_beams = sorted_completed[:beam_width]

for beam in best_beams:
beam.text = tokenizer.decode(beam.tokens[tokenizedLength:])

beam_search_output = RequestOutput(
request_id=request_id,
prompt=prompt,
outputs=[
CompletionOutput(
text=beam.text,
cumulative_logprob=beam.cum_logprob,
token_ids=beam.tokens,
index=i,
logprobs=beam.cum_logprob,
) for (i, beam) in enumerate(best_beams)
],
finished=True,
prompt_token_ids=tokenizedPrompt,
prompt_logprobs=None)

logger.info(beam_search_output)

yield beam_search_output

@overload # DEPRECATED
def encode(
self,
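The new MQLLMEngineClient.beam_search above runs beam search on top of the existing generate path: each step expands every live beam with a SamplingParams(logprobs=2 * beam_width, max_tokens=1) request, ranks candidates with get_beam_search_score (a length-penalized cumulative log-probability), moves beams that hit EOS to the completed list unless ignore_eos is set, and finally yields a single finished RequestOutput holding the top beam_width completions. A minimal consumption sketch follows (the helper name is illustrative; BeamSearchParams field names follow how they are read at the top of the method):

# --- illustrative sketch, not part of the PR ---
from vllm.sampling_params import BeamSearchParams
from vllm.utils import random_uuid


async def beam_search_demo(client, prompt: str) -> None:
    # `client` is assumed to be an already-started MQLLMEngineClient.
    params = BeamSearchParams(
        beam_width=4,        # beams kept after each step
        max_tokens=32,       # maximum new tokens per beam
        ignore_eos=False,
        temperature=0.0,
        length_penalty=1.0,
    )
    # beam_search yields exactly one finished RequestOutput containing the
    # beam_width best completions.
    async for output in client.beam_search(prompt, random_uuid(), params):
        for completion in output.outputs:
            print(completion.index, repr(completion.text))
# --- end sketch ---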
21 changes: 16 additions & 5 deletions vllm/entrypoints/openai/serving_chat.py
@@ -10,6 +10,7 @@

from vllm.config import ModelConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (ConversationMessage,
apply_hf_chat_template,
@@ -236,15 +237,25 @@ async def create_chat_completion(
log_tracing_disabled_warning()

if isinstance(sampling_params, BeamSearchParams):
if not isinstance(self.engine_client, AsyncLLMEngine):
if isinstance(self.engine_client, AsyncLLMEngine):
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
request_id,
sampling_params,
)
elif isinstance(self.engine_client, MQLLMEngineClient):
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
request_id,
sampling_params,
lora_request,
)
else:
raise ValueError(
"Beam search in the API server is only supported with"
" AsyncLLMEngine. please add "
" AsyncLLMEngine and MQLLMEngineClient. please add "
"`--disable-frontend-multiprocessing` to "
"use beam search.")
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'], request_id,
sampling_params)
else:
result_generator = self.engine_client.generate(
engine_inputs,
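The chat endpoint now dispatches beam search to either engine client. Assuming the chat route honors the same vLLM-specific use_beam_search extra field as the completions test above (an assumption, not shown in this diff), a request could look like the fragment below, placed inside the same async main() and reusing the client from the earlier sketch:

# --- illustrative sketch, not part of the PR ---
resp = await client.chat.completions.create(
    model="my-model",  # hypothetical model name
    messages=[{"role": "user", "content": "Name three prime numbers."}],
    n=2,
    max_tokens=16,
    temperature=0.0,
    # With this set, the server builds BeamSearchParams and takes the
    # beam_search branch shown above.
    extra_body=dict(use_beam_search=True),
)
for choice in resp.choices:
    print(choice.index, choice.message.content)
# --- end sketch ---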
20 changes: 13 additions & 7 deletions vllm/entrypoints/openai/serving_completion.py
@@ -9,6 +9,7 @@

from vllm.config import ModelConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
@@ -150,15 +151,20 @@ async def create_completion(
log_tracing_disabled_warning()

if isinstance(sampling_params, BeamSearchParams):
if not isinstance(self.engine_client, AsyncLLMEngine):
if isinstance(self.engine_client, AsyncLLMEngine):
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params)
elif isinstance(self.engine_client, MQLLMEngineClient):
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params, lora_request)
Review comment (Collaborator):
Is beam search supported with LoRA?

It seems like it does not work through the AsyncLLMEngine, since we aren't passing the lora_request parameter there. So I think you could collapse these two cases, since lora_request will always be None when passed to MQLLMEngine (see the sketch below).

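A minimal sketch of the reviewer's suggestion (illustrative only, not part of the diff): because lora_request is always None on this path, the two isinstance branches could be collapsed into one call, using the names from the surrounding code:

# --- illustrative sketch of the review suggestion, not part of the PR ---
if isinstance(self.engine_client, (AsyncLLMEngine, MQLLMEngineClient)):
    # lora_request is None here, so both engine clients can be invoked
    # with the same positional arguments.
    generator = self.engine_client.beam_search(
        prompt_inputs["prompt_token_ids"], request_id_item,
        sampling_params)
else:
    raise ValueError(
        "Beam search in the API server is only supported with "
        "AsyncLLMEngine and MQLLMEngineClient.")
# --- end sketch ---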
else:
raise ValueError(
"Beam search in the API server is only supported"
" with AsyncLLMEngine. please add "
"`--disable-frontend-multiprocessing` to "
"use beam search.")
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params)
" with AsyncLLMEngine and MQLLMEngineClient."
" please add `--disable-frontend-multiprocessing`"
Review comment (Collaborator):
--disable-frontend-multiprocessing will no longer resolve this error. There also is no other case here, so this could be an assert (see the sketch after this diff section).

" to use beam search.")
else:
generator = self.engine_client.generate(
{
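Following the second review comment, a sketch of the assert-style alternative (illustrative, not part of the PR): since --disable-frontend-multiprocessing no longer changes the outcome and no other engine client reaches this branch, the user-facing error could become an assertion, again using the names from the surrounding code:

# --- illustrative sketch of the review suggestion, not part of the PR ---
assert isinstance(self.engine_client, (AsyncLLMEngine, MQLLMEngineClient)), (
    f"Beam search is not supported by {type(self.engine_client).__name__}")
generator = self.engine_client.beam_search(
    prompt_inputs["prompt_token_ids"], request_id_item,
    sampling_params)
# --- end sketch ---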