From f5b5f94b14942edd8af719cb62fcf4c51a237f4a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 10 Jun 2024 12:34:30 +0900 Subject: [PATCH 001/126] tp1 draft worker --- vllm/config.py | 5 +- vllm/distributed/parallel_state.py | 53 +++- vllm/spec_decode/single_tp_worker.py | 341 +++++++++++++++++++++++++ vllm/spec_decode/spec_decode_worker.py | 2 + 4 files changed, 393 insertions(+), 8 deletions(-) create mode 100644 vllm/spec_decode/single_tp_worker.py diff --git a/vllm/config.py b/vllm/config.py index 4efdb6cab52c4..a75e9346aac60 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -946,9 +946,8 @@ def create_draft_parallel_config( draft worker can have a different parallel strategy, e.g. TP=1. """ draft_parallel_config = ParallelConfig( - pipeline_parallel_size=target_parallel_config. - pipeline_parallel_size, - tensor_parallel_size=target_parallel_config.tensor_parallel_size, + pipeline_parallel_size=1, + tensor_parallel_size=1, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0ebd7a15eab9b..0cbc473c3b6fb 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -5,12 +5,15 @@ """Tensor and pipeline parallel groups.""" from typing import List, Optional +import contextlib import torch from torch.distributed import ProcessGroup import vllm.envs as envs from vllm.logger import init_logger +from datetime import timedelta + logger = init_logger(__name__) _ENABLE_CUSTOM_ALL_REDUCE = True @@ -84,7 +87,7 @@ def init_distributed_environment( local_rank: int = -1, backend: str = "nccl", ): - logger.debug( + logger.info( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, distributed_init_method, backend) @@ -97,11 +100,13 @@ def init_distributed_environment( backend=backend, init_method=distributed_init_method, world_size=world_size, + timeout=timedelta(seconds=10), rank=rank) global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP _DEVICE_WORLD_GROUP = torch.distributed.group.WORLD ranks = list(range(torch.distributed.get_world_size())) _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks, + timeout=timedelta(seconds=10), backend="gloo") # set the local rank # local_rank is not available in torch ProcessGroup, @@ -180,8 +185,8 @@ def initialize_model_parallel( ranks = list( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) - group = torch.distributed.new_group(ranks, backend=backend) - cpu_group = torch.distributed.new_group(ranks, backend="gloo") + group = torch.distributed.new_group(ranks, backend=backend, timeout=timedelta(seconds=10)) + cpu_group = torch.distributed.new_group(ranks, backend="gloo", timeout=timedelta(seconds=10)) if rank in ranks: _TP_DEVICE_GROUP = group _TP_CPU_GROUP = cpu_group @@ -210,8 +215,8 @@ def initialize_model_parallel( "pipeline model parallel group is already initialized") for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group = torch.distributed.new_group(ranks, backend=backend) - cpu_group = torch.distributed.new_group(ranks, backend="gloo") + group = torch.distributed.new_group(ranks, backend=backend, timeout=timedelta(seconds=10)) + cpu_group = torch.distributed.new_group(ranks, backend="gloo", timeout=timedelta(seconds=10)) if rank in ranks: _PP_DEVICE_GROUP = group _PP_CPU_GROUP = cpu_group @@ 
-257,6 +262,44 @@ def model_parallel_is_initialized(): return (_TP_DEVICE_GROUP is not None and _PP_DEVICE_GROUP is not None) +override = False + +@contextlib.contextmanager +def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None): + global override + assert not override, "should not override during override" + override = True + old_world_group = get_world_group() + old_world_cpu_group = get_cpu_world_group() + old_tp_group = get_tensor_model_parallel_group() + old_tp_cpu_group = get_tensor_model_parallel_cpu_group() + old_tp_pynccl_comm = get_tp_pynccl_communicator() + old_tp_ca_comm = get_tp_ca_communicator() + global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP, _TP_DEVICE_GROUP, _TP_CPU_GROUP, _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR + _DEVICE_WORLD_GROUP = group + _CPU_WORLD_GROUP = cpu_group + _TP_DEVICE_GROUP = group + _TP_CPU_GROUP = cpu_group + _TP_PYNCCL_COMMUNICATOR = pynccl_comm + _TP_CA_COMMUNICATOR = ca_comm + try: + yield + finally: + override = False + _DEVICE_WORLD_GROUP = old_world_group + _CPU_WORLD_GROUP = old_world_cpu_group + _TP_DEVICE_GROUP = old_tp_group + _TP_CPU_GROUP = old_tp_cpu_group + _TP_PYNCCL_COMMUNICATOR = old_tp_pynccl_comm + _TP_CA_COMMUNICATOR = old_tp_ca_comm + + +def get_world_group(): + """Get the GPU world group.""" + assert _DEVICE_WORLD_GROUP is not None, ("World group is not initialized") + return _DEVICE_WORLD_GROUP + + def get_cpu_world_group(): """Get the CPU world group.""" assert _CPU_WORLD_GROUP is not None, ("CPU world group is not initialized") diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py new file mode 100644 index 0000000000000..554b457509259 --- /dev/null +++ b/vllm/spec_decode/single_tp_worker.py @@ -0,0 +1,341 @@ +import copy +from typing import List, Tuple, Set, Optional +import logging + +import torch +import torch.distributed + +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.lora.request import LoRARequest + +from vllm.distributed.parallel_state import patch_tensor_parallel_group +from vllm.config import ParallelConfig +from vllm.worker.worker_base import WorkerBase + +logger = logging.getLogger(__name__) + + +class SingleTpWorker(WorkerBase): + """Class which allows a speculative draft model to run with tensor parallel + degree of 1, while target model runs with larger tensor parallel degree. + This reduces the overhead of small draft models. + + This is implemented by changing vLLM's tensor parallel group to a group of + size 1 during forward passes. + """ + + @classmethod + def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, + target_parallel_config: ParallelConfig): + """Wrap the worker in a SingleTpWorker if necessary. + """ + draft_tp = draft_parallel_config.tensor_parallel_size + if draft_tp == target_parallel_config.tensor_parallel_size: + return worker + + if draft_tp != 1: + raise ValueError("{cls} only supports tp=1, found " + f"{draft_tp=}") + + logger.info(f"Wrapping {type(worker)} in {cls}") + return cls(worker) + + def __init__( + self, + worker: WorkerBase, # MultiStepWorker + ): + self._worker = worker + self._single_tp_group = None + + # Lazy initialization list. + self._proposer: Top1Proposer + + def is_driver(self) -> bool: + return self._worker.is_driver() + + def init_device(self): + """Initialize the model on all ranks. 
+ + This also creates a single-rank process group containing only the + self process. + """ + world_rank = torch.distributed.get_rank() + self._single_tp_group = torch.distributed.new_group(ranks=[world_rank]) + self._single_tp_cpu_group = torch.distributed.new_group(ranks=[world_rank], + backend="gloo") + + logger.info(f"init_device. world_rank: {world_rank}, single_tp_group: {self._single_tp_group}, single_tp_cput_group: {self._single_tp_cpu_group}") + + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + self._worker.init_device() + + self._proposer = Top1Proposer( + self, + self._worker.device, + self.vocab_size, + max_proposal_len=self.max_model_len, + ) + + + def set_include_gpu_probs_tensor(self): + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self): + logger.info("SingleTPWorker.load_model()") + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + self._worker.load_model() + + def determine_num_available_blocks(self): + """Profile the model on all ranks. + """ + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int): + """Initialize the cache engine on all ranks. + """ + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + self._worker.initialize_cache(num_gpu_blocks, + num_cpu_blocks) + + @torch.inference_mode() + def sampler_output( + self, + sample_len: int, + execute_model_req: ExecuteModelRequest + ) -> Tuple[List[SamplerOutput], bool]: + """Run the model forward pass sample_len times. Returns the list of + sampler output, one per model forward pass, along with indicator of + whether torch tensor in sampler output need to be transposed in latter + sampler_output_to_torch logic. + + For multi step worker, this indicator shall be True. + """ + + ## Worker-side logic: skip + if not self.is_driver(): + logger.info("Workers should not make proposals") + return None + + ## Driver-side logic + self._raise_if_unsupported(execute_model_req) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + execute_model_req.seq_group_metadata_list) + copied_execute_model_req = execute_model_req.clone( + copied_seq_group_metadata_list) + + # Assert enough KV space for sample_len tokens per sequence. + self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list, + sample_len) + + # Run model sample_len times. + model_outputs = [] + for i in range(sample_len): + logger.info(f"Driver runs multiple draft steps. {i+1}/{sample_len}") + model_output = self._execute_model_tp1( + execute_model_req=copied_execute_model_req) + assert (len(model_output) == 1 + ), "composing multistep workers not supported" + model_output = model_output[0] + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs, True + + def get_spec_proposals( + self, + proposal_len: int, + execute_model_req: ExecuteModelRequest) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. 
+ """ + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + return self._proposer.get_proposals(proposal_len, execute_model_req) + + @torch.inference_mode() + def execute_model(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + return self._execute_model_tp1(execute_model_req) + + def _execute_model_tp1( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + logger.info("SingleTPWorker.execute_model_prefill()") + + if execute_model_req is None: + seq_group_metadata_list = None + else: + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + if not self._worker.is_driver(): + logger.info("Draft worker returns []") + return [] + + assert seq_group_metadata_list is not None + assert execute_model_req is not None + num_seq_groups = len(seq_group_metadata_list) + blocks_to_swap_in = execute_model_req.blocks_to_swap_in + blocks_to_swap_out = execute_model_req.blocks_to_swap_out + blocks_to_copy = execute_model_req.blocks_to_copy + + self._worker.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return [] + + logger.info("SingleTPWorker._worker.model_runner.execute_model()") + output = self._worker.model_runner.execute_model(seq_group_metadata_list, + self._worker.gpu_cache) + + logger.info("SingleTPWorker.execute_model_prefill() output:") + if output is not None: + for seq_group_output in output.outputs: + for sample in seq_group_output.samples: + logger.info(f"SamplerOutput: {sample}") + + # Worker only supports single-step execution. Wrap the output in a list + # to conform to interface. + return [output] + + + def get_cache_block_size_bytes(self) -> int: + """Return the size of a single cache block, in bytes. Used in + speculative decoding. + """ + return self._worker.get_cache_block_size_bytes() + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + def list_loras(self) -> Set[int]: + raise NotImplementedError + + @property + def max_model_len(self) -> int: + return self._worker.max_model_len + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size + + def _raise_if_unsupported( + self, + execute_model_req: ExecuteModelRequest, + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([ + execute_model_req.blocks_to_swap_in, + execute_model_req.blocks_to_swap_out, + execute_model_req.blocks_to_copy + ]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in + execute_model_req.seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. 
This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self._worker.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. + seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self._worker.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. 
+ seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 45d9d5735efc6..76779bf6a152b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -13,6 +13,7 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.single_tp_worker import SingleTpWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, @@ -105,6 +106,7 @@ def create_worker( ngram_prompt_lookup_max) else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) + proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 709de21653099a34627875bbd84accc3ffe90f8a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 10 Jun 2024 15:20:17 +0900 Subject: [PATCH 002/126] refactor singlt_tp_worker --- vllm/spec_decode/multi_step_worker.py | 4 +-- vllm/spec_decode/single_tp_worker.py | 47 ++++++++++++--------------- vllm/worker/worker.py | 2 +- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index fe15ea33b5f36..6455c5eef7b02 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,7 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Lazy initialization list. - self._proposer: Top1Proposer + self._proposer: SpeculativeProposer def init_device(self): super().init_device() diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 554b457509259..b000e323eac2c 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -6,18 +6,18 @@ import torch.distributed from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.lora.request import LoRARequest from vllm.distributed.parallel_state import patch_tensor_parallel_group from vllm.config import ParallelConfig -from vllm.worker.worker_base import WorkerBase +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase logger = logging.getLogger(__name__) -class SingleTpWorker(WorkerBase): +class SingleTpWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with tensor parallel degree of 1, while target model runs with larger tensor parallel degree. 
This reduces the overhead of small draft models. @@ -44,16 +44,14 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, def __init__( self, - worker: WorkerBase, # MultiStepWorker + worker: ProposerWorkerBase, ): self._worker = worker self._single_tp_group = None + self._single_tp_cpu_group = None # Lazy initialization list. - self._proposer: Top1Proposer - - def is_driver(self) -> bool: - return self._worker.is_driver() + self._proposer: SpeculativeProposer def init_device(self): """Initialize the model on all ranks. @@ -80,7 +78,6 @@ def init_device(self): def set_include_gpu_probs_tensor(self): - # Need include_gpu_probs_tensor for multi_step_worker self._worker.set_include_gpu_probs_tensor() def load_model(self): @@ -105,8 +102,8 @@ def initialize_cache(self, num_gpu_blocks: int, @torch.inference_mode() def sampler_output( self, + execute_model_req: ExecuteModelRequest, sample_len: int, - execute_model_req: ExecuteModelRequest ) -> Tuple[List[SamplerOutput], bool]: """Run the model forward pass sample_len times. Returns the list of sampler output, one per model forward pass, along with indicator of @@ -117,7 +114,7 @@ def sampler_output( """ ## Worker-side logic: skip - if not self.is_driver(): + if execute_model_req is None: logger.info("Workers should not make proposals") return None @@ -152,36 +149,32 @@ def sampler_output( def get_spec_proposals( self, - proposal_len: int, - execute_model_req: ExecuteModelRequest) -> SpeculativeProposals: + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. """ with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): - return self._proposer.get_proposals(proposal_len, execute_model_req) + return self._proposer.get_spec_proposals(execute_model_req) @torch.inference_mode() - def execute_model(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if execute_model_req is None: #if not self._worker.is_driver_worker: + return [] + with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): return self._execute_model_tp1(execute_model_req) def _execute_model_tp1( self, - execute_model_req: Optional[ExecuteModelRequest] = None + execute_model_req: Optional[ExecuteModelRequest] ) -> List[SamplerOutput]: logger.info("SingleTPWorker.execute_model_prefill()") - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - if not self._worker.is_driver(): - logger.info("Draft worker returns []") - return [] - - assert seq_group_metadata_list is not None - assert execute_model_req is not None + seq_group_metadata_list = execute_model_req.seq_group_metadata_list num_seq_groups = len(seq_group_metadata_list) blocks_to_swap_in = execute_model_req.blocks_to_swap_in blocks_to_swap_out = execute_model_req.blocks_to_swap_out diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 10411a2bf7a10..b7db883324da6 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -43,7 +43,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, + speculative_config: 
Optional[SpeculativeConfig] = None, # TODO: remove? is_driver_worker: bool = False, ) -> None: self.model_config = model_config From 0eacc965ec54e0d9bc7d5cc6755195bfec71e64c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 10 Jun 2024 16:06:48 +0900 Subject: [PATCH 003/126] update execute_model logic minior add log --- vllm/distributed/parallel_state.py | 5 ++-- vllm/spec_decode/multi_step_worker.py | 6 ++++- vllm/spec_decode/single_tp_worker.py | 36 ++++++++++++++++++-------- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/worker/model_runner.py | 2 ++ vllm/worker/worker.py | 6 +++++ 6 files changed, 42 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0cbc473c3b6fb..f82e0a1bc44bb 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -4,6 +4,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Tensor and pipeline parallel groups.""" from typing import List, Optional +from datetime import timedelta import contextlib import torch @@ -12,8 +13,6 @@ import vllm.envs as envs from vllm.logger import init_logger -from datetime import timedelta - logger = init_logger(__name__) _ENABLE_CUSTOM_ALL_REDUCE = True @@ -128,6 +127,8 @@ def init_distributed_environment( if torch.cuda.is_available(): torch.cuda.synchronize() del data + else: + logger.info("torch.distributed has already been initialized") def initialize_model_parallel( diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 6455c5eef7b02..6236d0b07f6d8 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -10,6 +10,9 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker +from vllm.logger import init_logger + +logger = init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): @@ -72,7 +75,8 @@ def sampler_output( # Run model sample_len times. model_outputs = [] - for _ in range(sample_len): + for i in range(sample_len): + logger.info(f"Driver runs multiple draft steps. 
{i+1}/{sample_len}") model_output = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index b000e323eac2c..341322c251067 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -1,20 +1,22 @@ import copy -from typing import List, Tuple, Set, Optional -import logging +from typing import List, Tuple, Set, Optional, Union +from datetime import timedelta import torch import torch.distributed from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.worker.worker import Worker from vllm.lora.request import LoRARequest from vllm.distributed.parallel_state import patch_tensor_parallel_group from vllm.config import ParallelConfig -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.logger import init_logger -logger = logging.getLogger(__name__) +logger = init_logger(__name__) class SingleTpWorker(ProposerWorkerBase): @@ -44,7 +46,7 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, def __init__( self, - worker: ProposerWorkerBase, + worker: Union[Worker, ProposerWorkerBase], ): self._worker = worker self._single_tp_group = None @@ -60,8 +62,9 @@ def init_device(self): self process. """ world_rank = torch.distributed.get_rank() - self._single_tp_group = torch.distributed.new_group(ranks=[world_rank]) + self._single_tp_group = torch.distributed.new_group(ranks=[world_rank], timeout=timedelta(seconds=10)) self._single_tp_cpu_group = torch.distributed.new_group(ranks=[world_rank], + timeout=timedelta(seconds=10), backend="gloo") logger.info(f"init_device. world_rank: {world_rank}, single_tp_group: {self._single_tp_group}, single_tp_cput_group: {self._single_tp_cpu_group}") @@ -172,13 +175,24 @@ def _execute_model_tp1( self, execute_model_req: Optional[ExecuteModelRequest] ) -> List[SamplerOutput]: - logger.info("SingleTPWorker.execute_model_prefill()") + logger.info("SingleTPWorker.execute_model_tp1()") seq_group_metadata_list = execute_model_req.seq_group_metadata_list num_seq_groups = len(seq_group_metadata_list) - blocks_to_swap_in = execute_model_req.blocks_to_swap_in - blocks_to_swap_out = execute_model_req.blocks_to_swap_out - blocks_to_copy = execute_model_req.blocks_to_copy + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", + dtype=torch.int64).view(-1, 2) + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. 
+ blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self._worker.device, + dtype=torch.int64).view(-1, 2) self._worker.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) @@ -190,7 +204,7 @@ def _execute_model_tp1( output = self._worker.model_runner.execute_model(seq_group_metadata_list, self._worker.gpu_cache) - logger.info("SingleTPWorker.execute_model_prefill() output:") + logger.info("SingleTPWorker.execute_model_tp1() output:") if output is not None: for seq_group_output in output.outputs: for sample in seq_group_output.samples: diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 76779bf6a152b..3740c8f255687 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -88,7 +88,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: WorkerBase, + scorer_worker: Worker, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: Optional[int], ) -> "SpecDecodeWorker": diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c59288b4f73c6..edc7205da2093 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -722,6 +722,8 @@ def execute_model( lora_requests, lora_mapping, multi_modal_kwargs ) = self.prepare_input_tensors(seq_group_metadata_list) + + if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b7db883324da6..8d6468683e706 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -20,6 +20,9 @@ from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import ModelRunner from vllm.worker.worker_base import WorkerBase +from vllm.logger import init_logger + +logger = init_logger(__name__) class Worker(WorkerBase): @@ -227,9 +230,11 @@ def execute_model( execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[Union[SamplerOutput, PoolerOutput]]: if not self.is_driver_worker: + logger.info("Worker.execute_model()") self._execute_model_non_driver() return [] + logger.info("Driver. Worker.execute_model()") if execute_model_req is None: # This signals that there's no more requests to process for now. 
# All workers are running infinite loop with broadcast_tensor_dict, @@ -269,6 +274,7 @@ def execute_model( if num_seq_groups == 0: return [] + logger.info("Worker.model_runner.execute_model()") output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) From 2011ed0c0c722bbe02ca7172d96534bfc387a7e6 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 11 Jun 2024 10:50:20 +0900 Subject: [PATCH 004/126] fix log --- vllm/distributed/communication_op.py | 9 +++++++-- vllm/spec_decode/single_tp_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2b38ec472de66..2bdbbd2b80ddb 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -5,8 +5,12 @@ import torch from torch.distributed import ProcessGroup +from vllm.logger import init_logger -from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator, +logger = init_logger(__name__) + + +from .parallel_state import (get_world_group, get_cpu_world_group, get_pp_pynccl_communicator, get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -240,9 +244,10 @@ def broadcast_tensor_dict( or torch.distributed.get_world_size(group=group) == 1): return tensor_dict - group = group or torch.distributed.group.WORLD + group = group or get_world_group() metadata_group = metadata_group or get_cpu_world_group() ranks = torch.distributed.get_process_group_ranks(group) + logger.info(f"broadcast_tensor_dict. src: {src}, ranks: {ranks}") assert src in ranks, f"Invalid src rank ({src})" rank = torch.distributed.get_rank() diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 341322c251067..2f568b0df20ee 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -118,7 +118,7 @@ def sampler_output( ## Worker-side logic: skip if execute_model_req is None: - logger.info("Workers should not make proposals") + logger.info("Workers do not make proposals") return None ## Driver-side logic diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3740c8f255687..0a97129ca33d9 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -287,9 +287,10 @@ def execute_model( if num_lookahead_slots == 0 or len( execute_model_req.seq_group_metadata_list ) == 0 or disable_all_speculation: + logger.info("prefill step") return self._run_no_spec(execute_model_req, skip_proposer=disable_all_speculation) - + logger.info("decoding step") return self._run_speculative_decoding_step(execute_model_req, num_lookahead_slots) From 2e16c4e2424c27e1cbd6e75e91a6479b5751cd18 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 11 Jun 2024 15:01:11 +0900 Subject: [PATCH 005/126] DummyProposerWorker --- vllm/spec_decode/single_tp_worker.py | 61 ++++++++++++++++++++++++-- vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 2f568b0df20ee..2658dfed5e62c 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -30,7 +30,7 @@ class SingleTpWorker(ProposerWorkerBase): @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, - target_parallel_config: ParallelConfig): + 
target_parallel_config: ParallelConfig, is_driver_worker: bool): """Wrap the worker in a SingleTpWorker if necessary. """ draft_tp = draft_parallel_config.tensor_parallel_size @@ -41,8 +41,12 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, raise ValueError("{cls} only supports tp=1, found " f"{draft_tp=}") - logger.info(f"Wrapping {type(worker)} in {cls}") - return cls(worker) + if is_driver_worker: + logger.info(f"Wrapping {type(worker)} in {cls}") + return cls(worker) + else: + logger.info(f"None for non-driver workers") + return DummyProposerWorker() def __init__( self, @@ -346,3 +350,54 @@ def _append_new_tokens( token_logprob = seq_output.logprobs[token_id] seq.append_token_id(token_id, token_logprob.logprob) + + +class DummyProposerWorker(ProposerWorkerBase): + + def init_device(self): + pass + + def set_include_gpu_probs_tensor(self): + pass + + def load_model(self): + pass + + def determine_num_available_blocks(self): + pass + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int): + pass + + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + return None + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + return None + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return None + + def get_cache_block_size_bytes(self) -> int: + return 0 + + def add_lora(self, lora_request: LoRARequest) -> bool: + pass + + def remove_lora(self, lora_id: int) -> bool: + pass + + def list_loras(self) -> Set[int]: + pass + diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0a97129ca33d9..f8ca3ce86d687 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -106,7 +106,7 @@ def create_worker( ngram_prompt_lookup_max) else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) - proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config) + proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.is_driver_worker) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From b412a51e0490d3156a49d758021909b0918dd271 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 11 Jun 2024 15:45:34 +0900 Subject: [PATCH 006/126] fix --- vllm/spec_decode/single_tp_worker.py | 87 +++++++++++++++++----------- vllm/worker/model_runner.py | 2 - 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 2658dfed5e62c..aaaa50b4d6e3d 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -5,7 +5,8 @@ import torch import torch.distributed -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer @@ -30,7 +31,8 @@ class SingleTpWorker(ProposerWorkerBase): @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, 
- target_parallel_config: ParallelConfig, is_driver_worker: bool): + target_parallel_config: ParallelConfig, + is_driver_worker: bool): """Wrap the worker in a SingleTpWorker if necessary. """ draft_tp = draft_parallel_config.tensor_parallel_size @@ -45,8 +47,8 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, logger.info(f"Wrapping {type(worker)} in {cls}") return cls(worker) else: - logger.info(f"None for non-driver workers") - return DummyProposerWorker() + logger.info(f"dummy worker for non-driver") + return DummyProposerWorker(worker) def __init__( self, @@ -66,16 +68,19 @@ def init_device(self): self process. """ world_rank = torch.distributed.get_rank() - self._single_tp_group = torch.distributed.new_group(ranks=[world_rank], timeout=timedelta(seconds=10)) - self._single_tp_cpu_group = torch.distributed.new_group(ranks=[world_rank], - timeout=timedelta(seconds=10), - backend="gloo") - - logger.info(f"init_device. world_rank: {world_rank}, single_tp_group: {self._single_tp_group}, single_tp_cput_group: {self._single_tp_cpu_group}") - - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + self._single_tp_group = torch.distributed.new_group( + ranks=[world_rank], timeout=timedelta(seconds=10)) + self._single_tp_cpu_group = torch.distributed.new_group( + ranks=[world_rank], timeout=timedelta(seconds=10), backend="gloo") + + logger.info( + f"init_device. world_rank: {world_rank}, single_tp_group: {self._single_tp_group}, single_tp_cput_group: {self._single_tp_cpu_group}" + ) + + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): self._worker.init_device() - + self._proposer = Top1Proposer( self, self._worker.device, @@ -83,28 +88,28 @@ def init_device(self): max_proposal_len=self.max_model_len, ) - def set_include_gpu_probs_tensor(self): self._worker.set_include_gpu_probs_tensor() def load_model(self): logger.info("SingleTPWorker.load_model()") - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): self._worker.load_model() def determine_num_available_blocks(self): """Profile the model on all ranks. """ - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): return self._worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int): + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): """Initialize the cache engine on all ranks. """ - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): - self._worker.initialize_cache(num_gpu_blocks, - num_cpu_blocks) + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) @torch.inference_mode() def sampler_output( @@ -120,7 +125,7 @@ def sampler_output( For multi step worker, this indicator shall be True. """ - ## Worker-side logic: skip + ## Worker-side logic: skip # TODO: REMOVE if execute_model_req is None: logger.info("Workers do not make proposals") return None @@ -142,7 +147,8 @@ def sampler_output( # Run model sample_len times. model_outputs = [] for i in range(sample_len): - logger.info(f"Driver runs multiple draft steps. {i+1}/{sample_len}") + logger.info( + f"Driver runs multiple draft steps. 
{i+1}/{sample_len}") model_output = self._execute_model_tp1( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 @@ -161,7 +167,8 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. """ - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): return self._proposer.get_spec_proposals(execute_model_req) @torch.inference_mode() @@ -169,15 +176,15 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - if execute_model_req is None: #if not self._worker.is_driver_worker: + if execute_model_req is None: #if not self._worker.is_driver_worker: return [] - with patch_tensor_parallel_group(self._single_tp_group, self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._single_tp_group, + self._single_tp_cpu_group): return self._execute_model_tp1(execute_model_req) def _execute_model_tp1( - self, - execute_model_req: Optional[ExecuteModelRequest] + self, execute_model_req: Optional[ExecuteModelRequest] ) -> List[SamplerOutput]: logger.info("SingleTPWorker.execute_model_tp1()") @@ -198,15 +205,16 @@ def _execute_model_tp1( device=self._worker.device, dtype=torch.int64).view(-1, 2) - self._worker.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + self._worker.cache_swap(blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy) # If there is no input, we don't need to execute the model. if num_seq_groups == 0: return [] logger.info("SingleTPWorker._worker.model_runner.execute_model()") - output = self._worker.model_runner.execute_model(seq_group_metadata_list, - self._worker.gpu_cache) + output = self._worker.model_runner.execute_model( + seq_group_metadata_list, self._worker.gpu_cache) logger.info("SingleTPWorker.execute_model_tp1() output:") if output is not None: @@ -218,7 +226,6 @@ def _execute_model_tp1( # to conform to interface. return [output] - def get_cache_block_size_bytes(self) -> int: """Return the size of a single cache block, in bytes. Used in speculative decoding. 
@@ -354,6 +361,12 @@ def _append_new_tokens( class DummyProposerWorker(ProposerWorkerBase): + def __init__( + self, + worker: Union[Worker, ProposerWorkerBase], + ): + self._worker = worker + def init_device(self): pass @@ -366,8 +379,7 @@ def load_model(self): def determine_num_available_blocks(self): pass - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int): + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): pass def sampler_output( @@ -401,3 +413,10 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: pass + @property + def max_model_len(self) -> int: + return self._worker.max_model_len + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index edc7205da2093..c59288b4f73c6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -722,8 +722,6 @@ def execute_model( lora_requests, lora_mapping, multi_modal_kwargs ) = self.prepare_input_tensors(seq_group_metadata_list) - - if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) From 593ccfa2371782d0e692cdc331c9a68c03d14bfa Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 11 Jun 2024 16:58:58 +0900 Subject: [PATCH 007/126] init only partial workers nit --- vllm/spec_decode/single_tp_worker.py | 76 ++++++++++++++------------ vllm/spec_decode/spec_decode_worker.py | 4 +- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index aaaa50b4d6e3d..0228b2f8b475f 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -1,4 +1,5 @@ import copy +import weakref from typing import List, Tuple, Set, Optional, Union from datetime import timedelta @@ -32,31 +33,39 @@ class SingleTpWorker(ProposerWorkerBase): @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, target_parallel_config: ParallelConfig, - is_driver_worker: bool): + rank: int): """Wrap the worker in a SingleTpWorker if necessary. """ draft_tp = draft_parallel_config.tensor_parallel_size - if draft_tp == target_parallel_config.tensor_parallel_size: - return worker + target_tp = target_parallel_config.tensor_parallel_size + if draft_tp > target_tp: + raise ValueError("{cls} only supports draft_tp smaller than target_tp." + f"{draft_tp=} {target_tp=}") + + ranks = list(range(draft_tp)) - if draft_tp != 1: - raise ValueError("{cls} only supports tp=1, found " - f"{draft_tp=}") + if draft_tp == target_tp: + return worker - if is_driver_worker: + logger.info(f"{rank=}, {ranks=}") + if rank in ranks: logger.info(f"Wrapping {type(worker)} in {cls}") - return cls(worker) + return cls(worker, ranks) else: - logger.info(f"dummy worker for non-driver") + logger.info(f"dummy worker that would not participate in draft generation") return DummyProposerWorker(worker) def __init__( self, worker: Union[Worker, ProposerWorkerBase], + ranks: List[int], ): self._worker = worker - self._single_tp_group = None - self._single_tp_cpu_group = None + self._ranks = ranks + self._tp_group = None + self._tp_cpu_group = None + self._tp_pynccl_comm = None #TODO: init&use + self._tp_ca_comm = None #TODO: init&use # Lazy initialization list. self._proposer: SpeculativeProposer @@ -67,22 +76,19 @@ def init_device(self): This also creates a single-rank process group containing only the self process. 
""" - world_rank = torch.distributed.get_rank() - self._single_tp_group = torch.distributed.new_group( - ranks=[world_rank], timeout=timedelta(seconds=10)) - self._single_tp_cpu_group = torch.distributed.new_group( - ranks=[world_rank], timeout=timedelta(seconds=10), backend="gloo") - - logger.info( - f"init_device. world_rank: {world_rank}, single_tp_group: {self._single_tp_group}, single_tp_cput_group: {self._single_tp_cpu_group}" - ) + self._tp_group = torch.distributed.new_group( + ranks=self._ranks, timeout=timedelta(seconds=10)) + self._tp_cpu_group = torch.distributed.new_group( + ranks=self._ranks, timeout=timedelta(seconds=10), backend="gloo") + + logger.info(f"init_device. ranks: {self._ranks}") - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): self._worker.init_device() self._proposer = Top1Proposer( - self, + weakref.proxy(self), self._worker.device, self.vocab_size, max_proposal_len=self.max_model_len, @@ -93,22 +99,22 @@ def set_include_gpu_probs_tensor(self): def load_model(self): logger.info("SingleTPWorker.load_model()") - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): self._worker.load_model() def determine_num_available_blocks(self): """Profile the model on all ranks. """ - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): return self._worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): """Initialize the cache engine on all ranks. """ - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) @torch.inference_mode() @@ -125,10 +131,10 @@ def sampler_output( For multi step worker, this indicator shall be True. """ - ## Worker-side logic: skip # TODO: REMOVE + ## Worker-side logic: if execute_model_req is None: logger.info("Workers do not make proposals") - return None + return [], True ## Driver-side logic self._raise_if_unsupported(execute_model_req) @@ -167,8 +173,8 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): return self._proposer.get_spec_proposals(execute_model_req) @torch.inference_mode() @@ -179,8 +185,8 @@ def execute_model( if execute_model_req is None: #if not self._worker.is_driver_worker: return [] - with patch_tensor_parallel_group(self._single_tp_group, - self._single_tp_cpu_group): + with patch_tensor_parallel_group(self._tp_group, + self._tp_cpu_group): return self._execute_model_tp1(execute_model_req) def _execute_model_tp1( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index f8ca3ce86d687..97f02fcd5d593 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -106,7 +106,9 @@ def create_worker( ngram_prompt_lookup_max) else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) - proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.is_driver_worker) + + #TODO: support NGramWorker + proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.rank) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From c5d3476748a483c6b1c28576ea4dd0024dd196bc Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 10:20:22 +0900 Subject: [PATCH 008/126] Use multi_step_worker logic --- vllm/spec_decode/single_tp_worker.py | 212 +-------------------------- 1 file changed, 3 insertions(+), 209 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 0228b2f8b475f..d180ff167daa6 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -67,9 +67,6 @@ def __init__( self._tp_pynccl_comm = None #TODO: init&use self._tp_ca_comm = None #TODO: init&use - # Lazy initialization list. - self._proposer: SpeculativeProposer - def init_device(self): """Initialize the model on all ranks. @@ -87,13 +84,6 @@ def init_device(self): self._tp_cpu_group): self._worker.init_device() - self._proposer = Top1Proposer( - weakref.proxy(self), - self._worker.device, - self.vocab_size, - max_proposal_len=self.max_model_len, - ) - def set_include_gpu_probs_tensor(self): self._worker.set_include_gpu_probs_tensor() @@ -123,48 +113,7 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass sample_len times. Returns the list of - sampler output, one per model forward pass, along with indicator of - whether torch tensor in sampler output need to be transposed in latter - sampler_output_to_torch logic. - - For multi step worker, this indicator shall be True. - """ - - ## Worker-side logic: - if execute_model_req is None: - logger.info("Workers do not make proposals") - return [], True - - ## Driver-side logic - self._raise_if_unsupported(execute_model_req) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - execute_model_req.seq_group_metadata_list) - copied_execute_model_req = execute_model_req.clone( - copied_seq_group_metadata_list) - - # Assert enough KV space for sample_len tokens per sequence. 
- self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list, - sample_len) - - # Run model sample_len times. - model_outputs = [] - for i in range(sample_len): - logger.info( - f"Driver runs multiple draft steps. {i+1}/{sample_len}") - model_output = self._execute_model_tp1( - execute_model_req=copied_execute_model_req) - assert (len(model_output) == 1 - ), "composing multistep workers not supported" - model_output = model_output[0] - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs, True + return self._worker.sampler_output(execute_model_req, sample_len) def get_spec_proposals( self, @@ -175,62 +124,16 @@ def get_spec_proposals( """ with patch_tensor_parallel_group(self._tp_group, self._tp_cpu_group): - return self._proposer.get_spec_proposals(execute_model_req) + return self._worker.get_spec_proposals(execute_model_req) @torch.inference_mode() def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - if execute_model_req is None: #if not self._worker.is_driver_worker: - return [] - with patch_tensor_parallel_group(self._tp_group, self._tp_cpu_group): - return self._execute_model_tp1(execute_model_req) - - def _execute_model_tp1( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> List[SamplerOutput]: - logger.info("SingleTPWorker.execute_model_tp1()") - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - num_seq_groups = len(seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self._worker.device, - dtype=torch.int64).view(-1, 2) - - self._worker.cache_swap(blocks_to_swap_in, blocks_to_swap_out, - blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - logger.info("SingleTPWorker._worker.model_runner.execute_model()") - output = self._worker.model_runner.execute_model( - seq_group_metadata_list, self._worker.gpu_cache) - - logger.info("SingleTPWorker.execute_model_tp1() output:") - if output is not None: - for seq_group_output in output.outputs: - for sample in seq_group_output.samples: - logger.info(f"SamplerOutput: {sample}") - - # Worker only supports single-step execution. Wrap the output in a list - # to conform to interface. - return [output] + return self._worker.execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: """Return the size of a single cache block, in bytes. Used in @@ -255,115 +158,6 @@ def max_model_len(self) -> int: def vocab_size(self) -> int: return self._worker.vocab_size - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. 
- """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - Helpful when the vLLM scheduler runs in the same process as the worker. - The alternative is deep-copying (or other form of deep copy); this has - performance downsides. - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self._worker.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self._worker.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. 
- """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob.logprob) - class DummyProposerWorker(ProposerWorkerBase): From 44e623bc592ee2609639131de26588786c18b090 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 10:59:34 +0900 Subject: [PATCH 009/126] self._patch_tp_group --- vllm/spec_decode/single_tp_worker.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index d180ff167daa6..52ef37885b9e3 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -67,6 +67,10 @@ def __init__( self._tp_pynccl_comm = None #TODO: init&use self._tp_ca_comm = None #TODO: init&use + def _patch_tensor_parallel_group(self): + return patch_tensor_parallel_group(self._tp_group, self._tp_cpu_group, + self._tp_pynccl_comm, self._tp_ca_comm) + def init_device(self): """Initialize the model on all ranks. @@ -80,8 +84,7 @@ def init_device(self): logger.info(f"init_device. ranks: {self._ranks}") - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): self._worker.init_device() def set_include_gpu_probs_tensor(self): @@ -89,22 +92,19 @@ def set_include_gpu_probs_tensor(self): def load_model(self): logger.info("SingleTPWorker.load_model()") - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): self._worker.load_model() def determine_num_available_blocks(self): """Profile the model on all ranks. """ - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): return self._worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): """Initialize the cache engine on all ranks. """ - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) @torch.inference_mode() @@ -122,8 +122,7 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): return self._worker.get_spec_proposals(execute_model_req) @torch.inference_mode() @@ -131,8 +130,7 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - with patch_tensor_parallel_group(self._tp_group, - self._tp_cpu_group): + with self._patch_tensor_parallel_group(): return self._worker.execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: From 98caf17e2889e7cb50ce45ddf91b86ff3e0da458 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 12:50:22 +0900 Subject: [PATCH 010/126] refactor it to support other draft-tp than 1 --- vllm/distributed/parallel_state.py | 16 ++++++++-------- vllm/spec_decode/single_tp_worker.py | 26 +++++++++++++++++++++----- vllm/spec_decode/spec_decode_worker.py | 2 +- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f82e0a1bc44bb..2203c3b633487 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -199,14 +199,14 @@ def initialize_model_parallel( device=_LOCAL_RANK, ) - # Initialize a custom fast all-reduce implementation. - if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) - _TP_CA_COMMUNICATOR = CustomAllreduce( - group=_TP_CPU_GROUP, - device=_LOCAL_RANK, - ) + # Initialize a custom fast all-reduce implementation. + if _ENABLE_CUSTOM_ALL_REDUCE: + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) + _TP_CA_COMMUNICATOR = CustomAllreduce( + group=_TP_CPU_GROUP, + device=_LOCAL_RANK, + ) # Build the pipeline model-parallel groups. global _PP_DEVICE_GROUP, _PP_CPU_GROUP diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 52ef37885b9e3..02aae14657092 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -14,7 +14,7 @@ from vllm.worker.worker import Worker from vllm.lora.request import LoRARequest -from vllm.distributed.parallel_state import patch_tensor_parallel_group +from vllm.distributed.parallel_state import patch_tensor_parallel_group, _ENABLE_CUSTOM_ALL_REDUCE from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -33,7 +33,7 @@ class SingleTpWorker(ProposerWorkerBase): @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, target_parallel_config: ParallelConfig, - rank: int): + rank: int, local_rank: int): """Wrap the worker in a SingleTpWorker if necessary. 
""" draft_tp = draft_parallel_config.tensor_parallel_size @@ -50,7 +50,7 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, logger.info(f"{rank=}, {ranks=}") if rank in ranks: logger.info(f"Wrapping {type(worker)} in {cls}") - return cls(worker, ranks) + return cls(worker, ranks, local_rank) else: logger.info(f"dummy worker that would not participate in draft generation") return DummyProposerWorker(worker) @@ -59,13 +59,15 @@ def __init__( self, worker: Union[Worker, ProposerWorkerBase], ranks: List[int], + local_rank: int ): self._worker = worker self._ranks = ranks + self._local_rank = local_rank self._tp_group = None self._tp_cpu_group = None - self._tp_pynccl_comm = None #TODO: init&use - self._tp_ca_comm = None #TODO: init&use + self._tp_pynccl_comm = None + self._tp_ca_comm = None def _patch_tensor_parallel_group(self): return patch_tensor_parallel_group(self._tp_group, self._tp_cpu_group, @@ -82,6 +84,20 @@ def init_device(self): self._tp_cpu_group = torch.distributed.new_group( ranks=self._ranks, timeout=timedelta(seconds=10), backend="gloo") + if len(self._ranks) > 1: + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + self._tp_pynccl_comm = PyNcclCommunicator( + group=self._tp_cpu_group, + device=self._local_rank, + ) + if _ENABLE_CUSTOM_ALL_REDUCE: + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) + self._tp_ca_comm = CustomAllreduce( + group=self._tp_cpu_group, + device=self._local_rank, + ) + logger.info(f"init_device. ranks: {self._ranks}") with self._patch_tensor_parallel_group(): diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 97f02fcd5d593..d56f2504507e8 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_worker( proposer_worker = MultiStepWorker(**draft_worker_kwargs) #TODO: support NGramWorker - proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.rank) + proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.rank, scorer_worker.local_rank) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 7fc4ff59c457234d412aa873d5c56200c7f5a45b Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 12:50:49 +0900 Subject: [PATCH 011/126] spec-tp configuarable --- vllm/config.py | 21 +++++++++++++-------- vllm/engine/arg_utils.py | 9 +++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index a75e9346aac60..2f4a6cc391593 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,3 +1,4 @@ +import copy import enum import json from dataclasses import dataclass, field, fields @@ -765,6 +766,7 @@ def maybe_create_spec_config( target_parallel_config: ParallelConfig, target_dtype: str, speculative_model: Optional[str], + speculative_tensor_parallel_size: Optional[int], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, @@ -890,9 +892,9 @@ def maybe_create_spec_config( )) draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - target_parallel_config)) - + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, speculative_tensor_parallel_size)) + return SpeculativeConfig( 
draft_model_config, draft_parallel_config, @@ -939,15 +941,15 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig) -> ParallelConfig: + target_parallel_config: ParallelConfig, + speculative_tensor_parallel_size: int) -> ParallelConfig: """Create a parallel config for use by the draft worker. - This is mostly a copy of the target parallel config. In the future the - draft worker can have a different parallel strategy, e.g. TP=1. + This is mostly a copy of the target parallel config. """ draft_parallel_config = ParallelConfig( - pipeline_parallel_size=1, - tensor_parallel_size=1, + pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, + tensor_parallel_size=target_parallel_config.tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. @@ -960,6 +962,9 @@ def create_draft_parallel_config( placement_group=target_parallel_config.placement_group, ) + if speculative_tensor_parallel_size is not None: + draft_parallel_config.tensor_parallel_size = speculative_tensor_parallel_size + return draft_parallel_config def __init__( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b7e815db12eb4..d159acc0775da 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,3 +1,4 @@ +import copy import argparse import dataclasses import json @@ -92,6 +93,7 @@ class EngineArgs: guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. speculative_model: Optional[str] = None + speculative_tensor_parallel_size: Optional[int] = None num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None @@ -527,6 +529,12 @@ def add_cli_args( default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') + parser.add_argument('--speculatvie-tensor-parallel-size', + '-spec-tp', + type=int, + default=EngineArgs.speculative_tensor_parallel_size, + help='Number of tensor parallel replicas for ' + 'the draft model in speculative decoding.') parser.add_argument( '--speculative-max-model-len', @@ -668,6 +676,7 @@ def create_engine_config(self, ) -> EngineConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, + speculative_tensor_parallel_size = self.speculative_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, From a96e720d568ec7af8367ff77d6f32452929d7fbc Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 13:23:08 +0900 Subject: [PATCH 012/126] ngram worker support test --- vllm/config.py | 9 ++++----- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/llm_engine.py | 2 ++ vllm/executor/gpu_executor.py | 1 + vllm/spec_decode/single_tp_worker.py | 1 + vllm/spec_decode/spec_decode_worker.py | 2 ++ 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 2f4a6cc391593..6f534e44d62a5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -862,7 +862,6 @@ def maybe_create_spec_config( # config, in future, we may try refactor it out, and set # draft related config as None here. 
draft_model_config = target_model_config - draft_parallel_config = target_parallel_config else: ngram_prompt_lookup_max = 0 ngram_prompt_lookup_min = 0 @@ -891,10 +890,10 @@ def maybe_create_spec_config( target_model_config.max_model_len, )) - draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_tensor_parallel_size)) - + draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, speculative_tensor_parallel_size)) + return SpeculativeConfig( draft_model_config, draft_parallel_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d159acc0775da..80acc108241d2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -97,8 +97,8 @@ class EngineArgs: num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None - ngram_prompt_lookup_max: Optional[int] = None - ngram_prompt_lookup_min: Optional[int] = None + ngram_prompt_lookup_max: Optional[int] = 4 + ngram_prompt_lookup_min: Optional[int] = 3 qlora_adapter_name_or_path: Optional[str] = None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index cb5893e707c8b..87dcc6fbb6b97 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -355,6 +355,8 @@ def from_engine_args( from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor + logger.info(f"engine_config.to_dict(): {engine_config.to_dict()}") + # Create the LLM engine. engine = cls( **engine_config.to_dict(), diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 3ad201f4757ec..19fda41a3ded9 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -32,6 +32,7 @@ def _get_worker_kwargs( if distributed_init_method is None: distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) + logger.info(f"spec_tp: {self.speculative_config.draft_parallel_config.tensor_parallel_size}") return dict( model_config=self.model_config, parallel_config=self.parallel_config, diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 02aae14657092..2bb14f215fcb3 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -38,6 +38,7 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, """ draft_tp = draft_parallel_config.tensor_parallel_size target_tp = target_parallel_config.tensor_parallel_size + logger.info(f"{target_tp=}, {draft_tp=}") if draft_tp > target_tp: raise ValueError("{cls} only supports draft_tp smaller than target_tp." 
f"{draft_tp=} {target_tp=}") diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index d56f2504507e8..d7da7ca0d7c03 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -47,6 +47,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": #load_config=load_config, ) + logger.info(f"spec_tp: {speculative_config.draft_parallel_config.tensor_parallel_size}") + spec_decode_worker = SpecDecodeWorker.create_worker( scorer_worker=target_worker, draft_worker_kwargs=draft_worker_kwargs, From db39576160651bbf1d6fcfa7b76d811476e3158f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 15:18:05 +0900 Subject: [PATCH 013/126] minor refine --- vllm/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6f534e44d62a5..aba0c1adda3b7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -941,14 +941,17 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( target_parallel_config: ParallelConfig, - speculative_tensor_parallel_size: int) -> ParallelConfig: + speculative_tensor_parallel_size: Optional[int]) -> ParallelConfig: """Create a parallel config for use by the draft worker. This is mostly a copy of the target parallel config. """ + + _speculative_tensor_parallel_size = speculative_tensor_parallel_size or target_parallel_config.tensor_parallel_size + draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, - tensor_parallel_size=target_parallel_config.tensor_parallel_size, + tensor_parallel_size=_speculative_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. @@ -961,9 +964,6 @@ def create_draft_parallel_config( placement_group=target_parallel_config.placement_group, ) - if speculative_tensor_parallel_size is not None: - draft_parallel_config.tensor_parallel_size = speculative_tensor_parallel_size - return draft_parallel_config def __init__( From b2e85954ec06b172a4462d5e1c18e3fd7f177c84 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 15:29:04 +0900 Subject: [PATCH 014/126] cleanup --- vllm/config.py | 5 ++++- vllm/distributed/communication_op.py | 7 +++---- vllm/distributed/parallel_state.py | 13 ++++++------- vllm/engine/arg_utils.py | 4 ++-- vllm/executor/gpu_executor.py | 1 - vllm/spec_decode/multi_step_worker.py | 3 ++- vllm/spec_decode/single_tp_worker.py | 25 ++++++++++++------------- vllm/spec_decode/spec_decode_worker.py | 11 ++++++----- 8 files changed, 35 insertions(+), 34 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index aba0c1adda3b7..b7a63dd02f571 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -947,7 +947,10 @@ def create_draft_parallel_config( This is mostly a copy of the target parallel config. 
""" - _speculative_tensor_parallel_size = speculative_tensor_parallel_size or target_parallel_config.tensor_parallel_size + _speculative_tensor_parallel_size = ( + speculative_tensor_parallel_size + or target_parallel_config.tensor_parallel_size + ) draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2bdbbd2b80ddb..2764625f1b5ae 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -5,10 +5,6 @@ import torch from torch.distributed import ProcessGroup -from vllm.logger import init_logger - -logger = init_logger(__name__) - from .parallel_state import (get_world_group, get_cpu_world_group, get_pp_pynccl_communicator, get_tensor_model_parallel_group, @@ -16,6 +12,9 @@ get_tensor_model_parallel_world_size, get_tp_ca_communicator, get_tp_pynccl_communicator) +from vllm.logger import init_logger + +logger = init_logger(__name__) @dataclass diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 2203c3b633487..b98f416c6ab8f 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -99,13 +99,11 @@ def init_distributed_environment( backend=backend, init_method=distributed_init_method, world_size=world_size, - timeout=timedelta(seconds=10), rank=rank) global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP _DEVICE_WORLD_GROUP = torch.distributed.group.WORLD ranks = list(range(torch.distributed.get_world_size())) _CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks, - timeout=timedelta(seconds=10), backend="gloo") # set the local rank # local_rank is not available in torch ProcessGroup, @@ -186,8 +184,8 @@ def initialize_model_parallel( ranks = list( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) - group = torch.distributed.new_group(ranks, backend=backend, timeout=timedelta(seconds=10)) - cpu_group = torch.distributed.new_group(ranks, backend="gloo", timeout=timedelta(seconds=10)) + group = torch.distributed.new_group(ranks, backend=backend) + cpu_group = torch.distributed.new_group(ranks, backend="gloo") if rank in ranks: _TP_DEVICE_GROUP = group _TP_CPU_GROUP = cpu_group @@ -216,8 +214,8 @@ def initialize_model_parallel( "pipeline model parallel group is already initialized") for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group = torch.distributed.new_group(ranks, backend=backend, timeout=timedelta(seconds=10)) - cpu_group = torch.distributed.new_group(ranks, backend="gloo", timeout=timedelta(seconds=10)) + group = torch.distributed.new_group(ranks, backend=backend) + cpu_group = torch.distributed.new_group(ranks, backend="gloo") if rank in ranks: _PP_DEVICE_GROUP = group _PP_CPU_GROUP = cpu_group @@ -276,7 +274,8 @@ def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None old_tp_cpu_group = get_tensor_model_parallel_cpu_group() old_tp_pynccl_comm = get_tp_pynccl_communicator() old_tp_ca_comm = get_tp_ca_communicator() - global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP, _TP_DEVICE_GROUP, _TP_CPU_GROUP, _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR + global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP, _TP_DEVICE_GROUP, \ + _TP_CPU_GROUP, _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR _DEVICE_WORLD_GROUP = group _CPU_WORLD_GROUP = cpu_group _TP_DEVICE_GROUP = group diff --git a/vllm/engine/arg_utils.py 
b/vllm/engine/arg_utils.py index 80acc108241d2..7bf8f8b80f613 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,4 +1,3 @@ -import copy import argparse import dataclasses import json @@ -676,7 +675,8 @@ def create_engine_config(self, ) -> EngineConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, - speculative_tensor_parallel_size = self.speculative_tensor_parallel_size, + speculative_tensor_parallel_size = \ + self.speculative_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 19fda41a3ded9..3ad201f4757ec 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -32,7 +32,6 @@ def _get_worker_kwargs( if distributed_init_method is None: distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - logger.info(f"spec_tp: {self.speculative_config.draft_parallel_config.tensor_parallel_size}") return dict( model_config=self.model_config, parallel_config=self.parallel_config, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 6236d0b07f6d8..309f6f3a3608e 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 2bb14f215fcb3..109a53557f8fe 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -1,20 +1,17 @@ -import copy -import weakref from typing import List, Tuple, Set, Optional, Union from datetime import timedelta import torch import torch.distributed -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker from vllm.lora.request import LoRARequest -from vllm.distributed.parallel_state import patch_tensor_parallel_group, _ENABLE_CUSTOM_ALL_REDUCE +from vllm.distributed.parallel_state import (patch_tensor_parallel_group, + _ENABLE_CUSTOM_ALL_REDUCE) from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -40,8 +37,9 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, target_tp = target_parallel_config.tensor_parallel_size logger.info(f"{target_tp=}, {draft_tp=}") if draft_tp > target_tp: - raise ValueError("{cls} only supports draft_tp smaller than target_tp." - f"{draft_tp=} {target_tp=}") + raise ValueError( + f"{cls} only supports draft_tp smaller than target_tp." 
+ f"{draft_tp=} {target_tp=}") ranks = list(range(draft_tp)) @@ -53,7 +51,7 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, logger.info(f"Wrapping {type(worker)} in {cls}") return cls(worker, ranks, local_rank) else: - logger.info(f"dummy worker that would not participate in draft generation") + logger.info(f"Returning dummy worker") return DummyProposerWorker(worker) def __init__( @@ -86,14 +84,15 @@ def init_device(self): ranks=self._ranks, timeout=timedelta(seconds=10), backend="gloo") if len(self._ranks) > 1: - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.device_communicators.pynccl \ + import PyNcclCommunicator self._tp_pynccl_comm = PyNcclCommunicator( group=self._tp_cpu_group, device=self._local_rank, ) if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) + from vllm.distributed.device_communicators.custom_all_reduce \ + import CustomAllreduce self._tp_ca_comm = CustomAllreduce( group=self._tp_cpu_group, device=self._local_rank, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index d7da7ca0d7c03..015a19b9f34d0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -47,8 +47,6 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": #load_config=load_config, ) - logger.info(f"spec_tp: {speculative_config.draft_parallel_config.tensor_parallel_size}") - spec_decode_worker = SpecDecodeWorker.create_worker( scorer_worker=target_worker, draft_worker_kwargs=draft_worker_kwargs, @@ -108,9 +106,12 @@ def create_worker( ngram_prompt_lookup_max) else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) - - #TODO: support NGramWorker - proposer_worker = SingleTpWorker.maybe_wrap_worker(proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.rank, scorer_worker.local_rank) + + proposer_worker = SingleTpWorker.maybe_wrap_worker( + proposer_worker, draft_worker_kwargs['parallel_config'], + scorer_worker.parallel_config, + scorer_worker.rank, scorer_worker.local_rank + ) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 756442a5ae41d9000af718c86af8119c27594c7a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 15:56:20 +0900 Subject: [PATCH 015/126] return type fix --- vllm/spec_decode/single_tp_worker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 109a53557f8fe..98ef4432ea29a 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -201,31 +201,31 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, ) -> Tuple[List[SamplerOutput], bool]: - return None + return [], True def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, ) -> SpeculativeProposals: - return None + return SpeculativeProposals(None, None, None) def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - return None + return [] def get_cache_block_size_bytes(self) -> int: return 0 def add_lora(self, lora_request: LoRARequest) -> bool: - pass + return False def remove_lora(self, lora_id: int) -> bool: - pass + return False def list_loras(self) -> Set[int]: - pass + return set() @property def max_model_len(self) -> int: From 
32094f15ac3fea57735c54f9d2ba1bf185eada21 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 16:13:08 +0900 Subject: [PATCH 016/126] cleanup --- vllm/config.py | 1 - vllm/distributed/communication_op.py | 3 ++- vllm/distributed/parallel_state.py | 18 +++++++++--------- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/single_tp_worker.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b7a63dd02f571..d5a84d645e3ef 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,3 @@ -import copy import enum import json from dataclasses import dataclass, field, fields diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2764625f1b5ae..c5cd7388dc860 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -6,7 +6,8 @@ import torch from torch.distributed import ProcessGroup -from .parallel_state import (get_world_group, get_cpu_world_group, get_pp_pynccl_communicator, +from .parallel_state import (get_world_group, get_cpu_world_group, + get_pp_pynccl_communicator, get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b98f416c6ab8f..c93df1aab7583 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -4,7 +4,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Tensor and pipeline parallel groups.""" from typing import List, Optional -from datetime import timedelta import contextlib import torch @@ -190,7 +189,8 @@ def initialize_model_parallel( _TP_DEVICE_GROUP = group _TP_CPU_GROUP = cpu_group - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.device_communicators.pynccl \ + import PyNcclCommunicator if tensor_model_parallel_size > 1: _TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( group=_TP_CPU_GROUP, @@ -199,8 +199,8 @@ def initialize_model_parallel( # Initialize a custom fast all-reduce implementation. 
if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) + from vllm.distributed.device_communicators.custom_all_reduce \ + import CustomAllreduce _TP_CA_COMMUNICATOR = CustomAllreduce( group=_TP_CPU_GROUP, device=_LOCAL_RANK, @@ -261,13 +261,13 @@ def model_parallel_is_initialized(): return (_TP_DEVICE_GROUP is not None and _PP_DEVICE_GROUP is not None) -override = False +OVERRIDE_TP_STATE = False @contextlib.contextmanager def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None): - global override - assert not override, "should not override during override" - override = True + global OVERRIDE_TP_STATE + assert not OVERRIDE_TP_STATE, "should not override during override" + OVERRIDE_TP_STATE = True old_world_group = get_world_group() old_world_cpu_group = get_cpu_world_group() old_tp_group = get_tensor_model_parallel_group() @@ -285,7 +285,7 @@ def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None try: yield finally: - override = False + OVERRIDE_TP_STATE = False _DEVICE_WORLD_GROUP = old_world_group _CPU_WORLD_GROUP = old_world_cpu_group _TP_DEVICE_GROUP = old_tp_group diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 309f6f3a3608e..75ecfc2811891 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -75,7 +75,7 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs = [] + model_outputs: List[SamplerOutput] = [] for i in range(sample_len): logger.info(f"Driver runs multiple draft steps. {i+1}/{sample_len}") model_output = super().execute_model( diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index 98ef4432ea29a..b83ef14c33b28 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -85,7 +85,7 @@ def init_device(self): if len(self._ranks) > 1: from vllm.distributed.device_communicators.pynccl \ - import PyNcclCommunicator + import PyNcclCommunicator self._tp_pynccl_comm = PyNcclCommunicator( group=self._tp_cpu_group, device=self._local_rank, From 7890191005c0cf31fed407901c35053e0ec94430 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 16:40:12 +0900 Subject: [PATCH 017/126] cleanup --- vllm/distributed/parallel_state.py | 4 ++- vllm/spec_decode/single_tp_worker.py | 37 +++++++++++--------------- vllm/spec_decode/spec_decode_worker.py | 4 +-- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c93df1aab7583..b9245435a79b8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -266,7 +266,9 @@ def model_parallel_is_initialized(): @contextlib.contextmanager def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None): global OVERRIDE_TP_STATE - assert not OVERRIDE_TP_STATE, "should not override during override" + if OVERRIDE_TP_STATE: + return + OVERRIDE_TP_STATE = True old_world_group = get_world_group() old_world_cpu_group = get_cpu_world_group() diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/single_tp_worker.py index b83ef14c33b28..96b2d2b7d95dd 100644 --- a/vllm/spec_decode/single_tp_worker.py +++ b/vllm/spec_decode/single_tp_worker.py @@ -18,39 +18,42 @@ logger = init_logger(__name__) -class SingleTpWorker(ProposerWorkerBase): - """Class which allows a speculative draft model to run with tensor 
parallel - degree of 1, while target model runs with larger tensor parallel degree. - This reduces the overhead of small draft models. +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. This is implemented by changing vLLM's tensor parallel group to a group of - size 1 during forward passes. + size temporarily during forward passes of draft models. """ @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, target_parallel_config: ParallelConfig, rank: int, local_rank: int): - """Wrap the worker in a SingleTpWorker if necessary. + """Wrap the worker in a SmallerTpProposerWorker if necessary. """ draft_tp = draft_parallel_config.tensor_parallel_size target_tp = target_parallel_config.tensor_parallel_size logger.info(f"{target_tp=}, {draft_tp=}") + + if draft_tp == target_tp: + return worker + if draft_tp > target_tp: raise ValueError( f"{cls} only supports draft_tp smaller than target_tp." f"{draft_tp=} {target_tp=}") + # gpu ranks that will generate draft tokens together ranks = list(range(draft_tp)) - if draft_tp == target_tp: - return worker - logger.info(f"{rank=}, {ranks=}") if rank in ranks: logger.info(f"Wrapping {type(worker)} in {cls}") return cls(worker, ranks, local_rank) else: + # for workers not participating in the draft generation logger.info(f"Returning dummy worker") return DummyProposerWorker(worker) @@ -73,10 +76,10 @@ def _patch_tensor_parallel_group(self): self._tp_pynccl_comm, self._tp_ca_comm) def init_device(self): - """Initialize the model on all ranks. + """Initialize the device. - This also creates a single-rank process group containing only the - self process. + This also creates an additional tensor-parallel process group containing + only a subset of the whole ranks. """ self._tp_group = torch.distributed.new_group( ranks=self._ranks, timeout=timedelta(seconds=10)) @@ -107,37 +110,29 @@ def set_include_gpu_probs_tensor(self): self._worker.set_include_gpu_probs_tensor() def load_model(self): - logger.info("SingleTPWorker.load_model()") with self._patch_tensor_parallel_group(): self._worker.load_model() def determine_num_available_blocks(self): - """Profile the model on all ranks. - """ with self._patch_tensor_parallel_group(): return self._worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): - """Initialize the cache engine on all ranks. - """ with self._patch_tensor_parallel_group(): self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - @torch.inference_mode() def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, ) -> Tuple[List[SamplerOutput], bool]: + # it's called after tp_group has already been overriden return self._worker.sampler_output(execute_model_req, sample_len) def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. 
- """ with self._patch_tensor_parallel_group(): return self._worker.get_spec_proposals(execute_model_req) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 015a19b9f34d0..eab6a59ac5f88 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -13,7 +13,7 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.single_tp_worker import SingleTpWorker +from vllm.spec_decode.single_tp_worker import SmallerTpProposerWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, @@ -107,7 +107,7 @@ def create_worker( else: proposer_worker = MultiStepWorker(**draft_worker_kwargs) - proposer_worker = SingleTpWorker.maybe_wrap_worker( + proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( proposer_worker, draft_worker_kwargs['parallel_config'], scorer_worker.parallel_config, scorer_worker.rank, scorer_worker.local_rank From 53b2ea9294e4fbf9a583b6fbc959738c6d5994e5 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 16:46:58 +0900 Subject: [PATCH 018/126] typo --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7bf8f8b80f613..6181e64e486f3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -528,7 +528,7 @@ def add_cli_args( default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') - parser.add_argument('--speculatvie-tensor-parallel-size', + parser.add_argument('--speculative-tensor-parallel-size', '-spec-tp', type=int, default=EngineArgs.speculative_tensor_parallel_size, From a29c9c524771a709bd7cc00cb2f3eb161257c020 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:04:43 +0900 Subject: [PATCH 019/126] verify arg --- vllm/config.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d5a84d645e3ef..4a15a34c8cf41 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -946,14 +946,21 @@ def create_draft_parallel_config( This is mostly a copy of the target parallel config. """ - _speculative_tensor_parallel_size = ( + speculative_tensor_parallel_size = ( speculative_tensor_parallel_size or target_parallel_config.tensor_parallel_size ) + if speculative_tensor_parallel_size > \ + target_parallel_config.tensor_parallel_size: + raise ValueError( + f"{speculative_tensor_parallel_size=} cannot be" + f"larger than {target_parallel_config.tensor_parallel_size}" + ) + draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, - tensor_parallel_size=_speculative_tensor_parallel_size, + tensor_parallel_size=speculative_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. 
From 52ba09d73f0dbd03ca9bdcaec04975be9731ea10 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:15:30 +0900 Subject: [PATCH 020/126] remove testing code --- vllm/engine/arg_utils.py | 4 ++-- vllm/worker/worker.py | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6181e64e486f3..4d8c953159149 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -96,8 +96,8 @@ class EngineArgs: num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None - ngram_prompt_lookup_max: Optional[int] = 4 - ngram_prompt_lookup_min: Optional[int] = 3 + ngram_prompt_lookup_max: Optional[int] = None + ngram_prompt_lookup_min: Optional[int] = None qlora_adapter_name_or_path: Optional[str] = None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8d6468683e706..b7db883324da6 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -20,9 +20,6 @@ from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import ModelRunner from vllm.worker.worker_base import WorkerBase -from vllm.logger import init_logger - -logger = init_logger(__name__) class Worker(WorkerBase): @@ -230,11 +227,9 @@ def execute_model( execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[Union[SamplerOutput, PoolerOutput]]: if not self.is_driver_worker: - logger.info("Worker.execute_model()") self._execute_model_non_driver() return [] - logger.info("Driver. Worker.execute_model()") if execute_model_req is None: # This signals that there's no more requests to process for now. # All workers are running infinite loop with broadcast_tensor_dict, @@ -274,7 +269,6 @@ def execute_model( if num_seq_groups == 0: return [] - logger.info("Worker.model_runner.execute_model()") output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) From d26ef085beb379de375dc6bf92d6ad3f6b749465 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:27:25 +0900 Subject: [PATCH 021/126] cleanup --- vllm/config.py | 4 ++-- vllm/distributed/communication_op.py | 4 ---- vllm/distributed/parallel_state.py | 2 -- vllm/engine/llm_engine.py | 2 -- vllm/spec_decode/multi_step_worker.py | 8 ++------ 5 files changed, 4 insertions(+), 16 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4a15a34c8cf41..a51d63c583f47 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -892,7 +892,7 @@ def maybe_create_spec_config( draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( target_parallel_config, speculative_tensor_parallel_size)) - + return SpeculativeConfig( draft_model_config, draft_parallel_config, @@ -943,7 +943,7 @@ def create_draft_parallel_config( speculative_tensor_parallel_size: Optional[int]) -> ParallelConfig: """Create a parallel config for use by the draft worker. - This is mostly a copy of the target parallel config. + This is mostly a copy of the target parallel config, except the tp_size. 
""" speculative_tensor_parallel_size = ( diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index c5cd7388dc860..b18a338e84a31 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -13,9 +13,6 @@ get_tensor_model_parallel_world_size, get_tp_ca_communicator, get_tp_pynccl_communicator) -from vllm.logger import init_logger - -logger = init_logger(__name__) @dataclass @@ -247,7 +244,6 @@ def broadcast_tensor_dict( group = group or get_world_group() metadata_group = metadata_group or get_cpu_world_group() ranks = torch.distributed.get_process_group_ranks(group) - logger.info(f"broadcast_tensor_dict. src: {src}, ranks: {ranks}") assert src in ranks, f"Invalid src rank ({src})" rank = torch.distributed.get_rank() diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b9245435a79b8..c3e5a0eab3f8b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -124,8 +124,6 @@ def init_distributed_environment( if torch.cuda.is_available(): torch.cuda.synchronize() del data - else: - logger.info("torch.distributed has already been initialized") def initialize_model_parallel( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 87dcc6fbb6b97..cb5893e707c8b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -355,8 +355,6 @@ def from_engine_args( from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor - logger.info(f"engine_config.to_dict(): {engine_config.to_dict()}") - # Create the LLM engine. engine = cls( **engine_config.to_dict(), diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 75ecfc2811891..8e57e5749c9c1 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -11,9 +11,6 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker -from vllm.logger import init_logger - -logger = init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): @@ -75,9 +72,8 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] - for i in range(sample_len): - logger.info(f"Driver runs multiple draft steps. 
{i+1}/{sample_len}") + model_outputs = [] + for _ in range(sample_len): model_output = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 From 80c499494e2973d19bc232976dc0533c2b4cc272 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:28:17 +0900 Subject: [PATCH 022/126] rename module --- .../{single_tp_worker.py => smaller_tp_proposer_worker.py} | 0 vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename vllm/spec_decode/{single_tp_worker.py => smaller_tp_proposer_worker.py} (100%) diff --git a/vllm/spec_decode/single_tp_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py similarity index 100% rename from vllm/spec_decode/single_tp_worker.py rename to vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index eab6a59ac5f88..b3005a5d767e2 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -13,7 +13,7 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.single_tp_worker import SmallerTpProposerWorker +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.util import (create_sequence_group_output, From 0f16f3f60d8e51411a03e9cbb44590917f07d946 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:32:29 +0900 Subject: [PATCH 023/126] cleanup --- vllm/spec_decode/spec_decode_worker.py | 4 +--- vllm/worker/worker.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b3005a5d767e2..ee15f27643901 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -88,7 +88,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: Worker, + scorer_worker: WorkerBase, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: Optional[int], ) -> "SpecDecodeWorker": @@ -292,10 +292,8 @@ def execute_model( if num_lookahead_slots == 0 or len( execute_model_req.seq_group_metadata_list ) == 0 or disable_all_speculation: - logger.info("prefill step") return self._run_no_spec(execute_model_req, skip_proposer=disable_all_speculation) - logger.info("decoding step") return self._run_speculative_decoding_step(execute_model_req, num_lookahead_slots) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b7db883324da6..a31de02633607 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -43,7 +43,6 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, # TODO: remove? 
is_driver_worker: bool = False, ) -> None: self.model_config = model_config From 140f478cdbf069e903564371c46e94cf783dbf36 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:38:19 +0900 Subject: [PATCH 024/126] cleanup --- vllm/spec_decode/smaller_tp_proposer_worker.py | 6 +----- vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 96b2d2b7d95dd..4fdf73e6f49b6 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -48,13 +48,12 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, # gpu ranks that will generate draft tokens together ranks = list(range(draft_tp)) - logger.info(f"{rank=}, {ranks=}") if rank in ranks: logger.info(f"Wrapping {type(worker)} in {cls}") return cls(worker, ranks, local_rank) else: # for workers not participating in the draft generation - logger.info(f"Returning dummy worker") + logger.info("Returning dummy worker") return DummyProposerWorker(worker) def __init__( @@ -179,9 +178,6 @@ def __init__( def init_device(self): pass - def set_include_gpu_probs_tensor(self): - pass - def load_model(self): pass diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index ee15f27643901..17100c6dd8814 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -88,7 +88,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: WorkerBase, + scorer_worker: Worker, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: Optional[int], ) -> "SpecDecodeWorker": From 3fd7e91fa1828e4f2b0eb599aaf323875f0ec8a6 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:41:34 +0900 Subject: [PATCH 025/126] remove unnecessary methods --- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 4fdf73e6f49b6..26816cd3ea6b1 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -218,10 +218,6 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return set() - @property - def max_model_len(self) -> int: - return self._worker.max_model_len - @property def vocab_size(self) -> int: return self._worker.vocab_size From 495aa308340c150cad80690d24ee548a0e52f263 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:44:54 +0900 Subject: [PATCH 026/126] fix --- vllm/worker/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a31de02633607..10411a2bf7a10 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -43,6 +43,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config From 3a5a47fbefe837eb58ddabcd3039b1f31f8f861d Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:53:32 +0900 Subject: [PATCH 027/126] undo unrelated changes --- vllm/distributed/parallel_state.py | 16 ++++++++-------- vllm/spec_decode/smaller_tp_proposer_worker.py | 16 
+++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c3e5a0eab3f8b..6bae3ea056832 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -195,14 +195,14 @@ def initialize_model_parallel( device=_LOCAL_RANK, ) - # Initialize a custom fast all-reduce implementation. - if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce \ - import CustomAllreduce - _TP_CA_COMMUNICATOR = CustomAllreduce( - group=_TP_CPU_GROUP, - device=_LOCAL_RANK, - ) + # Initialize a custom fast all-reduce implementation. + if _ENABLE_CUSTOM_ALL_REDUCE: + from vllm.distributed.device_communicators.custom_all_reduce \ + import CustomAllreduce + _TP_CA_COMMUNICATOR = CustomAllreduce( + group=_TP_CPU_GROUP, + device=_LOCAL_RANK, + ) # Build the pipeline model-parallel groups. global _PP_DEVICE_GROUP, _PP_CPU_GROUP diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 26816cd3ea6b1..fdc31f84b0db0 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -92,15 +92,13 @@ def init_device(self): group=self._tp_cpu_group, device=self._local_rank, ) - if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce \ - import CustomAllreduce - self._tp_ca_comm = CustomAllreduce( - group=self._tp_cpu_group, - device=self._local_rank, - ) - - logger.info(f"init_device. ranks: {self._ranks}") + if _ENABLE_CUSTOM_ALL_REDUCE: + from vllm.distributed.device_communicators.custom_all_reduce \ + import CustomAllreduce + self._tp_ca_comm = CustomAllreduce( + group=self._tp_cpu_group, + device=self._local_rank, + ) with self._patch_tensor_parallel_group(): self._worker.init_device() From 07ddbb87881e5beef33f30ee1b23c25551e86a73 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 17:55:05 +0900 Subject: [PATCH 028/126] minor fix --- vllm/distributed/parallel_state.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6bae3ea056832..f54ff17c8409d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -262,7 +262,10 @@ def model_parallel_is_initialized(): OVERRIDE_TP_STATE = False @contextlib.contextmanager -def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, ca_comm=None): +def patch_tensor_parallel_group(group, + cpu_group, + pynccl_comm=None, + ca_comm=None): global OVERRIDE_TP_STATE if OVERRIDE_TP_STATE: return From b0a677d623eeebd4df5f34c2cbb76f8cb2ee7dc8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:01:32 +0900 Subject: [PATCH 029/126] fix ruff errors --- vllm/distributed/parallel_state.py | 4 ++-- vllm/spec_decode/smaller_tp_proposer_worker.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f54ff17c8409d..1f002ff3a8a76 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -3,9 +3,9 @@ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""Tensor and pipeline parallel groups.""" +from contextlib import contextmanager from typing import List, Optional -import contextlib import torch from torch.distributed import ProcessGroup @@ -261,7 +261,7 @@ def model_parallel_is_initialized(): OVERRIDE_TP_STATE = False -@contextlib.contextmanager +@contextmanager def patch_tensor_parallel_group(group, cpu_group, pynccl_comm=None, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index fdc31f84b0db0..fba2d9b22bca7 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -35,7 +35,6 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, """ draft_tp = draft_parallel_config.tensor_parallel_size target_tp = target_parallel_config.tensor_parallel_size - logger.info(f"{target_tp=}, {draft_tp=}") if draft_tp == target_tp: return worker @@ -49,7 +48,7 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, ranks = list(range(draft_tp)) if rank in ranks: - logger.info(f"Wrapping {type(worker)} in {cls}") + logger.info("Wrapping {%s} in {%s}", type(worker), cls) return cls(worker, ranks, local_rank) else: # for workers not participating in the draft generation From 9998b9c6f63dcb1c6adca97b2d2bdedfcd727f69 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:10:35 +0900 Subject: [PATCH 030/126] typo --- vllm/spec_decode/smaller_tp_proposer_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index fba2d9b22bca7..f59e0479714bd 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -122,7 +122,7 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, ) -> Tuple[List[SamplerOutput], bool]: - # it's called after tp_group has already been overriden + # it's called after tp_group has already been overridden return self._worker.sampler_output(execute_model_req, sample_len) def get_spec_proposals( From e92ecdce24e84917fe8339fe7d9e96f324eaf599 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:12:04 +0900 Subject: [PATCH 031/126] temporal fix --- vllm/spec_decode/smaller_tp_proposer_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index f59e0479714bd..8b13c695e94fb 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -117,6 +117,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): with self._patch_tensor_parallel_group(): self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + @torch.inference_mode() def sampler_output( self, execute_model_req: ExecuteModelRequest, From b4216074d7ce4eb8f2802996107dab5b80639a8a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:16:51 +0900 Subject: [PATCH 032/126] formatting --- vllm/config.py | 13 ++++++------- vllm/distributed/communication_op.py | 12 +++++------- vllm/distributed/parallel_state.py | 1 + vllm/engine/arg_utils.py | 13 +++++++------ vllm/spec_decode/smaller_tp_proposer_worker.py | 15 ++++++--------- vllm/spec_decode/spec_decode_worker.py | 5 ++--- 6 files changed, 27 insertions(+), 32 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d83fa1304fcfa..c89611cc94c17 100644 --- 
a/vllm/config.py +++ b/vllm/config.py @@ -904,8 +904,8 @@ def maybe_create_spec_config( )) draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_tensor_parallel_size)) + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, speculative_tensor_parallel_size)) return SpeculativeConfig( draft_model_config, @@ -962,18 +962,17 @@ def create_draft_parallel_config( speculative_tensor_parallel_size = ( speculative_tensor_parallel_size - or target_parallel_config.tensor_parallel_size - ) + or target_parallel_config.tensor_parallel_size) if speculative_tensor_parallel_size > \ target_parallel_config.tensor_parallel_size: raise ValueError( f"{speculative_tensor_parallel_size=} cannot be" - f"larger than {target_parallel_config.tensor_parallel_size}" - ) + f"larger than {target_parallel_config.tensor_parallel_size}") draft_parallel_config = ParallelConfig( - pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, + pipeline_parallel_size=target_parallel_config. + pipeline_parallel_size, tensor_parallel_size=speculative_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index b18a338e84a31..1fb74e3b37dc6 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -6,13 +6,11 @@ import torch from torch.distributed import ProcessGroup -from .parallel_state import (get_world_group, get_cpu_world_group, - get_pp_pynccl_communicator, - get_tensor_model_parallel_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - get_tp_ca_communicator, - get_tp_pynccl_communicator) +from .parallel_state import ( + get_world_group, get_cpu_world_group, get_pp_pynccl_communicator, + get_tensor_model_parallel_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, get_tp_ca_communicator, + get_tp_pynccl_communicator) @dataclass diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 4740e33328377..d7c64e21ae5c0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -262,6 +262,7 @@ def model_parallel_is_initialized(): OVERRIDE_TP_STATE = False + @contextlib.contextmanager def patch_tensor_parallel_group(group, cpu_group, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6a2b156bca7f1..c39d99587469d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -535,12 +535,13 @@ def add_cli_args( default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') - parser.add_argument('--speculative-tensor-parallel-size', - '-spec-tp', - type=int, - default=EngineArgs.speculative_tensor_parallel_size, - help='Number of tensor parallel replicas for ' - 'the draft model in speculative decoding.') + parser.add_argument( + '--speculative-tensor-parallel-size', + '-spec-tp', + type=int, + default=EngineArgs.speculative_tensor_parallel_size, + help='Number of tensor parallel replicas for ' + 'the draft model in speculative decoding.') parser.add_argument( '--speculative-max-model-len', diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8b13c695e94fb..b993865a66076 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py 
@@ -29,8 +29,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase): @classmethod def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, - target_parallel_config: ParallelConfig, - rank: int, local_rank: int): + target_parallel_config: ParallelConfig, rank: int, + local_rank: int): """Wrap the worker in a SmallerTpProposerWorker if necessary. """ draft_tp = draft_parallel_config.tensor_parallel_size @@ -55,12 +55,8 @@ def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, logger.info("Returning dummy worker") return DummyProposerWorker(worker) - def __init__( - self, - worker: Union[Worker, ProposerWorkerBase], - ranks: List[int], - local_rank: int - ): + def __init__(self, worker: Union[Worker, ProposerWorkerBase], + ranks: List[int], local_rank: int): self._worker = worker self._ranks = ranks self._local_rank = local_rank @@ -71,7 +67,8 @@ def __init__( def _patch_tensor_parallel_group(self): return patch_tensor_parallel_group(self._tp_group, self._tp_cpu_group, - self._tp_pynccl_comm, self._tp_ca_comm) + self._tp_pynccl_comm, + self._tp_ca_comm) def init_device(self): """Initialize the device. diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index af797c1d6e0af..8f0040c1278e1 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -110,9 +110,8 @@ def create_worker( proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( proposer_worker, draft_worker_kwargs['parallel_config'], - scorer_worker.parallel_config, - scorer_worker.rank, scorer_worker.local_rank - ) + scorer_worker.parallel_config, scorer_worker.rank, + scorer_worker.local_rank) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 386ab9bc6ec8f02dc1e5481b26bdcd7052c7d832 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:23:12 +0900 Subject: [PATCH 033/126] isort --- vllm/distributed/communication_op.py | 11 +++++----- vllm/distributed/parallel_state.py | 7 +++---- .../spec_decode/smaller_tp_proposer_worker.py | 21 +++++++++---------- vllm/spec_decode/spec_decode_worker.py | 2 +- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 1fb74e3b37dc6..bc815d3a9cf4c 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -6,11 +6,12 @@ import torch from torch.distributed import ProcessGroup -from .parallel_state import ( - get_world_group, get_cpu_world_group, get_pp_pynccl_communicator, - get_tensor_model_parallel_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, get_tp_ca_communicator, - get_tp_pynccl_communicator) +from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_ca_communicator, + get_tp_pynccl_communicator, get_world_group) @dataclass diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d7c64e21ae5c0..629f7eb2e131e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -188,8 +188,7 @@ def initialize_model_parallel( _TP_DEVICE_GROUP = group _TP_CPU_GROUP = cpu_group - from vllm.distributed.device_communicators.pynccl \ - import PyNcclCommunicator + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator if tensor_model_parallel_size > 1: 
_TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator( group=_TP_CPU_GROUP, @@ -198,8 +197,8 @@ def initialize_model_parallel( # Initialize a custom fast all-reduce implementation. if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce \ - import CustomAllreduce + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) _TP_CA_COMMUNICATOR = CustomAllreduce( group=_TP_CPU_GROUP, device=_LOCAL_RANK, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index b993865a66076..b1133bef1bac6 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,19 +1,18 @@ -from typing import List, Tuple, Set, Optional, Union from datetime import timedelta +from typing import List, Optional, Set, Tuple, Union import torch import torch.distributed +from vllm.config import ParallelConfig +from vllm.distributed.parallel_state import (_ENABLE_CUSTOM_ALL_REDUCE, + patch_tensor_parallel_group) +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.worker.worker import Worker -from vllm.lora.request import LoRARequest - -from vllm.distributed.parallel_state import (patch_tensor_parallel_group, - _ENABLE_CUSTOM_ALL_REDUCE) -from vllm.config import ParallelConfig -from vllm.logger import init_logger logger = init_logger(__name__) @@ -82,15 +81,15 @@ def init_device(self): ranks=self._ranks, timeout=timedelta(seconds=10), backend="gloo") if len(self._ranks) > 1: - from vllm.distributed.device_communicators.pynccl \ - import PyNcclCommunicator + from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator) self._tp_pynccl_comm = PyNcclCommunicator( group=self._tp_cpu_group, device=self._local_rank, ) if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce \ - import CustomAllreduce + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) self._tp_ca_comm = CustomAllreduce( group=self._tp_cpu_group, device=self._local_rank, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8f0040c1278e1..2363bcd23bef4 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -14,9 +14,9 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.metrics import AsyncMetricsCollector from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_all_seq_ids, get_sampled_token_logprobs, nvtx_range, From b25f74e966af04eef9e5c358d2575842f429beb9 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 12 Jun 2024 18:26:49 +0900 Subject: [PATCH 034/126] line length --- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 
b1133bef1bac6..ea3a4adf91256 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -88,8 +88,8 @@ def init_device(self): device=self._local_rank, ) if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) + from vllm.distributed.device_communicators.custom_all_reduce \ + import CustomAllreduce self._tp_ca_comm = CustomAllreduce( group=self._tp_cpu_group, device=self._local_rank, From 8b51f08ae4e24fb7acd6745e447e0b35c561a519 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 09:52:20 +0900 Subject: [PATCH 035/126] fix --- vllm/spec_decode/smaller_tp_proposer_worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index ea3a4adf91256..63fef310c4974 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -80,16 +80,18 @@ def init_device(self): self._tp_cpu_group = torch.distributed.new_group( ranks=self._ranks, timeout=timedelta(seconds=10), backend="gloo") + from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator) if len(self._ranks) > 1: - from vllm.distributed.device_communicators.pynccl import ( - PyNcclCommunicator) self._tp_pynccl_comm = PyNcclCommunicator( group=self._tp_cpu_group, device=self._local_rank, ) + + + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) if _ENABLE_CUSTOM_ALL_REDUCE: - from vllm.distributed.device_communicators.custom_all_reduce \ - import CustomAllreduce self._tp_ca_comm = CustomAllreduce( group=self._tp_cpu_group, device=self._local_rank, From dfc90cbb0bea35f04acbecead03e77b940450d9c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:39:07 +0900 Subject: [PATCH 036/126] line length --- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 62a7eb0d796b8..853e6c42a64b8 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,4 +1,3 @@ -from datetime import timedelta from typing import List, Optional, Set, Tuple, Union import torch @@ -72,7 +71,8 @@ def init_device(self): only a subset of the whole ranks. 
""" local_rank = get_world_group().local_rank - world_backend = torch.distributed.get_backend(get_world_group().device_group) + world_backend = torch.distributed.get_backend(get_world_group() + .device_group) tp_backend = torch.distributed.get_backend(get_tp_group().device_group) self._world_group = GroupCoordinator( From 9bef5e4388bd5cb523c3517fb4ae264a7073254e Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:43:42 +0900 Subject: [PATCH 037/126] comment --- vllm/spec_decode/smaller_tp_proposer_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 853e6c42a64b8..96b6d7fe1b5b2 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -114,7 +114,8 @@ def sampler_output( execute_model_req: ExecuteModelRequest, sample_len: int, ) -> Tuple[List[SamplerOutput], bool]: - # it's called after tp_group has already been overridden + # do not call _parch_tensor_parallel_group, because + # it's always called after tp_group has already been overridden return self._worker.sampler_output(execute_model_req, sample_len) def get_spec_proposals( From 85d087da5a45572e15315ba8340422e32ce6f840 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:43:48 +0900 Subject: [PATCH 038/126] add type hint --- vllm/spec_decode/multi_step_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 8e57e5749c9c1..ffca818f4cb4d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -74,7 +74,7 @@ def sampler_output( # Run model sample_len times. 
model_outputs = [] for _ in range(sample_len): - model_output = super().execute_model( + model_output: List[SamplerOutput] = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 ), "composing multistep workers not supported" From 9af36b79df35b045941092f04bc85fc4cb405e0c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:45:02 +0900 Subject: [PATCH 039/126] isort --- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 96b6d7fe1b5b2..28ff0ab8ad451 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -5,8 +5,8 @@ from vllm.config import ParallelConfig from vllm.distributed.parallel_state import (_ENABLE_CUSTOM_ALL_REDUCE, - GroupCoordinator, - get_world_group, get_tp_group, + GroupCoordinator, get_tp_group, + get_world_group, patch_tensor_parallel_group) from vllm.logger import init_logger from vllm.lora.request import LoRARequest From 5a0bf45fdb4a2d24e3174213a7dc6bda5262d991 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:53:12 +0900 Subject: [PATCH 040/126] add more type hints --- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/smaller_tp_proposer_worker.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index ffca818f4cb4d..1e061a19ed2b5 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -72,7 +72,7 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs = [] + model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): model_output: List[SamplerOutput] = super().execute_model( execute_model_req=copied_execute_model_req) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 28ff0ab8ad451..9efe09c38a77a 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -116,7 +116,8 @@ def sampler_output( ) -> Tuple[List[SamplerOutput], bool]: # do not call _parch_tensor_parallel_group, because # it's always called after tp_group has already been overridden - return self._worker.sampler_output(execute_model_req, sample_len) + return Tuple[List[SamplerOutput], bool] (self._worker + .sampler_output(execute_model_req, sample_len)) def get_spec_proposals( self, From 531c9f0a36f0ce3525383cd1819b7f514c98ceb5 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:54:50 +0900 Subject: [PATCH 041/126] fix --- vllm/spec_decode/smaller_tp_proposer_worker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 9efe09c38a77a..5931f514d16fd 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -116,8 +116,9 @@ def sampler_output( ) -> Tuple[List[SamplerOutput], bool]: # do not call _parch_tensor_parallel_group, because # it's always called after tp_group has already been overridden - return Tuple[List[SamplerOutput], bool] (self._worker - .sampler_output(execute_model_req, sample_len)) + output = self._worker.sampler_output(execute_model_req, sample_len) + + return Tuple[List[SamplerOutput], bool](output) 
def get_spec_proposals( self, From 287da20da2bcb91de1777034dde99694aa75eee7 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:56:06 +0900 Subject: [PATCH 042/126] test --- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 5931f514d16fd..28ff0ab8ad451 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -116,9 +116,7 @@ def sampler_output( ) -> Tuple[List[SamplerOutput], bool]: # do not call _parch_tensor_parallel_group, because # it's always called after tp_group has already been overridden - output = self._worker.sampler_output(execute_model_req, sample_len) - - return Tuple[List[SamplerOutput], bool](output) + return self._worker.sampler_output(execute_model_req, sample_len) def get_spec_proposals( self, From 08d1b2a233b5c8db68144df6800fb8f1e8631b57 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 10:57:21 +0900 Subject: [PATCH 043/126] nit --- vllm/spec_decode/smaller_tp_proposer_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 28ff0ab8ad451..f4403b0aaf424 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -113,7 +113,7 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - ) -> Tuple[List[SamplerOutput], bool]: + ) -> Tuple[Optional[List[SamplerOutput]], bool]: # do not call _parch_tensor_parallel_group, because # it's always called after tp_group has already been overridden return self._worker.sampler_output(execute_model_req, sample_len) From 237c96671c95f2b3f2cb771292fc3e49718d4df1 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 11:03:25 +0900 Subject: [PATCH 044/126] fix yapf --- vllm/distributed/parallel_state.py | 4 ++-- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a1f9b6acf4cf6..8ca9571b53da3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -675,9 +675,9 @@ def model_parallel_is_initialized(): OVERRIDE_TP_STATE = False + @contextlib.contextmanager -def patch_tensor_parallel_group(world_group, - tp_group): +def patch_tensor_parallel_group(world_group, tp_group): global OVERRIDE_TP_STATE if OVERRIDE_TP_STATE: return diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index f4403b0aaf424..264b6a4d9a171 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -71,8 +71,8 @@ def init_device(self): only a subset of the whole ranks. 
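# A minimal illustrative sketch (not taken from these patches) of the
# delegation pattern SmallerTpProposerWorker uses around the wrapped draft
# worker: each pass-through method installs the smaller draft tp group,
# calls the wrapped worker, and lets the context manager restore the original
# groups on exit, e.g.
#
#     def load_model(self):
#         with self._patch_tensor_parallel_group():
#             self._worker.load_model()
#
# sampler_output() above is the deliberate exception: it is only reached from
# get_spec_proposals(), which has already patched the tp group, so patching
# again is unnecessary (the OVERRIDE_TP_STATE flag guards against re-entry).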
""" local_rank = get_world_group().local_rank - world_backend = torch.distributed.get_backend(get_world_group() - .device_group) + world_backend = torch.distributed.get_backend( + get_world_group().device_group) tp_backend = torch.distributed.get_backend(get_tp_group().device_group) self._world_group = GroupCoordinator( From 0bb38c2c2bfa070a749316990306984e938e9a72 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 11:15:39 +0900 Subject: [PATCH 045/126] fix --- vllm/config.py | 2 +- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 76f75cb39032f..8f35dcf89ed5c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -971,7 +971,7 @@ def create_draft_parallel_config( if speculative_tensor_parallel_size > \ target_parallel_config.tensor_parallel_size: raise ValueError( - f"{speculative_tensor_parallel_size=} cannot be" + f"{speculative_tensor_parallel_size=} cannot be " f"larger than {target_parallel_config.tensor_parallel_size}") draft_parallel_config = ParallelConfig( diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 264b6a4d9a171..2757c7815a5ac 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -76,14 +76,14 @@ def init_device(self): tp_backend = torch.distributed.get_backend(get_tp_group().device_group) self._world_group = GroupCoordinator( - group_ranks=[[self._ranks]], + group_ranks=[self._ranks], local_rank=local_rank, torch_distributed_backend=world_backend, use_pynccl=False, use_custom_allreduce=False, ) self._tp_group = GroupCoordinator( - group_ranks=[[self._ranks]], + group_ranks=[self._ranks], local_rank=local_rank, torch_distributed_backend=tp_backend, use_pynccl=True, From c097d6c05cf8ceb98ca8fe42d34a2b0fccdeace6 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 15:27:44 +0900 Subject: [PATCH 046/126] fix --- vllm/spec_decode/smaller_tp_proposer_worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 2757c7815a5ac..7ce6723ab7906 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -90,8 +90,7 @@ def init_device(self): use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, ) - with self._patch_tensor_parallel_group(): - self._worker.init_device() + self._worker.init_device() def set_include_gpu_probs_tensor(self): self._worker.set_include_gpu_probs_tensor() From 957a325ca75bbbcbd41d499aa911cf7d626214db Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 13 Jun 2024 15:47:42 +0900 Subject: [PATCH 047/126] fix --- vllm/distributed/parallel_state.py | 6 +++++- vllm/spec_decode/smaller_tp_proposer_worker.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 8ca9571b53da3..44756d75c7b86 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -550,6 +550,10 @@ def init_distributed_environment( global _WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) + if world_size != -1: + assert world_size == len(ranks), ( + "given world_size does not match with world_size of torch") + _WORLD = GroupCoordinator( group_ranks=[ranks], local_rank=local_rank, @@ -558,7 +562,7 @@ def 
init_distributed_environment( use_custom_allreduce=False, ) else: - assert _WORLD.world_size == torch.distributed.get_world_size(), ( + assert _WORLD.world_size == world_size, ( "world group already initialized with a different world size") diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 7ce6723ab7906..2757c7815a5ac 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -90,7 +90,8 @@ def init_device(self): use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, ) - self._worker.init_device() + with self._patch_tensor_parallel_group(): + self._worker.init_device() def set_include_gpu_probs_tensor(self): self._worker.set_include_gpu_probs_tensor() From 8a8a1e465d4d847882c2c6e14ad87e4e586548de Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 10:19:04 +0900 Subject: [PATCH 048/126] add comments --- vllm/distributed/parallel_state.py | 4 +++- vllm/spec_decode/smaller_tp_proposer_worker.py | 6 +++--- vllm/spec_decode/spec_decode_worker.py | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 44756d75c7b86..8625eb9743fc4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -523,7 +523,7 @@ def init_distributed_environment( local_rank: int = -1, backend: str = "nccl", ): - logger.info( + logger.debug( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, distributed_init_method, backend) @@ -682,6 +682,7 @@ def model_parallel_is_initialized(): @contextlib.contextmanager def patch_tensor_parallel_group(world_group, tp_group): + """Patch the tp group temporarily until this function ends.""" global OVERRIDE_TP_STATE if OVERRIDE_TP_STATE: return @@ -695,6 +696,7 @@ def patch_tensor_parallel_group(world_group, tp_group): try: yield finally: + # restore the original state OVERRIDE_TP_STATE = False _WORLD = old_world_group _TP = old_tp_group diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 2757c7815a5ac..79b516bd1ec6e 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -134,9 +134,6 @@ def execute_model( return self._worker.execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: - """Return the size of a single cache block, in bytes. Used in - speculative decoding. - """ return self._worker.get_cache_block_size_bytes() def add_lora(self, lora_request: LoRARequest) -> bool: @@ -158,6 +155,9 @@ def vocab_size(self) -> int: class DummyProposerWorker(ProposerWorkerBase): + """Dummy proposer worker that do nothing. + It's for workers that do not participate in draft generation. 
+ """ def __init__( self, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 537656d706458..df2d65484363f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -292,6 +292,7 @@ def execute_model( ) == 0 or disable_all_speculation: return self._run_no_spec(execute_model_req, skip_proposer=disable_all_speculation) + return self._run_speculative_decoding_step(execute_model_req, num_lookahead_slots) From 7f06f6421b6e1d9f22261b2caa9d3991bb3f721a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 15:41:43 +0900 Subject: [PATCH 049/126] combine smaller_tp_worker logic into multi_step_worker --- vllm/spec_decode/multi_step_worker.py | 89 ++++++++++++++++++++++++-- vllm/spec_decode/spec_decode_worker.py | 20 ++++-- 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 1e061a19ed2b5..a5f9ec15a086b 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,9 +1,13 @@ import copy import weakref -from typing import List, Tuple +from typing import List, Tuple, Optional import torch +from vllm.distributed.parallel_state import (_ENABLE_CUSTOM_ALL_REDUCE, + GroupCoordinator, get_tp_group, + get_world_group, + patch_tensor_parallel_group) from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.interfaces import (SpeculativeProposals, @@ -25,14 +29,51 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, ranks, is_dummy, **kwargs): + self.is_dummy = is_dummy + if ranks is not None and not is_dummy: + self._ranks = ranks + self._tp_groups = None + + super().__init__(**kwargs) # Lazy initialization list. 
self._proposer: SpeculativeProposer + def _patch_tensor_parallel_group(self): + if self._tp_groups is not None: + return patch_tensor_parallel_group(self._tp_groups[0], + self._tp_groups[1]) + return None + def init_device(self): - super().init_device() + if self.is_dummy: + return + + local_rank = get_world_group().local_rank + world_backend = torch.distributed.get_backend( + get_world_group().device_group) + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + + world_group = GroupCoordinator( + group_ranks=[self._ranks], + local_rank=local_rank, + torch_distributed_backend=world_backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + tp_group = GroupCoordinator( + group_ranks=[self._ranks], + local_rank=local_rank, + torch_distributed_backend=tp_backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + self._tp_groups = world_group, tp_group + + with self._patch_tensor_parallel_group(): + super().init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] @@ -45,6 +86,21 @@ def set_include_gpu_probs_tensor(self): # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True + def load_model(self): + if not self.is_dummy: + with self._patch_tensor_parallel_group(): + super().load_model() + + def determine_num_available_blocks(self): + if not self.is_dummy: + with self._patch_tensor_parallel_group(): + return super().determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): + if not self.is_dummy: + with self._patch_tensor_parallel_group(): + super().initialize_cache(num_gpu_blocks, num_cpu_blocks) + @torch.inference_mode() def sampler_output( self, @@ -58,6 +114,9 @@ def sampler_output( For multi step worker, this indicator shall be True. """ + if not self.is_dummy: + return [], True + self._raise_if_unsupported(execute_model_req) # Shallow copy input data so modifications (such as appending tokens) @@ -93,8 +152,11 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" + if not self.is_dummy: + return SpeculativeProposals(None, None, None) - return self._proposer.get_spec_proposals(execute_model_req) + with self._patch_tensor_parallel_group(): + return self._proposer.get_spec_proposals(execute_model_req) @staticmethod def _append_new_tokens( @@ -207,3 +269,20 @@ def _raise_if_unsupported( execute_model_req.seq_group_metadata_list): raise NotImplementedError( "MultiStepWorker does not support beam search.") + + @torch.inference_mode() + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self.is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return super().execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self.is_dummy: + return 0 + + return super().get_cache_block_size_bytes() diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index df2d65484363f..db866a09a1b4a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import torch -from vllm.config import SpeculativeConfig +from vllm.config import SpeculativeConfig, ParallelConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -16,7 +16,6 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_all_seq_ids, get_sampled_token_logprobs, nvtx_range, @@ -106,11 +105,18 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: - proposer_worker = MultiStepWorker(**draft_worker_kwargs) - - proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( - proposer_worker, draft_worker_kwargs['parallel_config'], - scorer_worker.parallel_config, scorer_worker.rank) + draft_parallel_config: ParallelConfig = draft_worker_kwargs['parallel_config'] + draft_tp = draft_parallel_config.tensor_parallel_size + target_tp = scorer_worker.parallel_config.tensor_parallel_size + + draft_ranks = None + is_dummy = None + if target_tp != draft_tp: + # gpu ranks that will generate draft tokens + draft_ranks = list(range(draft_tp)) + is_dummy = scorer_worker.rank not in draft_ranks + proposer_worker = MultiStepWorker(draft_ranks, is_dummy, + **draft_worker_kwargs) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 1e87579b557a5ca4aea1180a38fe0b77b01337d8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:06:09 +0900 Subject: [PATCH 050/126] fix --- vllm/spec_decode/multi_step_worker.py | 63 ++++++++++++++++---------- vllm/spec_decode/spec_decode_worker.py | 10 +--- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index a5f9ec15a086b..a640ac3db6f78 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -15,6 +15,9 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker +from vllm.logger import init_logger + +logger = 
init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): @@ -29,11 +32,20 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, ranks, is_dummy, **kwargs): - self.is_dummy = is_dummy - if ranks is not None and not is_dummy: - self._ranks = ranks + def __init__(self, draft_ranks: Optional[List[int]], **kwargs): + """Create a MultiStepWorker. + + Args: + draft_ranks (Optional[List[int]]): if this value is given, only some of + the GPU ranks written in this value participaten in draft generation + """ + rank = kwargs['rank'] + self.is_dummy = rank not in draft_ranks + if draft_ranks is not None and not self.is_dummy: + self._ranks = draft_ranks self._tp_groups = None + logger.info(f"{self._ranks=}, {self._tp_groups=}") + logger.inf(f"{rank=}, {draft_ranks=}, {self.is_dummy=}") super().__init__(**kwargs) @@ -50,27 +62,28 @@ def init_device(self): if self.is_dummy: return - local_rank = get_world_group().local_rank - world_backend = torch.distributed.get_backend( - get_world_group().device_group) - tp_backend = torch.distributed.get_backend(get_tp_group().device_group) - - world_group = GroupCoordinator( - group_ranks=[self._ranks], - local_rank=local_rank, - torch_distributed_backend=world_backend, - use_pynccl=False, - use_custom_allreduce=False, - ) - tp_group = GroupCoordinator( - group_ranks=[self._ranks], - local_rank=local_rank, - torch_distributed_backend=tp_backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) - - self._tp_groups = world_group, tp_group + if self._ranks: + local_rank = get_world_group().local_rank + world_backend = torch.distributed.get_backend( + get_world_group().device_group) + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + + world_group = GroupCoordinator( + group_ranks=[self._ranks], + local_rank=local_rank, + torch_distributed_backend=world_backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + tp_group = GroupCoordinator( + group_ranks=[self._ranks], + local_rank=local_rank, + torch_distributed_backend=tp_backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + self._tp_groups = world_group, tp_group with self._patch_tensor_parallel_group(): super().init_device() diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index db866a09a1b4a..d44f841ea0d4f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -109,14 +109,8 @@ def create_worker( draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - draft_ranks = None - is_dummy = None - if target_tp != draft_tp: - # gpu ranks that will generate draft tokens - draft_ranks = list(range(draft_tp)) - is_dummy = scorer_worker.rank not in draft_ranks - proposer_worker = MultiStepWorker(draft_ranks, is_dummy, - **draft_worker_kwargs) + draft_ranks = list(range(draft_tp)) if target_tp != draft_tp else None + proposer_worker = MultiStepWorker(draft_ranks, **draft_worker_kwargs) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From abc546cdee15cf018b692ea10cc8c221f7155eeb Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:18:36 +0900 Subject: [PATCH 051/126] fix --- vllm/spec_decode/multi_step_worker.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py 
b/vllm/spec_decode/multi_step_worker.py index a640ac3db6f78..9fad32f2fbe4b 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -40,11 +40,12 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): the GPU ranks written in this value participaten in draft generation """ rank = kwargs['rank'] - self.is_dummy = rank not in draft_ranks - if draft_ranks is not None and not self.is_dummy: - self._ranks = draft_ranks + self.is_dummy = False + if draft_ranks is not None: + self._draft_ranks = draft_ranks + self.is_dummy = rank not in draft_ranks self._tp_groups = None - logger.info(f"{self._ranks=}, {self._tp_groups=}") + logger.info(f"{self._draft_ranks=}, {self._tp_groups=}") logger.inf(f"{rank=}, {draft_ranks=}, {self.is_dummy=}") super().__init__(**kwargs) @@ -62,27 +63,26 @@ def init_device(self): if self.is_dummy: return - if self._ranks: + if self._draft_ranks: local_rank = get_world_group().local_rank world_backend = torch.distributed.get_backend( get_world_group().device_group) tp_backend = torch.distributed.get_backend(get_tp_group().device_group) world_group = GroupCoordinator( - group_ranks=[self._ranks], + group_ranks=[self._draft_ranks], local_rank=local_rank, torch_distributed_backend=world_backend, use_pynccl=False, use_custom_allreduce=False, ) tp_group = GroupCoordinator( - group_ranks=[self._ranks], + group_ranks=[self._draft_ranks], local_rank=local_rank, torch_distributed_backend=tp_backend, use_pynccl=True, use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, ) - self._tp_groups = world_group, tp_group with self._patch_tensor_parallel_group(): From 7880cb09e12cfe345c7a5a4cd49ab9a8ad84d2ec Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:19:07 +0900 Subject: [PATCH 052/126] add small_tp correctness test --- .../e2e/test_multistep_correctness.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 94d71fb012727..5062a3f9a52e6 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -148,6 +148,42 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + }]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "tensor_parallel_size": 2, + "speculative_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with async LLM engine. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", [{ From 2ebe6f33401e940496f715fac780bc14117acb8a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:23:52 +0900 Subject: [PATCH 053/126] nit --- tests/spec_decode/e2e/test_multistep_correctness.py | 2 +- vllm/spec_decode/multi_step_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 5062a3f9a52e6..233eeda505747 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -176,7 +176,7 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, baseline_llm_generator, batch_size: int): - """Verify spec decode works well with async LLM engine. + """Verify spec decode works well with smaller tp for draft models. """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 9fad32f2fbe4b..7dcc7eb878761 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -46,7 +46,7 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): self.is_dummy = rank not in draft_ranks self._tp_groups = None logger.info(f"{self._draft_ranks=}, {self._tp_groups=}") - logger.inf(f"{rank=}, {draft_ranks=}, {self.is_dummy=}") + logger.info(f"{rank=}, {draft_ranks=}, {self.is_dummy=}") super().__init__(**kwargs) From 90d46ee61058df052f8507626513bbfc57292b53 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:26:25 +0900 Subject: [PATCH 054/126] fix --- vllm/spec_decode/multi_step_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 7dcc7eb878761..f3c5b437dea52 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -41,8 +41,8 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): """ rank = kwargs['rank'] self.is_dummy = False + self._draft_ranks = draft_ranks if draft_ranks is not None: - self._draft_ranks = draft_ranks self.is_dummy = rank not in draft_ranks self._tp_groups = None logger.info(f"{self._draft_ranks=}, {self._tp_groups=}") From 7e1426cde5ca189769740854bc4b44f6ae19984a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:29:05 +0900 Subject: [PATCH 055/126] refactor. 
remove log --- vllm/spec_decode/multi_step_worker.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f3c5b437dea52..31458d1249b10 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -40,13 +40,11 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): the GPU ranks written in this value participaten in draft generation """ rank = kwargs['rank'] - self.is_dummy = False self._draft_ranks = draft_ranks - if draft_ranks is not None: - self.is_dummy = rank not in draft_ranks - self._tp_groups = None - logger.info(f"{self._draft_ranks=}, {self._tp_groups=}") - logger.info(f"{rank=}, {draft_ranks=}, {self.is_dummy=}") + self._tp_groups = None + self._is_dummy = False if draft_ranks is None else rank not in draft_ranks + + logger.info(f"{rank=}, {draft_ranks=}, {self._is_dummy=}") super().__init__(**kwargs) @@ -60,7 +58,7 @@ def _patch_tensor_parallel_group(self): return None def init_device(self): - if self.is_dummy: + if self._is_dummy: return if self._draft_ranks: @@ -100,17 +98,17 @@ def set_include_gpu_probs_tensor(self): self.model_runner.model.sampler.include_gpu_probs_tensor = True def load_model(self): - if not self.is_dummy: + if not self._is_dummy: with self._patch_tensor_parallel_group(): super().load_model() def determine_num_available_blocks(self): - if not self.is_dummy: + if not self._is_dummy: with self._patch_tensor_parallel_group(): return super().determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): - if not self.is_dummy: + if not self._is_dummy: with self._patch_tensor_parallel_group(): super().initialize_cache(num_gpu_blocks, num_cpu_blocks) @@ -127,7 +125,7 @@ def sampler_output( For multi step worker, this indicator shall be True. """ - if not self.is_dummy: + if not self._is_dummy: return [], True self._raise_if_unsupported(execute_model_req) @@ -165,7 +163,7 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" - if not self.is_dummy: + if not self._is_dummy: return SpeculativeProposals(None, None, None) with self._patch_tensor_parallel_group(): @@ -288,14 +286,14 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - if self.is_dummy: + if self._is_dummy: return [] with self._patch_tensor_parallel_group(): return super().execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: - if self.is_dummy: + if self._is_dummy: return 0 return super().get_cache_block_size_bytes() From ad52d93eeb1e2a82af68f6415dde7077159d54f8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:31:53 +0900 Subject: [PATCH 056/126] remove return --- vllm/spec_decode/multi_step_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 31458d1249b10..4d36c0ddedc22 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -55,7 +55,6 @@ def _patch_tensor_parallel_group(self): if self._tp_groups is not None: return patch_tensor_parallel_group(self._tp_groups[0], self._tp_groups[1]) - return None def init_device(self): if self._is_dummy: From 355475b59389f8e7c946147e2f1bf23a61d8f8ef Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:44:20 +0900 Subject: [PATCH 057/126] fix --- vllm/distributed/parallel_state.py | 2 +- vllm/spec_decode/multi_step_worker.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 8625eb9743fc4..d48ff44d19b40 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -684,7 +684,7 @@ def model_parallel_is_initialized(): def patch_tensor_parallel_group(world_group, tp_group): """Patch the tp group temporarily until this function ends.""" global OVERRIDE_TP_STATE - if OVERRIDE_TP_STATE: + if OVERRIDE_TP_STATE or not world_group or not tp_group: return OVERRIDE_TP_STATE = True diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4d36c0ddedc22..127281e167e65 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -41,7 +41,8 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): """ rank = kwargs['rank'] self._draft_ranks = draft_ranks - self._tp_groups = None + self._world_group = None + self._tp_group = None self._is_dummy = False if draft_ranks is None else rank not in draft_ranks logger.info(f"{rank=}, {draft_ranks=}, {self._is_dummy=}") @@ -52,9 +53,7 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): self._proposer: SpeculativeProposer def _patch_tensor_parallel_group(self): - if self._tp_groups is not None: - return patch_tensor_parallel_group(self._tp_groups[0], - self._tp_groups[1]) + return patch_tensor_parallel_group(self._world_group, self._tp_group) def init_device(self): if self._is_dummy: @@ -66,21 +65,20 @@ def init_device(self): get_world_group().device_group) tp_backend = torch.distributed.get_backend(get_tp_group().device_group) - world_group = GroupCoordinator( + self._world_group = GroupCoordinator( group_ranks=[self._draft_ranks], local_rank=local_rank, torch_distributed_backend=world_backend, use_pynccl=False, use_custom_allreduce=False, ) - tp_group = GroupCoordinator( + self._tp_group = GroupCoordinator( group_ranks=[self._draft_ranks], local_rank=local_rank, torch_distributed_backend=tp_backend, 
use_pynccl=True, use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, ) - self._tp_groups = world_group, tp_group with self._patch_tensor_parallel_group(): super().init_device() From 9cfdb5b622d64f3ba821ad4a0b6bc831480adcf8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:48:05 +0900 Subject: [PATCH 058/126] fix about context managing --- vllm/distributed/parallel_state.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d48ff44d19b40..27b1b9c83f282 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -684,22 +684,21 @@ def model_parallel_is_initialized(): def patch_tensor_parallel_group(world_group, tp_group): """Patch the tp group temporarily until this function ends.""" global OVERRIDE_TP_STATE - if OVERRIDE_TP_STATE or not world_group or not tp_group: - return - - OVERRIDE_TP_STATE = True - old_world_group = get_world_group() - old_tp_group = get_tp_group() - global _WORLD, _TP - _WORLD = world_group - _TP = tp_group + if not OVERRIDE_TP_STATE and world_group and tp_group: + OVERRIDE_TP_STATE = True + old_world_group = get_world_group() + old_tp_group = get_tp_group() + global _WORLD, _TP + _WORLD = world_group + _TP = tp_group try: yield finally: # restore the original state - OVERRIDE_TP_STATE = False - _WORLD = old_world_group - _TP = old_tp_group + if OVERRIDE_TP_STATE: + OVERRIDE_TP_STATE = False + _WORLD = old_world_group + _TP = old_tp_group def get_tensor_model_parallel_world_size(): From 6a6c5ff9bf1826fb58468c5e9c0aa52c807db98f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 16:59:27 +0900 Subject: [PATCH 059/126] nit --- vllm/spec_decode/multi_step_worker.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 127281e167e65..3f8e85c4d96a9 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -47,6 +47,8 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): logger.info(f"{rank=}, {draft_ranks=}, {self._is_dummy=}") + logger.info(f"{kwargs=}") + super().__init__(**kwargs) # Lazy initialization list. @@ -91,8 +93,9 @@ def init_device(self): ) def set_include_gpu_probs_tensor(self): - # Need include_gpu_probs_tensor for multi_step_worker - self.model_runner.model.sampler.include_gpu_probs_tensor = True + if not self._is_dummy: + # Need include_gpu_probs_tensor for multi_step_worker + self.model_runner.model.sampler.include_gpu_probs_tensor = True def load_model(self): if not self._is_dummy: @@ -100,9 +103,11 @@ def load_model(self): super().load_model() def determine_num_available_blocks(self): - if not self._is_dummy: - with self._patch_tensor_parallel_group(): - return super().determine_num_available_blocks() + if self._is_dummy: + return -1, -1 + + with self._patch_tensor_parallel_group(): + return super().determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): if not self._is_dummy: From ddef229249eb603ebb3dc79a8b07566af2111591 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:05:35 +0900 Subject: [PATCH 060/126] consistent condition. 
if self._is_dummy: --- vllm/spec_decode/multi_step_worker.py | 30 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 3f8e85c4d96a9..98d89248a9933 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -93,16 +93,20 @@ def init_device(self): ) def set_include_gpu_probs_tensor(self): - if not self._is_dummy: - # Need include_gpu_probs_tensor for multi_step_worker - self.model_runner.model.sampler.include_gpu_probs_tensor = True + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self.model_runner.model.sampler.include_gpu_probs_tensor = True def load_model(self): - if not self._is_dummy: - with self._patch_tensor_parallel_group(): - super().load_model() + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + super().load_model() - def determine_num_available_blocks(self): + def determine_num_available_blocks(self) -> Tuple[int, int]: if self._is_dummy: return -1, -1 @@ -110,9 +114,11 @@ def determine_num_available_blocks(self): return super().determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): - if not self._is_dummy: - with self._patch_tensor_parallel_group(): - super().initialize_cache(num_gpu_blocks, num_cpu_blocks) + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + super().initialize_cache(num_gpu_blocks, num_cpu_blocks) @torch.inference_mode() def sampler_output( @@ -127,7 +133,7 @@ def sampler_output( For multi step worker, this indicator shall be True. """ - if not self._is_dummy: + if self._is_dummy: return [], True self._raise_if_unsupported(execute_model_req) @@ -165,7 +171,7 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. """ - if not self._is_dummy: + if self._is_dummy: return SpeculativeProposals(None, None, None) with self._patch_tensor_parallel_group(): From 965f64883e5bebbe18e25b15919117159780b899 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:25:03 +0900 Subject: [PATCH 061/126] fix ruff errors --- vllm/spec_decode/multi_step_worker.py | 18 +++++++----------- vllm/spec_decode/spec_decode_worker.py | 7 ++++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 98d89248a9933..9010fdee7c5da 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -32,22 +32,17 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, draft_ranks: Optional[List[int]], **kwargs): + def __init__(self, ranks: Optional[List[int]], **kwargs): """Create a MultiStepWorker. 
Args: - draft_ranks (Optional[List[int]]): if this value is given, only some of - the GPU ranks written in this value participaten in draft generation + ranks (Optional[List[int]]): if this value is given, only some of + the GPU ranks written in this value participate in draft generation """ - rank = kwargs['rank'] - self._draft_ranks = draft_ranks + self._draft_ranks = ranks self._world_group = None self._tp_group = None - self._is_dummy = False if draft_ranks is None else rank not in draft_ranks - - logger.info(f"{rank=}, {draft_ranks=}, {self._is_dummy=}") - - logger.info(f"{kwargs=}") + self._is_dummy = False if ranks is None else kwargs['rank'] not in ranks super().__init__(**kwargs) @@ -65,7 +60,8 @@ def init_device(self): local_rank = get_world_group().local_rank world_backend = torch.distributed.get_backend( get_world_group().device_group) - tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + tp_backend = torch.distributed.get_backend( + get_tp_group().device_group) self._world_group = GroupCoordinator( group_ranks=[self._draft_ranks], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index d44f841ea0d4f..8165ddd906073 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -105,12 +105,13 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: - draft_parallel_config: ParallelConfig = draft_worker_kwargs['parallel_config'] + draft_parallel_config: ParallelConfig = \ + draft_worker_kwargs['parallel_config'] draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - draft_ranks = list(range(draft_tp)) if target_tp != draft_tp else None - proposer_worker = MultiStepWorker(draft_ranks, **draft_worker_kwargs) + ranks = list(range(draft_tp)) if target_tp != draft_tp else None + proposer_worker = MultiStepWorker(ranks, **draft_worker_kwargs) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 1bb553473996700bf87ec53a927654a9d787ca28 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:25:42 +0900 Subject: [PATCH 062/126] isort --- vllm/spec_decode/multi_step_worker.py | 5 +---- vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 9010fdee7c5da..8f5f2862f5a10 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,6 +1,6 @@ import copy import weakref -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import torch @@ -15,9 +15,6 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker -from vllm.logger import init_logger - -logger = init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8165ddd906073..11683135472b6 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import torch -from vllm.config import SpeculativeConfig, ParallelConfig +from vllm.config import ParallelConfig, SpeculativeConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from 
vllm.model_executor.layers.rejection_sampler import RejectionSampler From ea6b8f58aed3d79a702516e3b8f7c7df6e01d4a5 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:28:21 +0900 Subject: [PATCH 063/126] fix yapf --- tests/spec_decode/e2e/test_multistep_correctness.py | 5 +++-- vllm/spec_decode/spec_decode_worker.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 233eeda505747..2447f254893a7 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -174,8 +174,8 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, - baseline_llm_generator, - batch_size: int): + baseline_llm_generator, + batch_size: int): """Verify spec decode works well with smaller tp for draft models. """ run_greedy_equality_correctness_test(baseline_llm_generator, @@ -184,6 +184,7 @@ def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, max_output_len=32, force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 11683135472b6..62912288e22e6 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -105,8 +105,8 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: - draft_parallel_config: ParallelConfig = \ - draft_worker_kwargs['parallel_config'] + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'parallel_config'] draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size From 71977d233f8db1628e23286abee2fadd9f8ed154 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:38:49 +0900 Subject: [PATCH 064/126] undo ngramworker support --- vllm/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 38bf813fec400..2af3cc2fdcdb7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -879,6 +879,7 @@ def maybe_create_spec_config( # config, in future, we may try refactor it out, and set # draft related config as None here. 
draft_model_config = target_model_config + draft_parallel_config = target_parallel_config else: ngram_prompt_lookup_max = 0 ngram_prompt_lookup_min = 0 @@ -907,9 +908,9 @@ def maybe_create_spec_config( target_model_config.max_model_len, )) - draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_tensor_parallel_size)) + draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, speculative_tensor_parallel_size)) return SpeculativeConfig( draft_model_config, From bc5f77a0931aeb87e497991d7469b5836367c15c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:38:58 +0900 Subject: [PATCH 065/126] add comment --- vllm/spec_decode/multi_step_worker.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 8f5f2862f5a10..ac4035119f863 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -32,6 +32,13 @@ class MultiStepWorker(Worker, ProposerWorkerBase): def __init__(self, ranks: Optional[List[int]], **kwargs): """Create a MultiStepWorker. + It allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + This is implemented by changing vLLM's tensor parallel group to a group of + size temporarily during forward passes of draft models. + Args: ranks (Optional[List[int]]): if this value is given, only some of the GPU ranks written in this value participate in draft generation @@ -39,6 +46,8 @@ def __init__(self, ranks: Optional[List[int]], **kwargs): self._draft_ranks = ranks self._world_group = None self._tp_group = None + + # whether the worker participates in draft generation or not self._is_dummy = False if ranks is None else kwargs['rank'] not in ranks super().__init__(**kwargs) @@ -54,6 +63,7 @@ def init_device(self): return if self._draft_ranks: + # creates tp process group containing only a subset of gpu ranks local_rank = get_world_group().local_rank world_backend = torch.distributed.get_backend( get_world_group().device_group) @@ -126,6 +136,8 @@ def sampler_output( For multi step worker, this indicator shall be True. 
""" + # NOTE: we do not call _patch_tensor_parallel_group() in this function, + # as it's always called after tp_group has already been overridden if self._is_dummy: return [], True From 5655a497d10bc70c66282927dcc73fdbcb722b8f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:39:19 +0900 Subject: [PATCH 066/126] remove smaller_tp_proposer_worker --- .../spec_decode/smaller_tp_proposer_worker.py | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py deleted file mode 100644 index 79b516bd1ec6e..0000000000000 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ /dev/null @@ -1,213 +0,0 @@ -from typing import List, Optional, Set, Tuple, Union - -import torch -import torch.distributed - -from vllm.config import ParallelConfig -from vllm.distributed.parallel_state import (_ENABLE_CUSTOM_ALL_REDUCE, - GroupCoordinator, get_tp_group, - get_world_group, - patch_tensor_parallel_group) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.worker.worker import Worker - -logger = init_logger(__name__) - - -class SmallerTpProposerWorker(ProposerWorkerBase): - """Class which allows a speculative draft model to run with smaller tensor - parallel degree than target model. - This reduces the communication overhead of small draft models. - - This is implemented by changing vLLM's tensor parallel group to a group of - size temporarily during forward passes of draft models. - """ - - @classmethod - def maybe_wrap_worker(cls, worker, draft_parallel_config: ParallelConfig, - target_parallel_config: ParallelConfig, rank: int): - """Wrap the worker in a SmallerTpProposerWorker if necessary. - """ - draft_tp = draft_parallel_config.tensor_parallel_size - target_tp = target_parallel_config.tensor_parallel_size - - if draft_tp == target_tp: - return worker - - if draft_tp > target_tp: - raise ValueError( - f"{cls} only supports draft_tp smaller than target_tp." - f"{draft_tp=} {target_tp=}") - - # gpu ranks that will generate draft tokens together - ranks = list(range(draft_tp)) - - if rank in ranks: - logger.info("Wrapping {%s} in {%s}", type(worker), cls) - return cls(worker, ranks) - else: - # for workers not participating in the draft generation - logger.info("Returning dummy worker") - return DummyProposerWorker(worker) - - def __init__(self, worker: Union[Worker, ProposerWorkerBase], - ranks: List[int]): - self._worker = worker - self._ranks = ranks - self._world_group = None - self._tp_group = None - - def _patch_tensor_parallel_group(self): - return patch_tensor_parallel_group(self._world_group, self._tp_group) - - def init_device(self): - """Initialize the device. - - This also creates an additional tensor-parallel process group containing - only a subset of the whole ranks. 
- """ - local_rank = get_world_group().local_rank - world_backend = torch.distributed.get_backend( - get_world_group().device_group) - tp_backend = torch.distributed.get_backend(get_tp_group().device_group) - - self._world_group = GroupCoordinator( - group_ranks=[self._ranks], - local_rank=local_rank, - torch_distributed_backend=world_backend, - use_pynccl=False, - use_custom_allreduce=False, - ) - self._tp_group = GroupCoordinator( - group_ranks=[self._ranks], - local_rank=local_rank, - torch_distributed_backend=tp_backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) - - with self._patch_tensor_parallel_group(): - self._worker.init_device() - - def set_include_gpu_probs_tensor(self): - self._worker.set_include_gpu_probs_tensor() - - def load_model(self): - with self._patch_tensor_parallel_group(): - self._worker.load_model() - - def determine_num_available_blocks(self): - with self._patch_tensor_parallel_group(): - return self._worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): - with self._patch_tensor_parallel_group(): - self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - ) -> Tuple[Optional[List[SamplerOutput]], bool]: - # do not call _parch_tensor_parallel_group, because - # it's always called after tp_group has already been overridden - return self._worker.sampler_output(execute_model_req, sample_len) - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - ) -> SpeculativeProposals: - with self._patch_tensor_parallel_group(): - return self._worker.get_spec_proposals(execute_model_req) - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - with self._patch_tensor_parallel_group(): - return self._worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - return self._worker.get_cache_block_size_bytes() - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError - - def list_loras(self) -> Set[int]: - raise NotImplementedError - - @property - def max_model_len(self) -> int: - return self._worker.max_model_len - - @property - def vocab_size(self) -> int: - return self._worker.vocab_size - - -class DummyProposerWorker(ProposerWorkerBase): - """Dummy proposer worker that do nothing. - It's for workers that do not participate in draft generation. 
- """ - - def __init__( - self, - worker: Union[Worker, ProposerWorkerBase], - ): - self._worker = worker - - def init_device(self): - pass - - def load_model(self): - pass - - def determine_num_available_blocks(self): - pass - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): - pass - - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - ) -> Tuple[List[SamplerOutput], bool]: - return [], True - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - ) -> SpeculativeProposals: - return SpeculativeProposals(None, None, None) - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - return [] - - def get_cache_block_size_bytes(self) -> int: - return 0 - - def add_lora(self, lora_request: LoRARequest) -> bool: - return False - - def remove_lora(self, lora_id: int) -> bool: - return False - - def list_loras(self) -> Set[int]: - return set() - - @property - def vocab_size(self) -> int: - return self._worker.vocab_size From eabc16a54db33e3e92ecffe350a9e818b769ba6a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 14 Jun 2024 17:41:34 +0900 Subject: [PATCH 067/126] ruff --- vllm/spec_decode/multi_step_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index ac4035119f863..38cdb38b5714f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -36,8 +36,8 @@ def __init__(self, ranks: Optional[List[int]], **kwargs): parallel degree than target model. This reduces the communication overhead of small draft models. - This is implemented by changing vLLM's tensor parallel group to a group of - size temporarily during forward passes of draft models. + This is implemented by changing vLLM's tensor parallel group to a group + of the small size temporarily during forward passes of draft models. Args: ranks (Optional[List[int]]): if this value is given, only some of From f748edf7e8823db9afe381de4a9ca8a5f75b28d6 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 09:59:06 +0900 Subject: [PATCH 068/126] remove ranks arg --- vllm/spec_decode/multi_step_worker.py | 15 ++++++++++----- vllm/spec_decode/spec_decode_worker.py | 5 +++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 38cdb38b5714f..a439c8796e1a1 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -29,7 +29,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, ranks: Optional[List[int]], **kwargs): + def __init__(self, **kwargs): """Create a MultiStepWorker. 
It allows a speculative draft model to run with smaller tensor @@ -43,12 +43,17 @@ def __init__(self, ranks: Optional[List[int]], **kwargs): ranks (Optional[List[int]]): if this value is given, only some of the GPU ranks written in this value participate in draft generation """ - self._draft_ranks = ranks + + self._draft_ranks = None + self._is_dummy = False self._world_group = None self._tp_group = None - # whether the worker participates in draft generation or not - self._is_dummy = False if ranks is None else kwargs['rank'] not in ranks + if 'ranks' in kwargs: + ranks = kwargs['ranks'] + self._draft_ranks = ranks + # whether the worker participates in draft generation or not + self._is_dummy = kwargs['rank'] not in ranks super().__init__(**kwargs) @@ -62,7 +67,7 @@ def init_device(self): if self._is_dummy: return - if self._draft_ranks: + if self._draft_ranks is not None: # creates tp process group containing only a subset of gpu ranks local_rank = get_world_group().local_rank world_backend = torch.distributed.get_backend( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 62912288e22e6..58d04cca22133 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -110,8 +110,9 @@ def create_worker( draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - ranks = list(range(draft_tp)) if target_tp != draft_tp else None - proposer_worker = MultiStepWorker(ranks, **draft_worker_kwargs) + if target_tp != draft_tp: + draft_worker_kwargs['ranks'] = list(range(draft_tp)) + proposer_worker = MultiStepWorker(**draft_worker_kwargs) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 4b74a4522e03779cdda011007337d3efedfdd3c8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 10:20:35 +0900 Subject: [PATCH 069/126] undo --- .../e2e/test_multistep_correctness.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 2447f254893a7..94d71fb012727 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -148,43 +148,6 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - # Note this is repeated in the test body; to initialize a tokenizer. - "model": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - }]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "tensor_parallel_size": 2, - "speculative_tensor_parallel_size": 1, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, - baseline_llm_generator, - batch_size: int): - """Verify spec decode works well with smaller tp for draft models. 
- """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=32, - force_output_len=True) - - @pytest.mark.parametrize( "common_llm_kwargs", [{ From c9786ad5f9ff759fad2c79cdbc0a6b083d6f2542 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 10:48:11 +0900 Subject: [PATCH 070/126] add dist test --- .../spec_decode/e2e/test_integration_dist.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index d444ef24cbfda..ef6615d25bf02 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -63,3 +63,40 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + }]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "tensor_parallel_size": 2, + "speculative_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) From a42664ab1be3bffd72d61ce555e849d0c513b8b2 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 10:51:04 +0900 Subject: [PATCH 071/126] nit --- tests/spec_decode/e2e/test_integration_dist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index ef6615d25bf02..8b0ca2e4b25af 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -77,13 +77,13 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, # Required for spec decode. 
"use_v2_block_manager": True, + "tensor_parallel_size": 2, }]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - "tensor_parallel_size": 2, "speculative_tensor_parallel_size": 1, }, ]) From ac7701a02da50d382e36b1839ac426061b5e3a47 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 10:55:46 +0900 Subject: [PATCH 072/126] fix --- vllm/spec_decode/multi_step_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 5cb4571842e8d..233f3e34b0c2d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -50,7 +50,7 @@ def __init__(self, **kwargs): self._tp_group = None if 'ranks' in kwargs: - ranks = kwargs['ranks'] + ranks = kwargs.pop('ranks') self._draft_ranks = ranks # whether the worker participates in draft generation or not self._is_dummy = kwargs['rank'] not in ranks From eea6a7ee4b2f45a6b554ef7e7828af5f91096e5a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 11:12:29 +0900 Subject: [PATCH 073/126] test fix --- tests/spec_decode/e2e/test_integration_dist.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 8b0ca2e4b25af..4d0b0fcf2dcab 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -78,19 +78,26 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, # Required for spec decode. "use_v2_block_manager": True, "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ +@pytest.mark.parametrize("test_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "speculative_tensor_parallel_size": 1, }, ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_with_small_draft_tps(test_llm_generator, +def test_draft_model_tp_lt_target_model(test_llm_generator, baseline_llm_generator, batch_size: int): """Verify spec decode works well with smaller tp for draft models. 
From a648f5df9943c7019f0b5366a1effe53ecedf98f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 11:16:59 +0900 Subject: [PATCH 074/126] yapf fix --- tests/spec_decode/e2e/test_integration_dist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 4d0b0fcf2dcab..14c5cf282b38c 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -98,8 +98,8 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_draft_model_tp_lt_target_model(test_llm_generator, - baseline_llm_generator, - batch_size: int): + baseline_llm_generator, + batch_size: int): """Verify spec decode works well with smaller tp for draft models. """ run_greedy_equality_correctness_test(baseline_llm_generator, From f23ba8cfc699235e5da3f26cce53bafc1626a147 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 11:30:27 +0900 Subject: [PATCH 075/126] update comment --- vllm/spec_decode/multi_step_worker.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 233f3e34b0c2d..08a08f7f78d9c 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -38,17 +38,15 @@ def __init__(self, **kwargs): This is implemented by changing vLLM's tensor parallel group to a group of the small size temporarily during forward passes of draft models. - - Args: - ranks (Optional[List[int]]): if this value is given, only some of - the GPU ranks written in this value participate in draft generation - """ + """ self._draft_ranks = None self._is_dummy = False self._world_group = None self._tp_group = None + # if 'ranks' arg is given, only some of the GPU ranks written in this + # value participate in draft generation if 'ranks' in kwargs: ranks = kwargs.pop('ranks') self._draft_ranks = ranks From aa9af93528255bf1170e8faffb76c71afe13164a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 17 Jun 2024 12:50:50 +0900 Subject: [PATCH 076/126] require 2 gpus --- tests/spec_decode/e2e/test_integration_dist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 14c5cf282b38c..4fdb1567d51a8 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -65,6 +65,8 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( "common_llm_kwargs", [{ From 56c8927f4e9020dd823307997e4f858369e3e4aa Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:12:46 +0900 Subject: [PATCH 077/126] restore draft_ranks arg in MultiStepWorker.__init__ --- tests/spec_decode/utils.py | 32 ++++++++++++++++---------- vllm/spec_decode/multi_step_worker.py | 14 +++++------ vllm/spec_decode/spec_decode_worker.py | 5 ++-- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index ce5b347832c30..496ba170d6316 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -15,6 +15,7 @@ from vllm.utils import 
get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker T = TypeVar("T", bound=Worker) @@ -66,6 +67,7 @@ def create_worker(cls: Callable[..., T], num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, + draft_ranks: List[int] = None, enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, @@ -78,18 +80,24 @@ def create_worker(cls: Callable[..., T], distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - worker = cls( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - ) + worker_kwargs={ + 'model_config': engine_config.model_config, + 'parallel_config': engine_config.parallel_config, + 'scheduler_config': engine_config.scheduler_config, + 'device_config': engine_config.device_config, + 'cache_config': engine_config.cache_config, + 'load_config': engine_config.load_config, + 'local_rank': 0, + 'rank': 0, + 'distributed_init_method': distributed_init_method, + 'is_driver_worker': is_driver_worker, + } + + if draft_ranks is not None: + assert cls is MultiStepWorker, "draft_ranks arg only works with MultiStepWorker" + worker_kwargs['draft_ranks'] = draft_ranks + + worker = cls(**worker_kwargs) worker.init_device() worker.load_model() diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 08a08f7f78d9c..81af1046f758c 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -29,7 +29,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, **kwargs): + def __init__(self, draft_ranks: Optional[List[int]], **kwargs): """Create a MultiStepWorker. It allows a speculative draft model to run with smaller tensor @@ -40,18 +40,16 @@ def __init__(self, **kwargs): of the small size temporarily during forward passes of draft models. 
""" - self._draft_ranks = None + self._draft_ranks = draft_ranks self._is_dummy = False self._world_group = None self._tp_group = None - # if 'ranks' arg is given, only some of the GPU ranks written in this - # value participate in draft generation - if 'ranks' in kwargs: - ranks = kwargs.pop('ranks') - self._draft_ranks = ranks + # if 'draft_ranks' arg is given, only some of the GPU ranks written in + # this value participate in draft generation + if draft_ranks is not None: # whether the worker participates in draft generation or not - self._is_dummy = kwargs['rank'] not in ranks + self._is_dummy = kwargs['rank'] not in draft_ranks super().__init__(**kwargs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index bde3ff69f9ced..27a35165c6315 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -110,9 +110,8 @@ def create_worker( draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - if target_tp != draft_tp: - draft_worker_kwargs['ranks'] = list(range(draft_tp)) - proposer_worker = MultiStepWorker(**draft_worker_kwargs) + ranks = list(range(draft_tp)) if target_tp != draft_tp else None + proposer_worker = MultiStepWorker(ranks, **draft_worker_kwargs) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 385b4f84b07b089014b3a8602c860f17b5d2a3e5 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:36:33 +0900 Subject: [PATCH 078/126] comment --- vllm/spec_decode/multi_step_worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 81af1046f758c..c2a05632f66d6 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -38,6 +38,10 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): This is implemented by changing vLLM's tensor parallel group to a group of the small size temporarily during forward passes of draft models. 
+ + Args: + draft_ranks (Optional[List[int]]): if this value is given, only some of + the GPU ranks written in this value participate in draft generation """ self._draft_ranks = draft_ranks @@ -45,8 +49,6 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): self._world_group = None self._tp_group = None - # if 'draft_ranks' arg is given, only some of the GPU ranks written in - # this value participate in draft generation if draft_ranks is not None: # whether the worker participates in draft generation or not self._is_dummy = kwargs['rank'] not in draft_ranks From 43f37eb5d27b22673ce29622453d5ba765b6a909 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:42:07 +0900 Subject: [PATCH 079/126] ruff mypy --- tests/spec_decode/utils.py | 4 ++-- vllm/spec_decode/multi_step_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 496ba170d6316..60303368df435 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -67,7 +67,7 @@ def create_worker(cls: Callable[..., T], num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - draft_ranks: List[int] = None, + draft_ranks: Optional[List[int]] = None, enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, @@ -94,7 +94,7 @@ def create_worker(cls: Callable[..., T], } if draft_ranks is not None: - assert cls is MultiStepWorker, "draft_ranks arg only works with MultiStepWorker" + assert cls is MultiStepWorker, "draft_ranks arg is for MultiStepWorker" worker_kwargs['draft_ranks'] = draft_ranks worker = cls(**worker_kwargs) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c2a05632f66d6..351f46fb827b5 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -40,8 +40,8 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): of the small size temporarily during forward passes of draft models. 
Args: - draft_ranks (Optional[List[int]]): if this value is given, only some of - the GPU ranks written in this value participate in draft generation + draft_ranks (Optional[List[int]]): if this value is given, only some + of the GPU ranks in this value participate in draft generation """ self._draft_ranks = draft_ranks From 99350e2a241f146c5502bdc0ceeaf67fa10e0320 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:43:22 +0900 Subject: [PATCH 080/126] isort --- tests/spec_decode/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 60303368df435..f311f7877a4ec 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -12,10 +12,10 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceOutput) +from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker -from vllm.spec_decode.multi_step_worker import MultiStepWorker T = TypeVar("T", bound=Worker) From a9f3e238a300e17209b7f3da4ab442c86d3ea0fd Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:46:19 +0900 Subject: [PATCH 081/126] yapf --- tests/spec_decode/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index f311f7877a4ec..2729bc57c5462 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -80,7 +80,7 @@ def create_worker(cls: Callable[..., T], distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - worker_kwargs={ + worker_kwargs = { 'model_config': engine_config.model_config, 'parallel_config': engine_config.parallel_config, 'scheduler_config': engine_config.scheduler_config, From 6ba250d67ef4d9e48e06594dc744eec45c3301db Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 10:55:33 +0900 Subject: [PATCH 082/126] allow None for draft_ranks --- vllm/spec_decode/multi_step_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 351f46fb827b5..89913510ff178 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -29,7 +29,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, draft_ranks: Optional[List[int]], **kwargs): + def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): """Create a MultiStepWorker. 
It allows a speculative draft model to run with smaller tensor @@ -42,7 +42,7 @@ def __init__(self, draft_ranks: Optional[List[int]], **kwargs): Args: draft_ranks (Optional[List[int]]): if this value is given, only some of the GPU ranks in this value participate in draft generation - """ + """ self._draft_ranks = draft_ranks self._is_dummy = False From 3e786135948707444b173399a99429bca4d94c99 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 16:11:20 +0900 Subject: [PATCH 083/126] spec-tp arg in benchmark_latency --- benchmarks/benchmark_latency.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 9937f8333fb7e..1b80d2c5f1102 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -23,6 +23,7 @@ def main(args: argparse.Namespace): llm = LLM(model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -122,6 +123,8 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-tensor-parallel-size', '-spec-tp', + type=int, default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', From 6532af756bf5b37c1b657098a148d03fac86d90b Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 16:25:18 +0900 Subject: [PATCH 084/126] yapf --- benchmarks/benchmark_latency.py | 49 +++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 1b80d2c5f1102..f058dd4addd0e 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -20,27 +20,28 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
- llm = LLM(model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend) + llm = LLM( + model=args.model, + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend) sampling_params = SamplingParams( n=args.n, @@ -123,8 +124,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-tensor-parallel-size', '-spec-tp', - type=int, default=None) + parser.add_argument('--speculative-tensor-parallel-size', + '-spec-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', From 68397976635a8399a91878ed0809eab712ed4845 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 18 Jun 2024 18:10:15 +0900 Subject: [PATCH 085/126] yapf --- benchmarks/benchmark_latency.py | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f058dd4addd0e..05af4c2782dec 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -22,26 +22,26 @@ def main(args: argparse.Namespace): # the engine will automatically process the request in multiple batches. 
llm = LLM( model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend) + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend) sampling_params = SamplingParams( n=args.n, From 98e584da56f685aa8c034d0167801b7d150db301 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Wed, 19 Jun 2024 11:28:28 +0900 Subject: [PATCH 086/126] remove is_dummy check from sampler_output --- vllm/spec_decode/multi_step_worker.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 89913510ff178..13b1dd10ce6ae 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -139,10 +139,9 @@ def sampler_output( For multi step worker, this indicator shall be True. 
""" - # NOTE: we do not call _patch_tensor_parallel_group() in this function, - # as it's always called after tp_group has already been overridden - if self._is_dummy: - return [], True + # NOTE: here, neither _patch_tensor_parallel_group() call nor _is_dummy + # check, as it's always called after tp_group has already been + # overridden by get_spec_proposals() self._raise_if_unsupported(execute_model_req) From 2d5e64d259a0ef25ec2a94cfab2810bd053f14dd Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 20 Jun 2024 16:56:25 +0900 Subject: [PATCH 087/126] add comment --- vllm/distributed/parallel_state.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 4289085a998c9..278f36a853e07 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -678,15 +678,22 @@ def model_parallel_is_initialized(): return (_TP is not None and _PP is not None) -OVERRIDE_TP_STATE = False +TP_STATE_PATCHED = False @contextlib.contextmanager -def patch_tensor_parallel_group(world_group, tp_group): - """Patch the tp group temporarily until this function ends.""" - global OVERRIDE_TP_STATE - if not OVERRIDE_TP_STATE and world_group and tp_group: - OVERRIDE_TP_STATE = True +def patch_tensor_parallel_group(world_group: Optional[GroupCoordinator], + tp_group: Optional[GroupCoordinator]): + """Patch the tp group temporarily until this function ends. + It requires the world group to be patched together to keep the integrity. + Args: + world_group (Optional[GroupCoordinator]): the world group coordinator + tp_group (Optional[GroupCoordinator]): the tp group coordinator + """ + global TP_STATE_PATCHED + if (not TP_STATE_PATCHED and + world_group is not None and tp_group is not None): + TP_STATE_PATCHED = True old_world_group = get_world_group() old_tp_group = get_tp_group() global _WORLD, _TP @@ -696,8 +703,8 @@ def patch_tensor_parallel_group(world_group, tp_group): yield finally: # restore the original state - if OVERRIDE_TP_STATE: - OVERRIDE_TP_STATE = False + if TP_STATE_PATCHED: + TP_STATE_PATCHED = False _WORLD = old_world_group _TP = old_tp_group From ba88bd48e559c545417ead9d955648abf45fd4d7 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Thu, 20 Jun 2024 16:59:12 +0900 Subject: [PATCH 088/126] yapf --- vllm/distributed/parallel_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 278f36a853e07..e01fd46b38154 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -691,8 +691,8 @@ def patch_tensor_parallel_group(world_group: Optional[GroupCoordinator], tp_group (Optional[GroupCoordinator]): the tp group coordinator """ global TP_STATE_PATCHED - if (not TP_STATE_PATCHED and - world_group is not None and tp_group is not None): + if (not TP_STATE_PATCHED and world_group is not None + and tp_group is not None): TP_STATE_PATCHED = True old_world_group = get_world_group() old_tp_group = get_tp_group() From 46e5274d35704c6be31a8ff12eea3d59e2dca7af Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 11:38:07 +0900 Subject: [PATCH 089/126] resolve cade comments --- vllm/config.py | 7 ++-- vllm/distributed/parallel_state.py | 60 +++++++++++++++++---------- vllm/spec_decode/multi_step_worker.py | 46 +++++++++----------- 3 files changed, 59 insertions(+), 54 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 
42ce1f6e8f826..c49f8d279f8fc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -976,10 +976,9 @@ def create_draft_parallel_config( This is mostly a copy of the target parallel config, except the tp_size. """ - - speculative_tensor_parallel_size = ( - speculative_tensor_parallel_size - or target_parallel_config.tensor_parallel_size) + if speculative_tensor_parallel_size is None: + speculative_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size if speculative_tensor_parallel_size > \ target_parallel_config.tensor_parallel_size: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e01fd46b38154..f2553c2dc0e2a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -463,6 +463,30 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def init_world_group(ranks: List[int], + local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=[ranks], + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + + +def init_model_parallel_group(group_ranks: List[List[int]], + local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=group_ranks, + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + _TP: Optional[GroupCoordinator] = None @@ -553,15 +577,10 @@ def init_distributed_environment( ranks = list(range(torch.distributed.get_world_size())) if world_size != -1: assert world_size == len(ranks), ( - "given world_size does not match with world_size of torch") - - _WORLD = GroupCoordinator( - group_ranks=[ranks], - local_rank=local_rank, - torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=False, - ) + f"given world_size ({world_size}) does not match with" + f"world_size of torch ({len(ranks)})") + + _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == world_size, ( "world group already initialized with a different world size") @@ -618,13 +637,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - _TP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _TP = init_model_parallel_group(group_ranks, get_world_group().local_rank, + backend) # Build the pipeline model-parallel groups. num_pipeline_model_parallel_groups: int = (world_size // @@ -636,13 +650,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - _PP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, + backend) def ensure_model_parallel_initialized( @@ -686,6 +695,11 @@ def patch_tensor_parallel_group(world_group: Optional[GroupCoordinator], tp_group: Optional[GroupCoordinator]): """Patch the tp group temporarily until this function ends. It requires the world group to be patched together to keep the integrity. + If either world_group or tp_group is None, nothing happens. 
+ + Also it does not allow additional patch during patching, otherwise the + original state, which should be restored, will be lost. + Args: world_group (Optional[GroupCoordinator]): the world group coordinator tp_group (Optional[GroupCoordinator]): the tp group coordinator diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 13b1dd10ce6ae..f7514dcc3e0b5 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -4,9 +4,9 @@ import torch -from vllm.distributed.parallel_state import (_ENABLE_CUSTOM_ALL_REDUCE, - GroupCoordinator, get_tp_group, - get_world_group, +from vllm.distributed.parallel_state import (get_tp_group, get_world_group, + init_model_parallel_group, + init_world_group, patch_tensor_parallel_group) from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) @@ -23,6 +23,12 @@ class MultiStepWorker(Worker, ProposerWorkerBase): allocated enough space to store the additional KV. This reduces overhead by invoking the scheduler less. + In addition, it allows a speculative draft model to run with smaller tensor + parallel degree than target model. This is implemented by changing vLLM's + tensor parallel group to a group of the small size temporarily during + forward passes of draft models. This reduces the communication overhead of + small draft models. + The MultiStepWorker does not support cache swap operations, or beam search. Cache swap operations do not require large modifications. On the other hand, beam search requires memory allocations during sequence forks and thus @@ -32,17 +38,11 @@ class MultiStepWorker(Worker, ProposerWorkerBase): def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): """Create a MultiStepWorker. - It allows a speculative draft model to run with smaller tensor - parallel degree than target model. - This reduces the communication overhead of small draft models. - - This is implemented by changing vLLM's tensor parallel group to a group - of the small size temporarily during forward passes of draft models. - Args: draft_ranks (Optional[List[int]]): if this value is given, only some of the GPU ranks in this value participate in draft generation """ + super().__init__(**kwargs) self._draft_ranks = draft_ranks self._is_dummy = False @@ -53,12 +53,14 @@ def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): # whether the worker participates in draft generation or not self._is_dummy = kwargs['rank'] not in draft_ranks - super().__init__(**kwargs) - # Lazy initialization list. self._proposer: SpeculativeProposer def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. For consistency, it also updates the world group state. + Note that it has no effect when its tp group has not been initialized. 
+ """ return patch_tensor_parallel_group(self._world_group, self._tp_group) def init_device(self): @@ -73,20 +75,10 @@ def init_device(self): tp_backend = torch.distributed.get_backend( get_tp_group().device_group) - self._world_group = GroupCoordinator( - group_ranks=[self._draft_ranks], - local_rank=local_rank, - torch_distributed_backend=world_backend, - use_pynccl=False, - use_custom_allreduce=False, - ) - self._tp_group = GroupCoordinator( - group_ranks=[self._draft_ranks], - local_rank=local_rank, - torch_distributed_backend=tp_backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + self._world_group = init_world_group(self._draft_ranks, local_rank, + world_backend) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) with self._patch_tensor_parallel_group(): super().init_device() @@ -119,7 +111,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: with self._patch_tensor_parallel_group(): return super().determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int): + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: if self._is_dummy: return From 85f4f2570e3b136d7d0d9bcd717e8b7534b64063 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 13:00:46 +0900 Subject: [PATCH 090/126] refactoring patch_tp_group --- vllm/distributed/parallel_state.py | 10 +++++----- vllm/spec_decode/multi_step_worker.py | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f2553c2dc0e2a..2755f261dcbf3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -690,9 +690,9 @@ def model_parallel_is_initialized(): TP_STATE_PATCHED = False -@contextlib.contextmanager -def patch_tensor_parallel_group(world_group: Optional[GroupCoordinator], - tp_group: Optional[GroupCoordinator]): +@contextmanager +def patch_tensor_parallel_group(world_group: GroupCoordinator, + tp_group: GroupCoordinator): """Patch the tp group temporarily until this function ends. It requires the world group to be patched together to keep the integrity. If either world_group or tp_group is None, nothing happens. @@ -705,8 +705,8 @@ def patch_tensor_parallel_group(world_group: Optional[GroupCoordinator], tp_group (Optional[GroupCoordinator]): the tp group coordinator """ global TP_STATE_PATCHED - if (not TP_STATE_PATCHED and world_group is not None - and tp_group is not None): + assert not TP_STATE_PATCHED, "Should not call when it's already patched" + if not TP_STATE_PATCHED: TP_STATE_PATCHED = True old_world_group = get_world_group() old_tp_group = get_tp_group() diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f7514dcc3e0b5..298d9fe662988 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager import copy import weakref from typing import Dict, List, Optional, Tuple @@ -56,12 +57,16 @@ def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): # Lazy initialization list. self._proposer: SpeculativeProposer + @contextmanager def _patch_tensor_parallel_group(self): """Temporarily patch the global tp group state with its own tp group state. For consistency, it also updates the world group state. Note that it has no effect when its tp group has not been initialized. 
""" - return patch_tensor_parallel_group(self._world_group, self._tp_group) + if self._tp_group is None: + yield + else: + return patch_tensor_parallel_group(self._world_group, self._tp_group) def init_device(self): if self._is_dummy: From c1b537326bf855807998d31147605772cdaba403 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 13:21:02 +0900 Subject: [PATCH 091/126] cleanup patch_tp_group logic --- vllm/distributed/parallel_state.py | 29 +++++++++++---------------- vllm/spec_decode/multi_step_worker.py | 4 ++-- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 2755f261dcbf3..dfbef37db80c0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -695,32 +695,27 @@ def patch_tensor_parallel_group(world_group: GroupCoordinator, tp_group: GroupCoordinator): """Patch the tp group temporarily until this function ends. It requires the world group to be patched together to keep the integrity. - If either world_group or tp_group is None, nothing happens. - - Also it does not allow additional patch during patching, otherwise the - original state, which should be restored, will be lost. Args: - world_group (Optional[GroupCoordinator]): the world group coordinator - tp_group (Optional[GroupCoordinator]): the tp group coordinator + world_group (GroupCoordinator): the world group coordinator + tp_group (GroupCoordinator): the tp group coordinator """ global TP_STATE_PATCHED assert not TP_STATE_PATCHED, "Should not call when it's already patched" - if not TP_STATE_PATCHED: - TP_STATE_PATCHED = True - old_world_group = get_world_group() - old_tp_group = get_tp_group() - global _WORLD, _TP - _WORLD = world_group - _TP = tp_group + + TP_STATE_PATCHED = True + old_world_group = get_world_group() + old_tp_group = get_tp_group() + global _WORLD, _TP + _WORLD = world_group + _TP = tp_group try: yield finally: # restore the original state - if TP_STATE_PATCHED: - TP_STATE_PATCHED = False - _WORLD = old_world_group - _TP = old_tp_group + TP_STATE_PATCHED = False + _WORLD = old_world_group + _TP = old_tp_group def get_tensor_model_parallel_world_size(): diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 298d9fe662988..0c36ef2077958 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -61,9 +61,9 @@ def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): def _patch_tensor_parallel_group(self): """Temporarily patch the global tp group state with its own tp group state. For consistency, it also updates the world group state. - Note that it has no effect when its tp group has not been initialized. + Note that it has no effect when draft_ranks is None. 
""" - if self._tp_group is None: + if self._draft_ranks is None: yield else: return patch_tensor_parallel_group(self._world_group, self._tp_group) From 4a586170762e58e4e67aa30a9c8fb546c99c1226 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 13:36:21 +0900 Subject: [PATCH 092/126] speculative_draft_tensor_parallel_size --- benchmarks/benchmark_latency.py | 7 ++++--- tests/spec_decode/e2e/test_integration_dist.py | 2 +- vllm/config.py | 18 ++++++++++-------- vllm/distributed/parallel_state.py | 3 +++ vllm/engine/arg_utils.py | 8 ++++---- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index faafb8b429507..5101993b0023b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -24,7 +24,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, - speculative_tensor_parallel_size=args.speculative_tensor_parallel_size, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -126,8 +127,8 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-tensor-parallel-size', - '-spec-tp', + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', type=int, default=None) parser.add_argument('--tokenizer', type=str, default=None) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 4fdb1567d51a8..bebb1539438e3 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -94,7 +94,7 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - "speculative_tensor_parallel_size": 1, + "speculative_draft_tensor_parallel_size": 1, }, ]) @pytest.mark.parametrize("batch_size", [2]) diff --git a/vllm/config.py b/vllm/config.py index c49f8d279f8fc..1a12bda59726a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -795,7 +795,7 @@ def maybe_create_spec_config( target_parallel_config: ParallelConfig, target_dtype: str, speculative_model: Optional[str], - speculative_tensor_parallel_size: Optional[int], + speculative_draft_tensor_parallel_size: Optional[int], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, @@ -818,6 +818,8 @@ def maybe_create_spec_config( target_dtype (str): The data type used for the target model. speculative_model (Optional[str]): The name of the speculative model, if provided. + speculative_draft_tensor_parallel_size (Optional[int]): The degree + of the tensor parallelism for the draft model. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. 
speculative_max_model_len (Optional[int]): The maximum model len of @@ -922,7 +924,7 @@ def maybe_create_spec_config( draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_tensor_parallel_size)) + target_parallel_config, speculative_draft_tensor_parallel_size)) return SpeculativeConfig( draft_model_config, @@ -971,25 +973,25 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( target_parallel_config: ParallelConfig, - speculative_tensor_parallel_size: Optional[int]) -> ParallelConfig: + speculative_draft_tensor_parallel_size: Optional[int]) -> ParallelConfig: """Create a parallel config for use by the draft worker. This is mostly a copy of the target parallel config, except the tp_size. """ - if speculative_tensor_parallel_size is None: - speculative_tensor_parallel_size = \ + if speculative_draft_tensor_parallel_size is None: + speculative_draft_tensor_parallel_size = \ target_parallel_config.tensor_parallel_size - if speculative_tensor_parallel_size > \ + if speculative_draft_tensor_parallel_size > \ target_parallel_config.tensor_parallel_size: raise ValueError( - f"{speculative_tensor_parallel_size=} cannot be " + f"{speculative_draft_tensor_parallel_size=} cannot be " f"larger than {target_parallel_config.tensor_parallel_size}") draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. pipeline_parallel_size, - tensor_parallel_size=speculative_tensor_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index dfbef37db80c0..13c7604241c14 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -696,6 +696,9 @@ def patch_tensor_parallel_group(world_group: GroupCoordinator, """Patch the tp group temporarily until this function ends. It requires the world group to be patched together to keep the integrity. + This method is for draft workers of speculative decoding to run draft model + with different tp degree from that of target model workers. + Args: world_group (GroupCoordinator): the world group coordinator tp_group (GroupCoordinator): the tp group coordinator diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index aa37101a01782..c78e4b06d2e06 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -94,7 +94,7 @@ class EngineArgs: guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. 
speculative_model: Optional[str] = None - speculative_tensor_parallel_size: Optional[int] = None + speculative_draft_tensor_parallel_size: Optional[int] = None num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None @@ -543,7 +543,7 @@ def add_cli_args( '--speculative-tensor-parallel-size', '-spec-tp', type=int, - default=EngineArgs.speculative_tensor_parallel_size, + default=EngineArgs.speculative_draft_tensor_parallel_size, help='Number of tensor parallel replicas for ' 'the draft model in speculative decoding.') @@ -695,8 +695,8 @@ def create_engine_config(self, ) -> EngineConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, - speculative_tensor_parallel_size = \ - self.speculative_tensor_parallel_size, + speculative_draft_tensor_parallel_size = \ + self.speculative_draft_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, From b09e7befd1cabfc0ba3390d61686f3a5fb735683 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 14:02:41 +0900 Subject: [PATCH 093/126] ruff, yapf --- vllm/config.py | 8 +++++--- vllm/distributed/parallel_state.py | 7 +++---- vllm/spec_decode/multi_step_worker.py | 6 ++++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1a12bda59726a..6f6e836057fbc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -924,7 +924,8 @@ def maybe_create_spec_config( draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_draft_tensor_parallel_size)) + target_parallel_config, + speculative_draft_tensor_parallel_size)) return SpeculativeConfig( draft_model_config, @@ -972,8 +973,9 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: Optional[int]) -> ParallelConfig: + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int] + ) -> ParallelConfig: """Create a parallel config for use by the draft worker. This is mostly a copy of the target parallel config, except the tp_size. 
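The tp-size resolution implemented by create_draft_parallel_config above can be sketched standalone, independent of vLLM's ParallelConfig; the function below is an illustrative, hypothetical helper (not part of this patch), shown only to make the default-and-validate behaviour concrete.

from typing import Optional

def resolve_draft_tp(target_tp: int, draft_tp: Optional[int]) -> int:
    # An unspecified draft tp defaults to the target model's tp size.
    if draft_tp is None:
        return target_tp
    # The draft model may run on fewer GPUs than the target, never more.
    if draft_tp > target_tp:
        raise ValueError(
            f"draft tp {draft_tp} cannot exceed target tp {target_tp}")
    return draft_tp

assert resolve_draft_tp(4, None) == 4
assert resolve_draft_tp(4, 1) == 1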
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 13c7604241c14..d2a577af5043b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -28,6 +28,7 @@ from unittest.mock import patch import torch +import torch.distributed from torch.distributed import Backend, ProcessGroup import vllm.envs as envs @@ -463,8 +464,7 @@ def get_world_group() -> GroupCoordinator: return _WORLD -def init_world_group(ranks: List[int], - local_rank: int, +def init_world_group(ranks: List[int], local_rank: int, backend: str) -> GroupCoordinator: return GroupCoordinator( group_ranks=[ranks], @@ -475,8 +475,7 @@ def init_world_group(ranks: List[int], ) -def init_model_parallel_group(group_ranks: List[List[int]], - local_rank: int, +def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int, backend: str) -> GroupCoordinator: return GroupCoordinator( group_ranks=group_ranks, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 0c36ef2077958..f255c2a1109f1 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -66,7 +66,8 @@ def _patch_tensor_parallel_group(self): if self._draft_ranks is None: yield else: - return patch_tensor_parallel_group(self._world_group, self._tp_group) + return patch_tensor_parallel_group(self._world_group, + self._tp_group) def init_device(self): if self._is_dummy: @@ -116,7 +117,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: with self._patch_tensor_parallel_group(): return super().determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: if self._is_dummy: return From 7168d78c72bbd8ec5016ff9d313139ddcec19c43 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 14:08:15 +0900 Subject: [PATCH 094/126] remove world group patch --- .../spec_decode/e2e/test_integration_dist.py | 57 ++++++++++++++++++- vllm/distributed/parallel_state.py | 15 +---- vllm/spec_decode/multi_step_worker.py | 10 +--- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index bebb1539438e3..2b183c34a785f 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -99,7 +99,62 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, ]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model(test_llm_generator, +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. 
+ "use_v2_block_manager": True, + "tensor_parallel_size": 4, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_draft_tensor_parallel_size": 3, + }, + { + "speculative_draft_tensor_parallel_size": 2, + }, + { + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_4gpus(test_llm_generator, baseline_llm_generator, batch_size: int): """Verify spec decode works well with smaller tp for draft models. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d2a577af5043b..be79598e70130 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -574,14 +574,9 @@ def init_distributed_environment( global _WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) - if world_size != -1: - assert world_size == len(ranks), ( - f"given world_size ({world_size}) does not match with" - f"world_size of torch ({len(ranks)})") - _WORLD = init_world_group(ranks, local_rank, backend) else: - assert _WORLD.world_size == world_size, ( + assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") @@ -690,8 +685,7 @@ def model_parallel_is_initialized(): @contextmanager -def patch_tensor_parallel_group(world_group: GroupCoordinator, - tp_group: GroupCoordinator): +def patch_tensor_parallel_group(tp_group: GroupCoordinator): """Patch the tp group temporarily until this function ends. It requires the world group to be patched together to keep the integrity. 
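The save/swap/restore discipline that these hunks converge on can be illustrated with a self-contained sketch; every name below is illustrative only and is not a vLLM API. It refuses to nest (mirroring the assertion above) and restores the saved value in a finally block even if the body raises.

from contextlib import contextmanager

_ACTIVE_TP = "target-tp-group"  # stands in for the module-global _TP
_PATCHED = False

@contextmanager
def patch_active_tp(new_group):
    # Disallow nesting so the saved value can never be lost, then swap the
    # global and restore it on exit, even on exceptions.
    global _ACTIVE_TP, _PATCHED
    assert not _PATCHED, "already patched"
    _PATCHED, old = True, _ACTIVE_TP
    _ACTIVE_TP = new_group
    try:
        yield
    finally:
        _PATCHED, _ACTIVE_TP = False, old

with patch_active_tp("draft-tp-group"):
    assert _ACTIVE_TP == "draft-tp-group"
assert _ACTIVE_TP == "target-tp-group"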
@@ -706,17 +700,14 @@ def patch_tensor_parallel_group(world_group: GroupCoordinator, assert not TP_STATE_PATCHED, "Should not call when it's already patched" TP_STATE_PATCHED = True - old_world_group = get_world_group() old_tp_group = get_tp_group() - global _WORLD, _TP - _WORLD = world_group + global _TP _TP = tp_group try: yield finally: # restore the original state TP_STATE_PATCHED = False - _WORLD = old_world_group _TP = old_tp_group diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f255c2a1109f1..52c9fda9f1632 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -47,7 +47,6 @@ def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): self._draft_ranks = draft_ranks self._is_dummy = False - self._world_group = None self._tp_group = None if draft_ranks is not None: @@ -66,8 +65,7 @@ def _patch_tensor_parallel_group(self): if self._draft_ranks is None: yield else: - return patch_tensor_parallel_group(self._world_group, - self._tp_group) + return patch_tensor_parallel_group(self._tp_group) def init_device(self): if self._is_dummy: @@ -75,14 +73,10 @@ def init_device(self): if self._draft_ranks is not None: # creates tp process group containing only a subset of gpu ranks - local_rank = get_world_group().local_rank - world_backend = torch.distributed.get_backend( - get_world_group().device_group) + local_rank = get_tp_group().local_rank tp_backend = torch.distributed.get_backend( get_tp_group().device_group) - self._world_group = init_world_group(self._draft_ranks, local_rank, - world_backend) self._tp_group = init_model_parallel_group([self._draft_ranks], local_rank, tp_backend) From fe0bd5be01f6819fd0727a5f7106dece9c2adf54 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 14:11:00 +0900 Subject: [PATCH 095/126] isort, yapf --- vllm/distributed/parallel_state.py | 8 ++++---- vllm/spec_decode/multi_step_worker.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index be79598e70130..1077b90ba78a1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -631,8 +631,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - _TP = init_model_parallel_group(group_ranks, get_world_group().local_rank, - backend) + _TP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) # Build the pipeline model-parallel groups. 
num_pipeline_model_parallel_groups: int = (world_size // @@ -644,8 +644,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, - backend) + _PP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) def ensure_model_parallel_initialized( diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 52c9fda9f1632..db7c72923b382 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,6 +1,6 @@ -from contextlib import contextmanager import copy import weakref +from contextlib import contextmanager from typing import Dict, List, Optional, Tuple import torch From 2e0d1703df9169e7c7e5a4cab957b3c4190e6398 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 15:06:17 +0900 Subject: [PATCH 096/126] yield fix --- vllm/spec_decode/multi_step_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index db7c72923b382..d21e668dd7da5 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -65,7 +65,7 @@ def _patch_tensor_parallel_group(self): if self._draft_ranks is None: yield else: - return patch_tensor_parallel_group(self._tp_group) + yield patch_tensor_parallel_group(self._tp_group) def init_device(self): if self._is_dummy: From 36f8aa508eeb63074b32800be85573854fa921d9 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 15:23:29 +0900 Subject: [PATCH 097/126] debugging --- vllm/distributed/parallel_state.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 1077b90ba78a1..74195dc098319 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -703,11 +703,13 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator): old_tp_group = get_tp_group() global _TP _TP = tp_group + logger.info(f"Patch tp_group. {old_tp_group.world_size} > {tp_group.world_size}") try: yield finally: # restore the original state TP_STATE_PATCHED = False + logger.info(f"Restore tp_group. {tp_group.world_size} > {old_tp_group.world_size}") _TP = old_tp_group From 54bf5146da9cff98fbbd976615f3e05b081db8f5 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 15:28:24 +0900 Subject: [PATCH 098/126] log --- vllm/spec_decode/multi_step_worker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d21e668dd7da5..b1163bf422f86 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -16,6 +16,9 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker +from vllm.logger import init_logger + +logger = init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): @@ -63,8 +66,10 @@ def _patch_tensor_parallel_group(self): Note that it has no effect when draft_ranks is None. 
""" if self._draft_ranks is None: + logger.info("Do not patch") yield else: + logger.info("Do patch") yield patch_tensor_parallel_group(self._tp_group) def init_device(self): From bfd7d2fc0562a5cb982d5b9b1449e4ad4a0ea52e Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:06:36 +0900 Subject: [PATCH 099/126] reintroduce smaller_tp_proposer_worker --- vllm/spec_decode/multi_step_worker.py | 89 +---------- .../spec_decode/smaller_tp_proposer_worker.py | 143 ++++++++++++++++++ vllm/spec_decode/spec_decode_worker.py | 6 +- 3 files changed, 155 insertions(+), 83 deletions(-) create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index b1163bf422f86..82426f5c08767 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,14 +1,9 @@ import copy import weakref -from contextlib import contextmanager from typing import Dict, List, Optional, Tuple import torch -from vllm.distributed.parallel_state import (get_tp_group, get_world_group, - init_model_parallel_group, - init_world_group, - patch_tensor_parallel_group) from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.interfaces import (SpeculativeProposals, @@ -39,54 +34,16 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, draft_ranks: Optional[List[int]] = None, **kwargs): + def __init__(self, **kwargs): """Create a MultiStepWorker. - - Args: - draft_ranks (Optional[List[int]]): if this value is given, only some - of the GPU ranks in this value participate in draft generation """ super().__init__(**kwargs) - self._draft_ranks = draft_ranks - self._is_dummy = False - self._tp_group = None - - if draft_ranks is not None: - # whether the worker participates in draft generation or not - self._is_dummy = kwargs['rank'] not in draft_ranks - # Lazy initialization list. self._proposer: SpeculativeProposer - @contextmanager - def _patch_tensor_parallel_group(self): - """Temporarily patch the global tp group state with its own tp group - state. For consistency, it also updates the world group state. - Note that it has no effect when draft_ranks is None. 
- """ - if self._draft_ranks is None: - logger.info("Do not patch") - yield - else: - logger.info("Do patch") - yield patch_tensor_parallel_group(self._tp_group) - def init_device(self): - if self._is_dummy: - return - - if self._draft_ranks is not None: - # creates tp process group containing only a subset of gpu ranks - local_rank = get_tp_group().local_rank - tp_backend = torch.distributed.get_backend( - get_tp_group().device_group) - - self._tp_group = init_model_parallel_group([self._draft_ranks], - local_rank, tp_backend) - - with self._patch_tensor_parallel_group(): - super().init_device() + super().init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] @@ -96,33 +53,18 @@ def init_device(self): ) def set_include_gpu_probs_tensor(self): - if self._is_dummy: - return - # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True - def load_model(self): - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - super().load_model() + def load_model(self) -> None: + super().load_model() def determine_num_available_blocks(self) -> Tuple[int, int]: - if self._is_dummy: - return -1, -1 - - with self._patch_tensor_parallel_group(): - return super().determine_num_available_blocks() + return super().determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - super().initialize_cache(num_gpu_blocks, num_cpu_blocks) + super().initialize_cache(num_gpu_blocks, num_cpu_blocks) @torch.inference_mode() def sampler_output( @@ -137,10 +79,6 @@ def sampler_output( For multi step worker, this indicator shall be True. """ - # NOTE: here, neither _patch_tensor_parallel_group() call nor _is_dummy - # check, as it's always called after tp_group has already been - # overridden by get_spec_proposals() - self._raise_if_unsupported(execute_model_req) # Shallow copy input data so modifications (such as appending tokens) @@ -176,11 +114,7 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" - if self._is_dummy: - return SpeculativeProposals(None, None, None) - - with self._patch_tensor_parallel_group(): - return self._proposer.get_spec_proposals(execute_model_req) + return self._proposer.get_spec_proposals(execute_model_req) @staticmethod def _append_new_tokens( @@ -299,14 +233,7 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - if self._is_dummy: - return [] - - with self._patch_tensor_parallel_group(): - return super().execute_model(execute_model_req) + return super().execute_model(execute_model_req) def get_cache_block_size_bytes(self) -> int: - if self._is_dummy: - return 0 - return super().get_cache_block_size_bytes() diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py new file mode 100644 index 0000000000000..2009179735958 --- /dev/null +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -0,0 +1,143 @@ +from typing import List, Optional, Tuple, Union + +import torch + +from vllm.distributed.parallel_state import (get_tp_group, + init_model_parallel_group, + patch_tensor_parallel_group) +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.worker.worker import Worker +from vllm.logger import init_logger + +logger = init_logger(__name__) + +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + This is implemented by changing vLLM's tensor parallel group to a group of + size temporarily during forward passes of draft models. + """ + @classmethod + def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, + target_tensor_parallel_size: int): + """Wrap the worker in a SmallerTpProposerWorker if necessary. + """ + draft_tp = draft_tensor_parallel_size + target_tp = target_tensor_parallel_size + + if draft_tp == target_tp: + return worker + + if draft_tp > target_tp: + raise ValueError( + f"{cls} only supports draft_tp smaller than target_tp." + f"{draft_tp=} {target_tp=}") + + # gpu ranks that will generate draft tokens together + ranks = list(range(draft_tp)) + + logger.info("Wrapping {%s} in {%s}", type(worker), cls) + return cls(worker, ranks) + + def __init__(self, worker: Union[Worker, ProposerWorkerBase], + draft_ranks: List[int]): + """Create a SmallerTpProposerWorker. + + Args: + worker (Union[Worker, ProposerWorkerBase]): _description_ + draft_ranks (List[int]): if this value is given, only some + of the GPU ranks in this value participate in draft generation + """ + self._worker = worker + self._draft_ranks = draft_ranks + + # init during init_device + self._is_dummy = False + self._tp_group = None + + def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. For consistency, it also updates the world group state. + Note that it has no effect when draft_ranks is None. 
+ """ + return patch_tensor_parallel_group(self._tp_group) + + def init_device(self): + + self._is_dummy = get_tp_group().rank not in self._draft_ranks + + # dummy workers do nothing + if self._is_dummy: + return + + # creates tp process group containing only a subset of gpu ranks + local_rank = get_tp_group().local_rank + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) + + with self._patch_tensor_parallel_group(): + self._worker.init_device() + + def set_include_gpu_probs_tensor(self): + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + if self._is_dummy: + return -1, -1 + + with self._patch_tensor_parallel_group(): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + if self._is_dummy: + return SpeculativeProposals(None, None, None) + + with self._patch_tensor_parallel_group(): + return self._worker.get_spec_proposals(execute_model_req) + + @torch.inference_mode() + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self._is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return self._worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self._is_dummy: + return 0 + + return self._worker.get_cache_block_size_bytes() diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 27a35165c6315..de649b3783f70 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -16,6 +16,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_all_seq_ids, get_sampled_token_logprobs, nvtx_range, @@ -110,8 +111,9 @@ def create_worker( draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - ranks = list(range(draft_tp)) if target_tp != draft_tp else None - proposer_worker = MultiStepWorker(ranks, **draft_worker_kwargs) + proposer_worker = MultiStepWorker(**draft_worker_kwargs) + proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( + proposer_worker, draft_tp, target_tp) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From f3374281e0dd676dca3ead0e95344386c83a62ad Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:14:17 +0900 Subject: [PATCH 100/126] add lora methods --- 
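Design note on the wrapper this patch extends: SmallerTpProposerWorker is a thin delegator. Most of its methods return a cheap default on non-participating ("dummy") ranks and otherwise forward to the wrapped MultiStepWorker, usually under the patched tp group. A minimal standalone sketch of that gating follows; the class and names are illustrative only, not vLLM's API.

class DummyAwareWrapper:
    """Delegate to an inner worker; ranks outside draft_ranks become no-ops."""

    def __init__(self, worker, draft_ranks, rank):
        self._worker = worker
        self._is_dummy = rank not in draft_ranks

    def load_model(self):
        if self._is_dummy:
            return  # non-participating ranks skip the heavy work
        self._worker.load_model()

    def determine_num_available_blocks(self):
        if self._is_dummy:
            return -1, -1  # sentinel; real numbers come from draft ranks
        return self._worker.determine_num_available_blocks()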
vllm/spec_decode/smaller_tp_proposer_worker.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 2009179735958..052958237ded8 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,10 +1,11 @@ -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, patch_tensor_parallel_group) +from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase @@ -141,3 +142,12 @@ def get_cache_block_size_bytes(self) -> int: return 0 return self._worker.get_cache_block_size_bytes() + + def add_lora(self, lora_request: LoRARequest) -> bool: + return self._worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self._worker.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self._worker.list_loras() From 4654b9f80f29b6a6e39f9663f29caac43943f4a8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:18:48 +0900 Subject: [PATCH 101/126] missing method --- vllm/spec_decode/smaller_tp_proposer_worker.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 052958237ded8..8c38961229345 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -113,6 +113,14 @@ def initialize_cache(self, num_gpu_blocks: int, with self._patch_tensor_parallel_group(): self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + # Do not check _is_dummy, as it's always called by get_spec_proposals + return self._worker.sampler_output(execute_model_req, sample_len) + def get_spec_proposals( self, execute_model_req: ExecuteModelRequest, @@ -126,7 +134,6 @@ def get_spec_proposals( with self._patch_tensor_parallel_group(): return self._worker.get_spec_proposals(execute_model_req) - @torch.inference_mode() def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -151,3 +158,7 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self._worker.list_loras() + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size From e39926eddf9b16b32edbbcd5382620a998cc6765 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:32:43 +0900 Subject: [PATCH 102/126] remove world group related logics --- vllm/distributed/parallel_state.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 74195dc098319..37c6c704f9dea 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -687,13 +687,11 @@ def model_parallel_is_initialized(): @contextmanager def patch_tensor_parallel_group(tp_group: GroupCoordinator): """Patch the tp group temporarily until this function ends. - It requires the world group to be patched together to keep the integrity. 
This method is for draft workers of speculative decoding to run draft model with different tp degree from that of target model workers. Args: - world_group (GroupCoordinator): the world group coordinator tp_group (GroupCoordinator): the tp group coordinator """ global TP_STATE_PATCHED @@ -703,13 +701,11 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator): old_tp_group = get_tp_group() global _TP _TP = tp_group - logger.info(f"Patch tp_group. {old_tp_group.world_size} > {tp_group.world_size}") try: yield finally: # restore the original state TP_STATE_PATCHED = False - logger.info(f"Restore tp_group. {tp_group.world_size} > {old_tp_group.world_size}") _TP = old_tp_group From 1c6eefde18b9daaf5a739f6f336f3876cd1594cb Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:33:18 +0900 Subject: [PATCH 103/126] Always wrapping MultiStepWorker --- vllm/spec_decode/smaller_tp_proposer_worker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8c38961229345..b586ba1cab901 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple import torch @@ -9,11 +9,12 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.logger import init_logger logger = init_logger(__name__) + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. @@ -44,12 +45,12 @@ def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, logger.info("Wrapping {%s} in {%s}", type(worker), cls) return cls(worker, ranks) - def __init__(self, worker: Union[Worker, ProposerWorkerBase], + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): """Create a SmallerTpProposerWorker. Args: - worker (Union[Worker, ProposerWorkerBase]): _description_ + worker (MultiStepWorker): an actual worker wrapped with this class draft_ranks (List[int]): if this value is given, only some of the GPU ranks in this value participate in draft generation """ @@ -62,8 +63,7 @@ def __init__(self, worker: Union[Worker, ProposerWorkerBase], def _patch_tensor_parallel_group(self): """Temporarily patch the global tp group state with its own tp group - state. For consistency, it also updates the world group state. - Note that it has no effect when draft_ranks is None. + state. 
""" return patch_tensor_parallel_group(self._tp_group) From f2d2ee5e5314be9ccd2699cc84270d497aa19029 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:34:04 +0900 Subject: [PATCH 104/126] remove unused logger --- vllm/spec_decode/multi_step_worker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 82426f5c08767..1751451498643 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -11,9 +11,6 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker -from vllm.logger import init_logger - -logger = init_logger(__name__) class MultiStepWorker(Worker, ProposerWorkerBase): From 302955c69c53d2b1901fd76435d68d1153b3320f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:34:31 +0900 Subject: [PATCH 105/126] isort. minor rename --- tests/spec_decode/e2e/test_integration_dist.py | 2 +- vllm/spec_decode/smaller_tp_proposer_worker.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 2b183c34a785f..b3b53ff651a36 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -154,7 +154,7 @@ def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, ]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_4gpus(test_llm_generator, +def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, baseline_llm_generator, batch_size: int): """Verify spec decode works well with smaller tp for draft models. diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index b586ba1cab901..0d250c6f2eb8f 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -5,12 +5,12 @@ from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, patch_tensor_parallel_group) +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.logger import init_logger +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase logger = init_logger(__name__) From 3d4754e91d21e11aaee25904dde1761a563ba0bb Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 16:53:25 +0900 Subject: [PATCH 106/126] LoraNotSupported. return type --- vllm/spec_decode/multi_step_worker.py | 4 +- vllm/spec_decode/proposer_worker_base.py | 4 +- .../spec_decode/smaller_tp_proposer_worker.py | 41 ++++++------------- 3 files changed, 16 insertions(+), 33 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 1751451498643..1f8034918891c 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -39,7 +39,7 @@ def __init__(self, **kwargs): # Lazy initialization list. 
self._proposer: SpeculativeProposer - def init_device(self): + def init_device(self) -> None: super().init_device() self._proposer = Top1Proposer( @@ -49,7 +49,7 @@ def init_device(self): max_proposal_len=self.max_model_len, ) - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fd67ceb912eee..b691659fb292b 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -3,10 +3,10 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class ProposerWorkerBase(WorkerBase, SpeculativeProposer): +class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer): """Interface for proposer workers""" @abstractmethod diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 0d250c6f2eb8f..55a0ce65c8590 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -6,7 +6,6 @@ init_model_parallel_group, patch_tensor_parallel_group) from vllm.logger import init_logger -from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.multi_step_worker import MultiStepWorker @@ -20,30 +19,24 @@ class SmallerTpProposerWorker(ProposerWorkerBase): parallel degree than target model. This reduces the communication overhead of small draft models. - This is implemented by changing vLLM's tensor parallel group to a group of - size temporarily during forward passes of draft models. + To implement this feature, this class differs behavior based on is_dummy + flag, where dummy means worker that does not participate draft generation. + Participating workers use a smaller tp group by patching vLLM's tensor + parallel group temporarily during forward passes of draft models. """ @classmethod def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, target_tensor_parallel_size: int): """Wrap the worker in a SmallerTpProposerWorker if necessary. """ - draft_tp = draft_tensor_parallel_size - target_tp = target_tensor_parallel_size - - if draft_tp == target_tp: + if draft_tensor_parallel_size == target_tensor_parallel_size: return worker - if draft_tp > target_tp: - raise ValueError( - f"{cls} only supports draft_tp smaller than target_tp." 
- f"{draft_tp=} {target_tp=}") - # gpu ranks that will generate draft tokens together - ranks = list(range(draft_tp)) + draft_ranks = list(range(draft_tensor_parallel_size)) logger.info("Wrapping {%s} in {%s}", type(worker), cls) - return cls(worker, ranks) + return cls(worker, draft_ranks) def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): @@ -51,8 +44,8 @@ def __init__(self, worker: MultiStepWorker, Args: worker (MultiStepWorker): an actual worker wrapped with this class - draft_ranks (List[int]): if this value is given, only some - of the GPU ranks in this value participate in draft generation + draft_ranks (List[int]): if this value is given, only the GPU ranks + written in this value participate in draft generation """ self._worker = worker self._draft_ranks = draft_ranks @@ -67,10 +60,9 @@ def _patch_tensor_parallel_group(self): """ return patch_tensor_parallel_group(self._tp_group) - def init_device(self): - + def init_device(self) -> None: self._is_dummy = get_tp_group().rank not in self._draft_ranks - + # dummy workers do nothing if self._is_dummy: return @@ -84,7 +76,7 @@ def init_device(self): with self._patch_tensor_parallel_group(): self._worker.init_device() - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: if self._is_dummy: return @@ -150,15 +142,6 @@ def get_cache_block_size_bytes(self) -> int: return self._worker.get_cache_block_size_bytes() - def add_lora(self, lora_request: LoRARequest) -> bool: - return self._worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self._worker.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self._worker.list_loras() - @property def vocab_size(self) -> int: return self._worker.vocab_size From 620b224c48b8d1c1be7f1a6a44aa68bb8a3595b6 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 17:08:01 +0900 Subject: [PATCH 107/126] yapf, ruff --- tests/spec_decode/e2e/test_integration_dist.py | 8 ++++---- vllm/spec_decode/smaller_tp_proposer_worker.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index b3b53ff651a36..1cfd4ef047724 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -100,8 +100,8 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, - baseline_llm_generator, - batch_size: int): + baseline_llm_generator, + batch_size: int): """Verify spec decode works well with smaller tp for draft models. """ run_greedy_equality_correctness_test(baseline_llm_generator, @@ -155,8 +155,8 @@ def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, - baseline_llm_generator, - batch_size: int): + baseline_llm_generator, + batch_size: int): """Verify spec decode works well with smaller tp for draft models. 
""" run_greedy_equality_correctness_test(baseline_llm_generator, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 55a0ce65c8590..6554b91fafa2e 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Set, Tuple +from typing import List, Optional, Tuple import torch @@ -24,6 +24,7 @@ class SmallerTpProposerWorker(ProposerWorkerBase): Participating workers use a smaller tp group by patching vLLM's tensor parallel group temporarily during forward passes of draft models. """ + @classmethod def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, target_tensor_parallel_size: int): @@ -38,8 +39,7 @@ def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, logger.info("Wrapping {%s} in {%s}", type(worker), cls) return cls(worker, draft_ranks) - def __init__(self, worker: MultiStepWorker, - draft_ranks: List[int]): + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): """Create a SmallerTpProposerWorker. Args: @@ -71,7 +71,7 @@ def init_device(self) -> None: local_rank = get_tp_group().local_rank tp_backend = torch.distributed.get_backend(get_tp_group().device_group) self._tp_group = init_model_parallel_group([self._draft_ranks], - local_rank, tp_backend) + local_rank, tp_backend) with self._patch_tensor_parallel_group(): self._worker.init_device() From b245d3cdaa53bcbcc111c4bc1b03d45d52fde27c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 17:19:16 +0900 Subject: [PATCH 108/126] add skip_spec_test --- .../spec_decode/e2e/test_integration_dist.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index 1cfd4ef047724..ecca1fd0ac67f 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -164,3 +164,70 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-160m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 4, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. 
+ "speculative_max_model_len": 32, + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_draft_tensor_parallel_size": 3, + }, + { + "speculative_draft_tensor_parallel_size": 2, + }, + { + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize( + "output_len", + [ + # This must be a good bit larger than speculative_max_model_len so that + # we can test the case where all seqs are skipped, but still small to + # ensure fast test. + 64, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_skip_speculation(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when some (or all) sequences skip speculation. + We do this by setting the max model len of the draft model to an + artificially low value, such that when the sequences grow beyond it, they + are skipped in speculative decoding. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) From 1e71e98921cb79eae327d5e7b1732b048293b271 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 17:32:54 +0900 Subject: [PATCH 109/126] remove spec-tp 3 case --- tests/spec_decode/e2e/test_integration_dist.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist.py index ecca1fd0ac67f..399a61f5f28c4 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist.py @@ -142,9 +142,6 @@ def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_draft_tensor_parallel_size": 3, - }, { "speculative_draft_tensor_parallel_size": 2, }, @@ -199,9 +196,6 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_draft_tensor_parallel_size": 3, - }, { "speculative_draft_tensor_parallel_size": 2, }, From a01c00d98bcfde86d9e6b1a04e293e0361aedfa1 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Fri, 21 Jun 2024 17:39:01 +0900 Subject: [PATCH 110/126] spec-draft-tp --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c78e4b06d2e06..2e691e52fc183 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -540,8 +540,8 @@ def add_cli_args( help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') parser.add_argument( - '--speculative-tensor-parallel-size', - '-spec-tp', + '--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', type=int, default=EngineArgs.speculative_draft_tensor_parallel_size, help='Number of tensor parallel replicas for ' From debffc296978741e711720a5e777c7d173cb6ba7 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 09:57:53 +0900 Subject: [PATCH 111/126] _TP_STATE_PATCHED --- vllm/distributed/parallel_state.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 37c6c704f9dea..06b07b050d6c7 100644 --- a/vllm/distributed/parallel_state.py +++ 
b/vllm/distributed/parallel_state.py @@ -681,7 +681,7 @@ def model_parallel_is_initialized(): return (_TP is not None and _PP is not None) -TP_STATE_PATCHED = False +_TP_STATE_PATCHED = False @contextmanager @@ -694,10 +694,10 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator): Args: tp_group (GroupCoordinator): the tp group coordinator """ - global TP_STATE_PATCHED - assert not TP_STATE_PATCHED, "Should not call when it's already patched" + global _TP_STATE_PATCHED + assert not _TP_STATE_PATCHED, "Should not call when it's already patched" - TP_STATE_PATCHED = True + _TP_STATE_PATCHED = True old_tp_group = get_tp_group() global _TP _TP = tp_group @@ -705,7 +705,7 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator): yield finally: # restore the original state - TP_STATE_PATCHED = False + _TP_STATE_PATCHED = False _TP = old_tp_group From 39fe67f55d948a667d7c8ac5cdf807bf791246f0 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 09:58:07 +0900 Subject: [PATCH 112/126] remove stale comment --- vllm/spec_decode/multi_step_worker.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 1f8034918891c..4aae90b31411f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -19,12 +19,6 @@ class MultiStepWorker(Worker, ProposerWorkerBase): allocated enough space to store the additional KV. This reduces overhead by invoking the scheduler less. - In addition, it allows a speculative draft model to run with smaller tensor - parallel degree than target model. This is implemented by changing vLLM's - tensor parallel group to a group of the small size temporarily during - forward passes of draft models. This reduces the communication overhead of - small draft models. - The MultiStepWorker does not support cache swap operations, or beam search. Cache swap operations do not require large modifications. 
On the other hand, beam search requires memory allocations during sequence forks and thus From af1b0be348bdcb3644c49945fb419bfca189ef6f Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:04:05 +0900 Subject: [PATCH 113/126] dist_tp2, dist_tp4 tests --- .buildkite/test-pipeline.yaml | 3 +- .../e2e/test_integration_dist_tp2.py | 111 ++++++++++++++++++ ...n_dist.py => test_integration_dist_tp4.py} | 103 ---------------- 3 files changed, 113 insertions(+), 104 deletions(-) create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (54%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5afe3730210e8..a4ba379d79682 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,7 +46,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -60,6 +60,7 @@ steps: # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py new file mode 100644 index 0000000000000..5534b80c0aaa0 --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -0,0 +1,111 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from vllm.utils import is_hip + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. 
+ "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when tensor parallelism is used. + """ + if is_hip(): + pytest.skip("hip is not well-supported yet") + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py similarity index 54% rename from tests/spec_decode/e2e/test_integration_dist.py rename to tests/spec_decode/e2e/test_integration_dist_tp4.py index 399a61f5f28c4..2160b3aa8b7f2 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -5,112 +5,9 @@ import pytest import torch -from vllm.utils import is_hip - from .conftest import run_greedy_equality_correctness_test -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "tensor_parallel_size": 2, - - # Use AsyncLLM engine, so that the engine runs in its own process. 
- # Otherwise, since vLLM does not follow true SPMD, the test runner - # process will have both the engine and the rank0 worker. NCCL is not - # cleaned up properly, and its server host thread leaks, causing the - # second run of the test to fail with internal NCCL error. - "use_async": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }, - { - "speculative_model": "[ngram]", - "num_speculative_tokens": 5, - "ngram_prompt_lookup_max": 3, - }, -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when tensor parallelism is used. - """ - if is_hip(): - pytest.skip("hip is not well-supported yet") - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - # Note this is repeated in the test body; to initialize a tokenizer. - "model": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "tensor_parallel_size": 2, - - # Use AsyncLLM engine, so that the engine runs in its own process. - # Otherwise, since vLLM does not follow true SPMD, the test runner - # process will have both the engine and the rank0 worker. NCCL is not - # cleaned up properly, and its server host thread leaks, causing the - # second run of the test to fail with internal NCCL error. - "use_async": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "speculative_draft_tensor_parallel_size": 1, - }, -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, - baseline_llm_generator, - batch_size: int): - """Verify spec decode works well with smaller tp for draft models. 
- """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=32, - force_output_len=True) - - @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") @pytest.mark.parametrize( From 834c6e013923d22721f46b3dd9865075a9a0da4a Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:16:45 +0900 Subject: [PATCH 114/126] remove unnecessary overriding methods --- vllm/spec_decode/multi_step_worker.py | 29 ++++----------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4aae90b31411f..e469fd7c3a160 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,6 +1,6 @@ import copy import weakref -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Tuple import torch @@ -25,10 +25,8 @@ class MultiStepWorker(Worker, ProposerWorkerBase): requires more thought for MultiStepWorker support. """ - def __init__(self, **kwargs): - """Create a MultiStepWorker. - """ - super().__init__(**kwargs) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) # Lazy initialization list. self._proposer: SpeculativeProposer @@ -47,16 +45,6 @@ def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True - def load_model(self) -> None: - super().load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - return super().determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - super().initialize_cache(num_gpu_blocks, num_cpu_blocks) - @torch.inference_mode() def sampler_output( self, @@ -105,6 +93,7 @@ def get_spec_proposals( """Produce speculations given an input batch of sequences. The number of speculative tokens per sequence is determined by max_proposal_len. 
""" + return self._proposer.get_spec_proposals(execute_model_req) @staticmethod @@ -218,13 +207,3 @@ def _raise_if_unsupported( execute_model_req.seq_group_metadata_list): raise NotImplementedError( "MultiStepWorker does not support beam search.") - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - return super().execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - return super().get_cache_block_size_bytes() From 5bc2bc304ada15cd96074834f449738a2179fa4b Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:16:52 +0900 Subject: [PATCH 115/126] comment --- vllm/spec_decode/smaller_tp_proposer_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 6554b91fafa2e..c42430a6f3670 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -138,6 +138,7 @@ def execute_model( def get_cache_block_size_bytes(self) -> int: if self._is_dummy: + # by returning zero, target worker can use the entire kv cache space return 0 return self._worker.get_cache_block_size_bytes() From 874036956cc059742385bef21fe329b90569a444 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:29:03 +0900 Subject: [PATCH 116/126] yapf --- tests/spec_decode/e2e/test_integration_dist_tp4.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 2160b3aa8b7f2..9c1cd95d7618b 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -80,17 +80,16 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, # cleaned up properly, and its server host thread leaks, causing the # second run of the test to fail with internal NCCL error. "use_async": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [{ "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, # Artificially limit the draft model max model len; this forces vLLM # to skip speculation once the sequences grow beyond 32-k tokens. 
"speculative_max_model_len": 32, - }, -]) + }]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { From 4d82ca1ed07457710739af645914f5239c73e464 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:29:07 +0900 Subject: [PATCH 117/126] comment --- vllm/spec_decode/smaller_tp_proposer_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index c42430a6f3670..b78e4489513f7 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -92,6 +92,7 @@ def load_model(self) -> None: def determine_num_available_blocks(self) -> Tuple[int, int]: if self._is_dummy: + # this case is not used now return -1, -1 with self._patch_tensor_parallel_group(): From 7bf831cf76a237aeed8986263b6cfdc56481eebf Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 10:39:20 +0900 Subject: [PATCH 118/126] undo change in test utils --- tests/spec_decode/utils.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 2729bc57c5462..ce5b347832c30 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -12,7 +12,6 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceOutput) -from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker @@ -67,7 +66,6 @@ def create_worker(cls: Callable[..., T], num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - draft_ranks: Optional[List[int]] = None, enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, @@ -80,24 +78,18 @@ def create_worker(cls: Callable[..., T], distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - worker_kwargs = { - 'model_config': engine_config.model_config, - 'parallel_config': engine_config.parallel_config, - 'scheduler_config': engine_config.scheduler_config, - 'device_config': engine_config.device_config, - 'cache_config': engine_config.cache_config, - 'load_config': engine_config.load_config, - 'local_rank': 0, - 'rank': 0, - 'distributed_init_method': distributed_init_method, - 'is_driver_worker': is_driver_worker, - } - - if draft_ranks is not None: - assert cls is MultiStepWorker, "draft_ranks arg is for MultiStepWorker" - worker_kwargs['draft_ranks'] = draft_ranks - - worker = cls(**worker_kwargs) + worker = cls( + model_config=engine_config.model_config, + parallel_config=engine_config.parallel_config, + scheduler_config=engine_config.scheduler_config, + device_config=engine_config.device_config, + cache_config=engine_config.cache_config, + load_config=engine_config.load_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) worker.init_device() worker.load_model() From 3fccc7602dc229e1e90d0a39dfaae74928081279 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Mon, 24 Jun 2024 15:56:34 +0900 Subject: [PATCH 119/126] remove test_skip_speculation --- .../e2e/test_integration_dist_tp4.py | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py 
b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 9c1cd95d7618b..5019e1bb40a1d 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -58,66 +58,3 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "tensor_parallel_size": 4, - - # Use AsyncLLM engine, so that the engine runs in its own process. - # Otherwise, since vLLM does not follow true SPMD, the test runner - # process will have both the engine and the rank0 worker. NCCL is not - # cleaned up properly, and its server host thread leaks, causing the - # second run of the test to fail with internal NCCL error. - "use_async": True, - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [{ - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - }]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_draft_tensor_parallel_size": 2, - }, - { - "speculative_draft_tensor_parallel_size": 1, - }, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when some (or all) sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. 
- """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) From e8d0e93b5575dfc413df8627f34ec9a5ec38c31c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 11:03:38 +0900 Subject: [PATCH 120/126] tp4 test only for spec_tp1 --- .../e2e/test_integration_dist_tp4.py | 66 ++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 5019e1bb40a1d..a38d99d1ad1eb 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -39,9 +39,6 @@ ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_draft_tensor_parallel_size": 2, - }, { "speculative_draft_tensor_parallel_size": 1, }, @@ -58,3 +55,66 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-160m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 4, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. + "speculative_max_model_len": 32, + }, + ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs",[ + { + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize( + "output_len", + [ + # This must be a good bit larger than speculative_max_model_len so that + # we can test the case where all seqs are skipped, but still small to + # ensure fast test. + 64, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_skip_speculation(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when some (or all) sequences skip speculation. + We do this by setting the max model len of the draft model to an + artificially low value, such that when the sequences grow beyond it, they + are skipped in speculative decoding. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) From 91c2e436757b2fc703c2e46651181c3b56f2adfd Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 11:15:56 +0900 Subject: [PATCH 121/126] allow only value 1 for spec_tp --- vllm/config.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6f6e836057fbc..e6324a24ae520 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -983,12 +983,11 @@ def create_draft_parallel_config( if speculative_draft_tensor_parallel_size is None: speculative_draft_tensor_parallel_size = \ target_parallel_config.tensor_parallel_size - - if speculative_draft_tensor_parallel_size > \ - target_parallel_config.tensor_parallel_size: + elif speculative_draft_tensor_parallel_size != 1: raise ValueError( - f"{speculative_draft_tensor_parallel_size=} cannot be " - f"larger than {target_parallel_config.tensor_parallel_size}") + f"{speculative_draft_tensor_parallel_size=} cannot be" + f"other value than 1" + ) draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. From fac7e68d850aaa72351f0cb95b887d38861f0b21 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 11:18:09 +0900 Subject: [PATCH 122/126] yapf --- tests/spec_decode/e2e/test_integration_dist_tp4.py | 2 +- vllm/config.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index a38d99d1ad1eb..972490b311502 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -91,7 +91,7 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs",[ +@pytest.mark.parametrize("test_llm_kwargs", [ { "speculative_draft_tensor_parallel_size": 1, }, diff --git a/vllm/config.py b/vllm/config.py index e6324a24ae520..f4c8dbce6caba 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -986,8 +986,7 @@ def create_draft_parallel_config( elif speculative_draft_tensor_parallel_size != 1: raise ValueError( f"{speculative_draft_tensor_parallel_size=} cannot be" - f"other value than 1" - ) + f"other value than 1") draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. 
From 271822e2858793c216d8a26bbe821670ec4bfe46 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 13:12:07 +0900 Subject: [PATCH 123/126] add todo comment --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index f4c8dbce6caba..c9877f2a491a3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -984,6 +984,7 @@ def create_draft_parallel_config( speculative_draft_tensor_parallel_size = \ target_parallel_config.tensor_parallel_size elif speculative_draft_tensor_parallel_size != 1: + # TODO(wooyeon): allow tp values larger than 1 raise ValueError( f"{speculative_draft_tensor_parallel_size=} cannot be" f"other value than 1") From ae0d7f1c013761cd983d17273ebe51f7601bab3c Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 13:13:13 +0900 Subject: [PATCH 124/126] add tests for check that test_skip fails even there's no spec_draft_tp option --- .../e2e/test_integration_dist_tp2.py | 66 +++++++++++++++++++ .../e2e/test_integration_dist_tp4.py | 27 ++++---- 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 5534b80c0aaa0..eeb21cf30c394 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -109,3 +109,69 @@ def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-160m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. + "speculative_max_model_len": 32, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. + "speculative_max_model_len": 32, + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize( + "output_len", + [ + # This must be a good bit larger than speculative_max_model_len so that + # we can test the case where all seqs are skipped, but still small to + # ensure fast test. + 64, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_skip_speculation(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when some (or all) sequences skip speculation. 
+ We do this by setting the max model len of the draft model to an + artificially low value, such that when the sequences grow beyond it, they + are skipped in speculative decoding. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 972490b311502..4b3134e1e3f27 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -78,21 +78,24 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, # second run of the test to fail with internal NCCL error. "use_async": True, }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - }, - ]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. + "speculative_max_model_len": 32, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. + "speculative_max_model_len": 32, "speculative_draft_tensor_parallel_size": 1, }, ]) From b84a07076971d30524825f853e1649d6b6eff805 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 14:56:30 +0900 Subject: [PATCH 125/126] remove test_skip_speculation from dist tests --- .../e2e/test_integration_dist_tp2.py | 66 ------------------ .../e2e/test_integration_dist_tp4.py | 67 +------------------ 2 files changed, 1 insertion(+), 132 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index eeb21cf30c394..5534b80c0aaa0 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -109,69 +109,3 @@ def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "tensor_parallel_size": 2, - - # Use AsyncLLM engine, so that the engine runs in its own process. - # Otherwise, since vLLM does not follow true SPMD, the test runner - # process will have both the engine and the rank0 worker. NCCL is not - # cleaned up properly, and its server host thread leaks, causing the - # second run of the test to fail with internal NCCL error. 
- "use_async": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - "speculative_draft_tensor_parallel_size": 1, - }, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when some (or all) sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. - """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 4b3134e1e3f27..14c41c8ad7030 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -39,6 +39,7 @@ ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ + #TODO(wooyeon): add spec_draft_dp=2 case { "speculative_draft_tensor_parallel_size": 1, }, @@ -55,69 +56,3 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, batch_size, max_output_len=32, force_output_len=True) - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "tensor_parallel_size": 4, - - # Use AsyncLLM engine, so that the engine runs in its own process. - # Otherwise, since vLLM does not follow true SPMD, the test runner - # process will have both the engine and the rank0 worker. NCCL is not - # cleaned up properly, and its server host thread leaks, causing the - # second run of the test to fail with internal NCCL error. - "use_async": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. 
- "speculative_max_model_len": 32, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - "speculative_draft_tensor_parallel_size": 1, - }, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when some (or all) sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. - """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) From 86fda24d9b9006fb08ac1624dac8d8d495f44db8 Mon Sep 17 00:00:00 2001 From: Wooyeon Lee Date: Tue, 25 Jun 2024 15:00:49 +0900 Subject: [PATCH 126/126] yapf --- tests/spec_decode/e2e/test_integration_dist_tp4.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 14c41c8ad7030..56cb0147d9e4f 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -38,12 +38,14 @@ }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - #TODO(wooyeon): add spec_draft_dp=2 case - { - "speculative_draft_tensor_parallel_size": 1, - }, -]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + #TODO(wooyeon): add spec_draft_dp=2 case + { + "speculative_draft_tensor_parallel_size": 1, + }, + ]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
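
Taken together, the series leaves the draft model running on a strictly smaller tensor-parallel group than the target model, with the ranks outside that group acting as no-op placeholders. The sketch below illustrates that rank-partitioning idea and why the dummy ranks report zero KV-cache demand (cf. the comment added in PATCH 115); plan_draft_ranks, is_dummy, and draft_cache_bytes are hypothetical names, not vLLM symbols.

    from typing import List

    def plan_draft_ranks(draft_tp: int, target_tp: int) -> List[int]:
        # Assume the draft group occupies the first draft_tp ranks of the target group.
        assert 0 < draft_tp <= target_tp
        return list(range(draft_tp))

    def is_dummy(rank: int, draft_ranks: List[int]) -> bool:
        # Ranks outside the draft group never load or run the draft model.
        return rank not in draft_ranks

    def draft_cache_bytes(rank: int, draft_ranks: List[int], real_bytes: int) -> int:
        # Dummy ranks report zero so the target worker can use the entire KV cache space.
        return 0 if is_dummy(rank, draft_ranks) else real_bytes

    draft_ranks = plan_draft_ranks(draft_tp=1, target_tp=4)      # -> [0]
    print([is_dummy(r, draft_ranks) for r in range(4)])          # -> [False, True, True, True]
    print([draft_cache_bytes(r, draft_ranks, 1 << 20) for r in range(4)])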