[Speculative Decoding] Support draft model on different tensor-parallel size than target model #5414

Merged Jun 25, 2024 (131 commits)
Changes from 89 commits
f5b5f94
tp1 draft worker
wooyeonlee0 Jun 10, 2024
709de21
refactor singlt_tp_worker
wooyeonlee0 Jun 10, 2024
0eacc96
update execute_model logic
wooyeonlee0 Jun 10, 2024
2011ed0
fix
wooyeonlee0 Jun 11, 2024
2e16c4e
DummyProposerWorker
wooyeonlee0 Jun 11, 2024
b412a51
fix
wooyeonlee0 Jun 11, 2024
593ccfa
init only partial workers
wooyeonlee0 Jun 11, 2024
c5d3476
Use multi_step_worker logic
wooyeonlee0 Jun 12, 2024
44e623b
self._patch_tp_group
wooyeonlee0 Jun 12, 2024
98caf17
refactor it to support other draft-tp than 1
wooyeonlee0 Jun 12, 2024
7fc4ff5
spec-tp configuarable
wooyeonlee0 Jun 12, 2024
a96e720
ngram worker support test
wooyeonlee0 Jun 12, 2024
db39576
minor refine
wooyeonlee0 Jun 12, 2024
b2e8595
cleanup
wooyeonlee0 Jun 12, 2024
756442a
return type fix
wooyeonlee0 Jun 12, 2024
32094f1
cleanup
wooyeonlee0 Jun 12, 2024
7890191
cleanup
wooyeonlee0 Jun 12, 2024
53b2ea9
typo
wooyeonlee0 Jun 12, 2024
a29c9c5
verify arg
wooyeonlee0 Jun 12, 2024
52ba09d
remove testing code
wooyeonlee0 Jun 12, 2024
d26ef08
cleanup
wooyeonlee0 Jun 12, 2024
80c4994
rename module
wooyeonlee0 Jun 12, 2024
0f16f3f
cleanup
wooyeonlee0 Jun 12, 2024
140f478
cleanup
wooyeonlee0 Jun 12, 2024
3fd7e91
remove unnecessary methods
wooyeonlee0 Jun 12, 2024
495aa30
fix
wooyeonlee0 Jun 12, 2024
3a5a47f
undo unrelated changes
wooyeonlee0 Jun 12, 2024
07ddbb8
minor fix
wooyeonlee0 Jun 12, 2024
b0a677d
fix ruff errors
wooyeonlee0 Jun 12, 2024
96782a2
Merge branch 'main' into spec-tp1-draft
wooyeonlee0 Jun 12, 2024
9998b9c
typo
wooyeonlee0 Jun 12, 2024
e92ecdc
temporal fix
wooyeonlee0 Jun 12, 2024
b421607
formatting
wooyeonlee0 Jun 12, 2024
386ab9b
isort
wooyeonlee0 Jun 12, 2024
b25f74e
line length
wooyeonlee0 Jun 12, 2024
8b51f08
fix
wooyeonlee0 Jun 13, 2024
d4b283c
Merge remote-tracking branch 'origin' into spec-tp1-draft
wooyeonlee0 Jun 13, 2024
dfc90cb
line length
wooyeonlee0 Jun 13, 2024
9bef5e4
comment
wooyeonlee0 Jun 13, 2024
85d087d
add type hint
wooyeonlee0 Jun 13, 2024
9af36b7
isort
wooyeonlee0 Jun 13, 2024
5a0bf45
add more type hints
wooyeonlee0 Jun 13, 2024
531c9f0
fix
wooyeonlee0 Jun 13, 2024
287da20
test
wooyeonlee0 Jun 13, 2024
08d1b2a
nit
wooyeonlee0 Jun 13, 2024
237c966
fix yapf
wooyeonlee0 Jun 13, 2024
0bb38c2
fix
wooyeonlee0 Jun 13, 2024
c097d6c
fix
wooyeonlee0 Jun 13, 2024
957a325
fix
wooyeonlee0 Jun 13, 2024
3ec8cb5
Merge remote-tracking branch 'origin' into spec-tp1-draft
wooyeonlee0 Jun 14, 2024
8a8a1e4
add comments
wooyeonlee0 Jun 14, 2024
7f06f64
combine smaller_tp_worker logic into multi_step_worker
wooyeonlee0 Jun 14, 2024
1e87579
fix
wooyeonlee0 Jun 14, 2024
abc546c
fix
wooyeonlee0 Jun 14, 2024
7880cb0
add small_tp correctness test
wooyeonlee0 Jun 14, 2024
2ebe6f3
nit
wooyeonlee0 Jun 14, 2024
90d46ee
fix
wooyeonlee0 Jun 14, 2024
7e1426c
refactor. remove log
wooyeonlee0 Jun 14, 2024
ad52d93
remove return
wooyeonlee0 Jun 14, 2024
355475b
fix
wooyeonlee0 Jun 14, 2024
9cfdb5b
fix about context managing
wooyeonlee0 Jun 14, 2024
6a6c5ff
nit
wooyeonlee0 Jun 14, 2024
ddef229
consistent condition. if self._is_dummy:
wooyeonlee0 Jun 14, 2024
965f648
fix ruff errors
wooyeonlee0 Jun 14, 2024
1bb5534
isort
wooyeonlee0 Jun 14, 2024
ea6b8f5
fix yapf
wooyeonlee0 Jun 14, 2024
71977d2
undo ngramworker support
wooyeonlee0 Jun 14, 2024
bc5f77a
add comment
wooyeonlee0 Jun 14, 2024
5655a49
remove smaller_tp_proposer_worker
wooyeonlee0 Jun 14, 2024
eabc16a
ruff
wooyeonlee0 Jun 14, 2024
f748edf
remove ranks arg
wooyeonlee0 Jun 17, 2024
c099c94
Merge remote-tracking branch 'origin' into spec-tp1-draft
wooyeonlee0 Jun 17, 2024
4b74a45
undo
wooyeonlee0 Jun 17, 2024
c9786ad
add dist test
wooyeonlee0 Jun 17, 2024
a42664a
nit
wooyeonlee0 Jun 17, 2024
ac7701a
fix
wooyeonlee0 Jun 17, 2024
eea6a7e
test fix
wooyeonlee0 Jun 17, 2024
a648f5d
yapf fix
wooyeonlee0 Jun 17, 2024
f23ba8c
update comment
wooyeonlee0 Jun 17, 2024
aa9af93
require 2 gpus
wooyeonlee0 Jun 17, 2024
56c8927
restore draft_ranks arg in MultiStepWorker.__init__
wooyeonlee0 Jun 18, 2024
385b4f8
comment
wooyeonlee0 Jun 18, 2024
43f37eb
ruff mypy
wooyeonlee0 Jun 18, 2024
99350e2
isort
wooyeonlee0 Jun 18, 2024
a9f3e23
yapf
wooyeonlee0 Jun 18, 2024
6ba250d
allow None for draft_ranks
wooyeonlee0 Jun 18, 2024
3e78613
spec-tp arg in benchmark_latency
wooyeonlee0 Jun 18, 2024
6532af7
yapf
wooyeonlee0 Jun 18, 2024
6839797
yapf
wooyeonlee0 Jun 18, 2024
aac586b
Merge remote-tracking branch 'origin' into spec-tp1-draft
wooyeonlee0 Jun 19, 2024
98e584d
remove is_dummy check from sampler_output
wooyeonlee0 Jun 19, 2024
2d5e64d
add comment
wooyeonlee0 Jun 20, 2024
ba88bd4
yapf
wooyeonlee0 Jun 20, 2024
46e5274
resolve cade comments
wooyeonlee0 Jun 21, 2024
85f4f25
refactoring patch_tp_group
wooyeonlee0 Jun 21, 2024
c1b5373
cleanup patch_tp_group logic
wooyeonlee0 Jun 21, 2024
4a58617
speculative_draft_tensor_parallel_size
wooyeonlee0 Jun 21, 2024
b09e7be
ruff, yapf
wooyeonlee0 Jun 21, 2024
7168d78
remove world group patch
wooyeonlee0 Jun 21, 2024
fe0bd5b
isort, yapf
wooyeonlee0 Jun 21, 2024
2e0d170
yield fix
wooyeonlee0 Jun 21, 2024
36f8aa5
debugging
wooyeonlee0 Jun 21, 2024
54bf514
log
wooyeonlee0 Jun 21, 2024
bfd7d2f
reintroduce smaller_tp_proposer_worker
wooyeonlee0 Jun 21, 2024
f337428
add lora methods
wooyeonlee0 Jun 21, 2024
4654b9f
missing method
wooyeonlee0 Jun 21, 2024
e39926e
remove world group related logics
wooyeonlee0 Jun 21, 2024
1c6eefd
Always wrapping MultiStepWorker
wooyeonlee0 Jun 21, 2024
f2d2ee5
remove unused logger
wooyeonlee0 Jun 21, 2024
302955c
isort. minor rename
wooyeonlee0 Jun 21, 2024
3d4754e
LoraNotSupported. return type
wooyeonlee0 Jun 21, 2024
620b224
yapf, ruff
wooyeonlee0 Jun 21, 2024
b245d3c
add skip_spec_test
wooyeonlee0 Jun 21, 2024
1e71e98
remove spec-tp 3 case
wooyeonlee0 Jun 21, 2024
a01c00d
spec-draft-tp
wooyeonlee0 Jun 21, 2024
debffc2
_TP_STATE_PATCHED
wooyeonlee0 Jun 24, 2024
39fe67f
remove stale comment
wooyeonlee0 Jun 24, 2024
af1b0be
dist_tp2, dist_tp4 tests
wooyeonlee0 Jun 24, 2024
834c6e0
remove unnecessary overriding methods
wooyeonlee0 Jun 24, 2024
5bc2bc3
comment
wooyeonlee0 Jun 24, 2024
8740369
yapf
wooyeonlee0 Jun 24, 2024
4d82ca1
comment
wooyeonlee0 Jun 24, 2024
7bf831c
undo change in test utils
wooyeonlee0 Jun 24, 2024
3fccc76
remove test_skip_speculation
wooyeonlee0 Jun 24, 2024
e8d0e93
tp4 test only for spec_tp1
wooyeonlee0 Jun 25, 2024
91c2e43
allow only value 1 for spec_tp
wooyeonlee0 Jun 25, 2024
fac7e68
yapf
wooyeonlee0 Jun 25, 2024
271822e
add todo comment
wooyeonlee0 Jun 25, 2024
ae0d7f1
add tests for check that test_skip fails even there's no spec_draft_t…
wooyeonlee0 Jun 25, 2024
b84a070
remove test_skip_speculation from dist tests
wooyeonlee0 Jun 25, 2024
86fda24
yapf
wooyeonlee0 Jun 25, 2024
46 changes: 26 additions & 20 deletions benchmarks/benchmark_latency.py
@@ -20,26 +20,28 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend)
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_tensor_parallel_size=args.speculative_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend)

sampling_params = SamplingParams(
n=args.n,
@@ -122,6 +124,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-tensor-parallel-size',
'-spec-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
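For reference, a minimal sketch of how the new flag added above is meant to be exercised through the Python API. The model names and sizes are illustrative, not the benchmark defaults, and the keyword is shown as named at this revision (later commits rename it to speculative_draft_tensor_parallel_size / -spec-draft-tp).

from vllm import LLM, SamplingParams

# Sketch: target model sharded across 4 GPUs, draft model kept on a single GPU.
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",        # illustrative target model
    speculative_model="JackFram/llama-68m",   # small draft model, as in the tests below
    num_speculative_tokens=5,
    tensor_parallel_size=4,
    speculative_tensor_parallel_size=1,
    use_v2_block_manager=True,                # required for speculative decoding
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)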
46 changes: 46 additions & 0 deletions tests/spec_decode/e2e/test_integration_dist.py
@@ -63,3 +63,49 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
batch_size,
max_output_len=output_len,
force_output_len=True)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-68m",

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# Required for spec decode.
"use_v2_block_manager": True,
"tensor_parallel_size": 2,

# Use AsyncLLM engine, so that the engine runs in its own process.
# Otherwise, since vLLM does not follow true SPMD, the test runner
# process will have both the engine and the rank0 worker. NCCL is not
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_tensor_parallel_size": 1,
},
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model(test_llm_generator,
Collaborator:
Can we have a test where we disable some speculation? This will verify that the control-flow logic behaves correctly even when draft TP == 1 or draft TP == 2.

See this test for example:

def test_skip_speculation(baseline_llm_generator, test_llm_generator,
                          batch_size: int, output_len: int):
    """Verify greedy equality when some (or all) sequences skip speculation.
    We do this by setting the max model len of the draft model to an
    artificially low value, such that when the sequences grow beyond it, they
    are skipped in speculative decoding.
    """
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)

baseline_llm_generator,
batch_size: int):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_greedy_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=32,
force_output_len=True)
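One way the requested skip-speculation case could be combined with a smaller draft TP is sketched below. It assumes the same common_llm_kwargs / baseline_llm_kwargs parametrizations as the test above and uses speculative_max_model_len to force skipping, as in the quoted test; the test name, kwargs, and values are illustrative, not part of this PR.

@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "speculative_tensor_parallel_size": 1,

        # Artificially low draft max model len, so sequences outgrow it and
        # skip speculation, exercising the control flow with draft TP < target TP.
        "speculative_max_model_len": 32,
    },
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [64])
@pytest.mark.parametrize("seed", [1])
def test_skip_speculation_with_smaller_draft_tp(baseline_llm_generator,
                                                test_llm_generator,
                                                batch_size: int,
                                                output_len: int):
    """Sketch: greedy equality when some sequences skip speculation while the
    draft model runs with a smaller tensor-parallel size than the target."""
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)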
32 changes: 20 additions & 12 deletions tests/spec_decode/utils.py
@@ -12,6 +12,7 @@
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
SamplerOutput, SequenceData, SequenceGroupMetadata,
SequenceOutput)
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.worker import Worker
@@ -66,6 +67,7 @@ def create_worker(cls: Callable[..., T],
num_gpu_blocks: int,
seed: int,
is_driver_worker: bool = True,
draft_ranks: Optional[List[int]] = None,
enforce_eager: bool = True) -> T:
engine_args = EngineArgs(
model=model_name,
@@ -78,18 +80,24 @@
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())

worker = cls(
model_config=engine_config.model_config,
parallel_config=engine_config.parallel_config,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
cache_config=engine_config.cache_config,
load_config=engine_config.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
is_driver_worker=is_driver_worker,
)
worker_kwargs = {
'model_config': engine_config.model_config,
'parallel_config': engine_config.parallel_config,
'scheduler_config': engine_config.scheduler_config,
'device_config': engine_config.device_config,
'cache_config': engine_config.cache_config,
'load_config': engine_config.load_config,
'local_rank': 0,
'rank': 0,
'distributed_init_method': distributed_init_method,
'is_driver_worker': is_driver_worker,
}

if draft_ranks is not None:
assert cls is MultiStepWorker, "draft_ranks arg is for MultiStepWorker"
worker_kwargs['draft_ranks'] = draft_ranks

worker = cls(**worker_kwargs)

worker.init_device()
worker.load_model()
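For reference, a rough sketch of how a spec-decode test might call the updated helper with the new parameter. The values are illustrative, and it assumes model_name and block_size are among the earlier parameters of create_worker that are not shown in this hunk.

from vllm.spec_decode.multi_step_worker import MultiStepWorker

# Sketch: build a MultiStepWorker whose draft model is restricted to rank 0,
# while the target model keeps using the full tensor-parallel group.
multi_step_worker = create_worker(
    MultiStepWorker,
    model_name="JackFram/llama-68m",
    block_size=16,
    num_gpu_blocks=256,
    seed=0,
    draft_ranks=[0],   # only rank 0 participates in draft-model execution
)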
22 changes: 17 additions & 5 deletions vllm/config.py
@@ -788,6 +788,7 @@ def maybe_create_spec_config(
target_parallel_config: ParallelConfig,
target_dtype: str,
speculative_model: Optional[str],
speculative_tensor_parallel_size: Optional[int],
num_speculative_tokens: Optional[int],
speculative_max_model_len: Optional[int],
enable_chunked_prefill: bool,
@@ -914,7 +915,7 @@

draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
target_parallel_config))
target_parallel_config, speculative_tensor_parallel_size))

return SpeculativeConfig(
draft_model_config,
@@ -962,16 +963,27 @@ def _maybe_override_draft_max_model_len(

@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig) -> ParallelConfig:
target_parallel_config: ParallelConfig,
speculative_tensor_parallel_size: Optional[int]) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.

This is mostly a copy of the target parallel config. In the future the
draft worker can have a different parallel strategy, e.g. TP=1.
This is mostly a copy of the target parallel config, except the tp_size.
"""

speculative_tensor_parallel_size = (
speculative_tensor_parallel_size
Collaborator:
nit: if the user provides --speculative-tensor-parallel-size 0, this branch causes unexpected behavior. Can we explicitly guard against this?

wooyeonlee0 (Contributor, Author) on Jun 21, 2024:
Thanks for the catch!
To prevent spec_tp from being set to target_tp when the given spec_tp is 0, I've changed the code as below:

        if speculative_tensor_parallel_size is None:
            speculative_tensor_parallel_size = target_parallel_config.tensor_parallel_size

In addition, to prevent the tp value from being 0, I think we need a separate PR that handles that case by adding a check in ParallelConfig._verify_args(), since the same issue exists for the --tensor-parallel-size 0 case.

What do you think?

Collaborator:
I'll look at your new changes; no need to completely fix this (just a nit).

or target_parallel_config.tensor_parallel_size)

if speculative_tensor_parallel_size > \
target_parallel_config.tensor_parallel_size:
raise ValueError(
f"{speculative_tensor_parallel_size=} cannot be "
f"larger than {target_parallel_config.tensor_parallel_size}")

draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.
pipeline_parallel_size,
tensor_parallel_size=target_parallel_config.tensor_parallel_size,
tensor_parallel_size=speculative_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.
distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.
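The net effect of the guard discussed in the thread above is roughly the following. This is a hypothetical standalone helper for illustration only, not an actual vLLM function; note that later commits ("allow only value 1 for spec_tp") additionally restrict the draft TP to 1.

def resolve_draft_tp(speculative_tensor_parallel_size, target_tp_size):
    # Hypothetical helper illustrating the guard discussed above.
    if speculative_tensor_parallel_size is None:
        # Explicit is-None check so a value of 0 is not silently replaced
        # by the target TP size (it should instead fail validation).
        speculative_tensor_parallel_size = target_tp_size
    if speculative_tensor_parallel_size > target_tp_size:
        raise ValueError(
            f"speculative_tensor_parallel_size="
            f"{speculative_tensor_parallel_size} cannot be larger than "
            f"target tensor_parallel_size={target_tp_size}")
    return speculative_tensor_parallel_size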
30 changes: 29 additions & 1 deletion vllm/distributed/parallel_state.py
@@ -551,6 +551,10 @@ def init_distributed_environment(
global _WORLD
if _WORLD is None:
ranks = list(range(torch.distributed.get_world_size()))
if world_size != -1:
assert world_size == len(ranks), (
"given world_size does not match with world_size of torch")

_WORLD = GroupCoordinator(
group_ranks=[ranks],
local_rank=local_rank,
@@ -559,7 +563,7 @@
use_custom_allreduce=False,
)
else:
assert _WORLD.world_size == torch.distributed.get_world_size(), (
assert _WORLD.world_size == world_size, (
"world group already initialized with a different world size")


@@ -674,6 +678,30 @@ def model_parallel_is_initialized():
return (_TP is not None and _PP is not None)


OVERRIDE_TP_STATE = False


@contextlib.contextmanager
def patch_tensor_parallel_group(world_group, tp_group):
Contributor:
Will this global variable patching potentially create problems? For example, is it possible that other workers will use this context unknowingly?

wooyeonlee0 (Contributor, Author) on Jun 14, 2024:
In the current design of speculative decoding, draft and target workers execute sequentially, so there is no chance of target workers using the patched/overridden context.

But if draft and target workers execute concurrently in the future, the code should be redesigned to prevent their states from being mixed with each other.

"""Patch the tp group temporarily until this function ends."""
global OVERRIDE_TP_STATE
if not OVERRIDE_TP_STATE and world_group and tp_group:
OVERRIDE_TP_STATE = True
old_world_group = get_world_group()
zifeitong (Contributor) on Jun 18, 2024:
How about only overriding tp_group and not world_group here? Will that work?

I saw that get_world_group() is only used once in the codebase (very early in the initialization stage).

wooyeonlee0 (Contributor, Author) on Jun 19, 2024:
Yes, it is used only during initialization.
If we do not override world_group, initialization will fail due to an assertion check (link).
That check verifies world_group size consistency between workers when spawning multiple workers in the same process (or Ray worker).
In our case, it asserts that the current world_group has the same size as the world_group being initialized.

Note that this check was added by #5293 and is slightly modified in this PR to support the smaller draft-tp case.
I'm not sure in which scenarios this check matters, but I thought it would be safer to keep it.
What do you think?

It's a slightly different story, but when I opened this PR, things were a little different: get_world_group() was also used after initialization by broadcast_tensor_dict(), which the driver uses to control workers.
That use case has gone away after refactoring in other PRs.

Contributor:
Thanks for the explanation.

How about adding a comment about world_group, since this function is named patch_tensor_parallel_group? Or maybe rename the function to patch_distributed_group?

wooyeonlee0 (Contributor, Author):
Good suggestion :)
I've added a comment in the docstring of patch_tensor_parallel_group().

old_tp_group = get_tp_group()
global _WORLD, _TP
_WORLD = world_group
_TP = tp_group
try:
yield
finally:
# restore the original state
if OVERRIDE_TP_STATE:
OVERRIDE_TP_STATE = False
_WORLD = old_world_group
_TP = old_tp_group


def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
return get_tp_group().world_size
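To illustrate how patch_tensor_parallel_group() is meant to be used, here is a rough sketch of a proposer wrapper along the lines of the smaller_tp_proposer_worker mentioned in the commit history. The class and method names, and the way the draft groups are passed in, are illustrative assumptions rather than the merged implementation.

from vllm.distributed.parallel_state import patch_tensor_parallel_group

class SmallerTpProposerSketch:
    """Illustrative wrapper that runs the draft worker inside a patched TP group."""

    def __init__(self, draft_worker, draft_world_group, draft_tp_group,
                 is_dummy: bool):
        self._worker = draft_worker
        self._world_group = draft_world_group   # group spanning only the draft ranks
        self._tp_group = draft_tp_group
        self._is_dummy = is_dummy               # True on ranks holding no draft-model shard

    def get_spec_proposals(self, *args, **kwargs):
        if self._is_dummy:
            # Ranks outside the draft TP group skip draft execution entirely.
            return None
        # Inside the block, collectives issued by the draft model use the
        # smaller TP group; the original groups are restored on exit.
        with patch_tensor_parallel_group(self._world_group, self._tp_group):
            return self._worker.get_spec_proposals(*args, **kwargs)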
10 changes: 10 additions & 0 deletions vllm/engine/arg_utils.py
@@ -93,6 +93,7 @@ class EngineArgs:
guided_decoding_backend: str = 'outlines'
# Speculative decoding configuration.
speculative_model: Optional[str] = None
speculative_tensor_parallel_size: Optional[int] = None
num_speculative_tokens: Optional[int] = None
speculative_max_model_len: Optional[int] = None
speculative_disable_by_batch_size: Optional[int] = None
@@ -534,6 +535,13 @@ def add_cli_args(
default=EngineArgs.num_speculative_tokens,
help='The number of speculative tokens to sample from '
'the draft model in speculative decoding.')
parser.add_argument(
'--speculative-tensor-parallel-size',
'-spec-tp',
type=int,
default=EngineArgs.speculative_tensor_parallel_size,
help='Number of tensor parallel replicas for '
'the draft model in speculative decoding.')

parser.add_argument(
'--speculative-max-model-len',
@@ -676,6 +684,8 @@ def create_engine_config(self, ) -> EngineConfig:
target_parallel_config=parallel_config,
target_dtype=self.dtype,
speculative_model=self.speculative_model,
speculative_tensor_parallel_size = \
self.speculative_tensor_parallel_size,
num_speculative_tokens=self.num_speculative_tokens,
speculative_disable_by_batch_size=self.
speculative_disable_by_batch_size,