diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 833581035a31..c5f1f8e0e73f 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -346,6 +346,15 @@ def __post_init__(self): or self.model_config.is_encoder_decoder ): self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + + # Decode context parallel (DCP) does not currently support full CUDA graphs. + if self.parallel_config.decode_context_parallel_size > 1: + logger.warning( + "Decode context parallel (DCP) is enabled, which is " + "incompatible with full CUDA graphs. Set " + "cudagraph_mode to PIECEWISE." + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE