From ab5f032c5b34188985a17e54fb4d343b83b6452d Mon Sep 17 00:00:00 2001
From: zejunchen-zejun
Date: Mon, 3 Nov 2025 19:11:12 +0800
Subject: [PATCH] [WA][ROCm][DP] add a short-term WA for the HIP graph crash
 when capturing an unsupported runtime op under the DP+EP scenario

Signed-off-by: zejunchen-zejun
---
 .../launch_deepseekr1_fp4_DP_EP.sh            | 46 +++++++++++++++++++
 ...kr1_fp4.sh => launch_deepseekr1_fp4_TP.sh} |  0
 vllm/v1/worker/dp_utils.py                    | 16 ++++++-
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh
 rename evaluation/deepseek_fp4/{launch_deepseekr1_fp4.sh => launch_deepseekr1_fp4_TP.sh} (100%)

diff --git a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh
new file mode 100644
index 000000000000..6e700689a1cd
--- /dev/null
+++ b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh
@@ -0,0 +1,46 @@
+export VLLM_USE_V1=1
+export VLLM_USE_TRITON_FLASH_ATTN=0
+# export VLLM_LOGGING_LEVEL=DEBUG
+export VLLM_RPC_TIMEOUT=1800000
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_USE_AITER_MLA=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_TRITON_ROPE=1 # added for accuracy
+export VLLM_DISABLE_COMPILE_CACHE=1
+# FIXME: disable the fp4 asm gemm for now because of a runtime issue
+export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=0
+#export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # disabled for now
+
+export TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1
+export TRITON_HIP_USE_ASYNC_COPY=1
+export TRITON_HIP_USE_BLOCK_PINGPONG=1
+export TRITON_HIP_ASYNC_FAST_SWIZZLE=1
+export NCCL_DEBUG=WARN
+export AMDGCN_USE_BUFFER_OPS=1
+export SAFETENSORS_FAST_GPU=1
+
+# for profiling
+#export VLLM_TORCH_PROFILER_DIR="deepseek_in3k_out1k"
+#export VLLM_TORCH_PROFILER_WITH_STACK=1
+#export VLLM_TORCH_PROFILER_RECORD_SHAPES=1
+
+model_path=/data/pretrained-models/amd/DeepSeek-R1-MXFP4-Preview
+echo "running $model_path"
+
+vllm serve $model_path \
+    --host localhost \
+    --port 9000 \
+    --data-parallel-size 8 \
+    --enable-expert-parallel \
+    --max-num-batched-tokens 32768 \
+    --trust-remote-code \
+    --no-enable-prefix-caching \
+    --disable-log-requests \
+    --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
+    --gpu_memory_utilization 0.9 \
+    --block-size 1 \
+    --seed 123 2>&1 | tee log.server.log &
+
+    # --enforce-eager \
+    # --data-parallel-hybrid-lb \
diff --git a/evaluation/deepseek_fp4/launch_deepseekr1_fp4.sh b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh
similarity index 100%
rename from evaluation/deepseek_fp4/launch_deepseekr1_fp4.sh
rename to evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 3f24ff0a09de..466a51733cf5 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -49,7 +49,21 @@ def _run_ar(
     tensor[1][dp_rank] = padded_num_tokens_per_ubatch
     tensor[2][dp_rank] = 1 if should_ubatch else 0
     tensor[3][dp_rank] = 1 if should_dp_pad else 0
-    dist.all_reduce(tensor, group=group)
+
+    # FIXME: use the custom device-communicator all_reduce for the ROCm
+    # DP scenario: dist.all_reduce (torch.distributed) hits a HIP graph
+    # capture issue that needs further investigation.
+    # Tracking issue: https://github.com/ROCm/hip/issues/3876
+    dp_group = get_dp_group()
+    if (
+        current_platform.is_rocm()
+        and dp_group.device_communicator is not None
+        and device != "cpu"
+    ):
+        tensor = dp_group.device_communicator.all_reduce(tensor)
+    else:
+        dist.all_reduce(tensor, group=group)
+
     return tensor