46 changes: 46 additions & 0 deletions evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh
@@ -0,0 +1,46 @@
export VLLM_USE_V1=1
export VLLM_USE_TRITON_FLASH_ATTN=0
# export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_RPC_TIMEOUT=1800000
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_USE_AITER_MLA=1
export VLLM_ROCM_USE_AITER_MOE=1
export VLLM_ROCM_USE_TRITON_ROPE=1 # added for accuracy
export VLLM_DISABLE_COMPILE_CACHE=1
# FIXME: disable the FP4 ASM GEMM for now due to a runtime issue
export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=0
#export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # for now disable

export TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1
export TRITON_HIP_USE_ASYNC_COPY=1
export TRITON_HIP_USE_BLOCK_PINGPONG=1
export TRITON_HIP_ASYNC_FAST_SWIZZLE=1
export NCCL_DEBUG=WARN
export AMDGCN_USE_BUFFER_OPS=1
export SAFETENSORS_FAST_GPU=1

# for profiling
#export VLLM_TORCH_PROFILER_DIR="deepseek_in3k_out1k"
#export VLLM_TORCH_PROFILER_WITH_STACK=1
#export VLLM_TORCH_PROFILER_RECORD_SHAPES=1

model_path=/data/pretrained-models/amd/DeepSeek-R1-MXFP4-Preview
echo "running $model_path"

vllm serve $model_path \
--host localhost \
--port 9000 \
--data-parallel-size 8 \
--enable-expert-parallel \
--max-num-batched-tokens 32768 \
--trust-remote-code \
--no-enable-prefix-caching \
--disable-log-requests \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--gpu_memory_utilization 0.9 \
--block-size 1 \
--seed 123 2>&1 | tee log.server.log &

# --enforce-eager \
# --data-parallel-hybrid-lb \
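
Once the server launched above is up, a quick way to sanity-check the deployment is to hit vLLM's OpenAI-compatible completions endpoint. The sketch below is illustrative only: the host/port and model path mirror the launch script (the served model name defaults to the path passed to `vllm serve`), while the prompt and token budget are arbitrary placeholders.

```python
# Minimal smoke test against the server launched above (localhost:9000).
# Assumes the `requests` package is installed; prompt/max_tokens are placeholders.
import requests

BASE_URL = "http://localhost:9000"
MODEL = "/data/pretrained-models/amd/DeepSeek-R1-MXFP4-Preview"  # same as model_path

resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={
        "model": MODEL,
        "prompt": "Explain expert parallelism in one sentence.",
        "max_tokens": 64,
        "temperature": 0.0,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```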
16 changes: 15 additions & 1 deletion vllm/v1/worker/dp_utils.py
@@ -49,7 +49,21 @@ def _run_ar(
tensor[1][dp_rank] = padded_num_tokens_per_ubatch
tensor[2][dp_rank] = 1 if should_ubatch else 0
tensor[3][dp_rank] = 1 if should_dp_pad else 0
dist.all_reduce(tensor, group=group)

# FIXME: Use the custom all-reduce for the ROCm DP scenario.
# When using torch.distributed.all_reduce, there is a HIP graph capture
# issue that needs further investigation.
# Tracking issue: https://github.com/ROCm/hip/issues/3876
dp_group = get_dp_group()
if (
current_platform.is_rocm()
and dp_group.device_communicator is not None
and device != "cpu"
):
tensor = dp_group.device_communicator.all_reduce(tensor)
else:
dist.all_reduce(tensor, group=group)

return tensor
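
The branch added above can be exercised in isolation with plain stand-ins. The sketch below is not vLLM API: `MockCommunicator` and `pick_all_reduce` are hypothetical names used only to illustrate the dispatch between a custom device-communicator all-reduce and the `torch.distributed.all_reduce` fallback.

```python
# Standalone sketch of the dispatch pattern, with hypothetical stand-ins
# (MockCommunicator, pick_all_reduce) -- not part of vLLM.
from typing import Callable, Optional

import torch


class MockCommunicator:
    """Hypothetical stand-in for dp_group.device_communicator."""

    def all_reduce(self, t: torch.Tensor) -> torch.Tensor:
        # A custom all-reduce kernel would run here; the mock returns
        # the tensor unchanged.
        return t


def pick_all_reduce(
    is_rocm: bool,
    device: str,
    communicator: Optional[MockCommunicator],
    fallback: Callable[[torch.Tensor], torch.Tensor],
) -> Callable[[torch.Tensor], torch.Tensor]:
    # Prefer the custom communicator on ROCm GPUs (to sidestep the HIP
    # graph capture issue); otherwise fall back to torch.distributed.
    if is_rocm and communicator is not None and device != "cpu":
        return communicator.all_reduce
    return fallback


if __name__ == "__main__":
    t = torch.zeros(4, 8, dtype=torch.int32)
    reduce_fn = pick_all_reduce(True, "cuda", MockCommunicator(), lambda x: x)
    print(reduce_fn(t).shape)
```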

