[Feature][Rocm] add quick all reduce for rocm #19744
Merged: vllm-bot merged 44 commits from lihaoyang-amd:lhy/add_quick_all_reduce into vllm-project:main on Jun 27, 2025.
Commits (44):
- a3a7de7 Add quickreduce as alternative to custom allreduce
- ad8ec75 WIP
- f4c03ad Add bf16 support
- 00c6d18 WIP
- b537c0d Refactor QuickReduce
- 1bff2fa Cleanup
- 69f04dc Remove config param. Add faster low latency OneShot algo
- 7effd4d Some fixes
- 6970671 fix bfloat16 recv (lihaoyang-amd)
- 343b272 fix env (lihaoyang-amd)
- ecd85a0 fix log info (lihaoyang-amd)
- bd921a5 for env (lihaoyang-amd)
- f21d4ca Add int8 quantization. Remove changes to custom_allreduce
- 6a0d8b0 Update after review comments
- a2f2922 add Q6 support (lihaoyang-amd)
- 87949fa Adjusted to static constexpr int (lihaoyang-amd)
- 8eb9e62 Remove useless functions (lihaoyang-amd)
- 0425ac5 fix max size err (lihaoyang-amd)
- 20fc13b adjust for comments (lihaoyang-amd)
- 982400b integrate_qr2cr (lihaoyang-amd)
- af265c1 fix message size (lihaoyang-amd)
- ff506e1 Fix fp 2GB bug
- 796be62 adjust condition (lihaoyang-amd)
- f524aad fix vll_config (lihaoyang-amd)
- 41907b1 change comment (lihaoyang-amd)
- 776030b Update test. Disable QR by default. Set fp16 ovfl flag.
- db3f1d3 Fix CodecQ4
- deb72c6 Update min sizes
- ab99dfd fix Q4 (lihaoyang-amd)
- 0bf6342 move bf2fp to cpp (lihaoyang-amd)
- ce2b715 fix compile err (lihaoyang-amd)
- 25f8e40 fix qr for cuda (lihaoyang-amd)
- 210358d fix f-string (lihaoyang-amd)
- 0bada3c adjust test case for quick allreduce (lihaoyang-amd)
- 2173c38 del TODO and rebase (lihaoyang-amd)
- a2dd7bd Optimized format (lihaoyang-amd)
- 816cf2d add test for multi modes (lihaoyang-amd)
- 876dbec for fmt (lihaoyang-amd)
- 42a0bdb Adjustable max_size (lihaoyang-amd)
- e40a61d go back to splitting (lihaoyang-amd)
- a02b2ef change default of max_size to None (lihaoyang-amd)
- 03f6163 adjust name of var (lihaoyang-amd)
- 2b52580 restore custom allreduce (lihaoyang-amd)
- a5d7963 check rocm for qr (lihaoyang-amd)
New file (114 lines):

```cpp
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/all.h>

#ifdef USE_ROCM

  #include "quickreduce/quick_reduce.h"

quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size,
                                   std::optional<int64_t> qr_max_size) {
  if (world_size > 8)
    throw std::invalid_argument("world size > 8 is not supported");
  if (world_size == 6)
    throw std::invalid_argument("world size == 6 is not supported");
  if (world_size % 2 != 0)
    throw std::invalid_argument("Odd num gpus is not supported for now");
  if (rank < 0 || rank >= world_size)
    throw std::invalid_argument("invalid rank passed in");
  quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
  fptr->init(world_size, rank, qr_max_size);
  return (quickreduce::fptr_t)fptr;
}

void qr_destroy(quickreduce::fptr_t _fa) {
  if (_fa) {
    auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
    fa->destroy();
    delete fa;
  }
}

torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  hipIpcMemHandle_t handle = fa->get_handle();
  auto options =
      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
  auto data_handle =
      torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
  std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
  return data_handle;
}

void qr_open_handles(quickreduce::fptr_t _fa,
                     const std::vector<torch::Tensor>& handles) {
  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  std::vector<hipIpcMemHandle_t> ipc_handles;
  ipc_handles.reserve(handles.size());
  for (auto& handle : handles) {
    // Ensure the tensor is on the same device as the current device.
    hipIpcMemHandle_t ipc_handle;
    std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
    ipc_handles.push_back(ipc_handle);
  }
  fa->open_ipc_handles(ipc_handles);
}

void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp,
                   torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();

  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
  if (out.scalar_type() == at::ScalarType::Half) {
    fa->allreduce<half, false>(reinterpret_cast<half*>(inp.data_ptr()),
                               reinterpret_cast<half*>(out.data_ptr()),
                               out.numel(), quant_level, stream);
  } else if (out.scalar_type() == at::ScalarType::BFloat16) {
    if (cast_bf2half) {
      fa->allreduce<half, true>(reinterpret_cast<half*>(inp.data_ptr()),
                                reinterpret_cast<half*>(out.data_ptr()),
                                out.numel(), quant_level, stream);
    } else {
      fa->allreduce<quickreduce::nv_bfloat16, false>(
          reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
          reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
          out.numel(), quant_level, stream);
    }
  } else {
    throw std::runtime_error(
        "quick allreduce only supports float16 and bfloat16");
  }
}

int64_t qr_max_size() {
  // The default is 2GB (2,147,483,648 bytes)
  return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
}

#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half)                    \
  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>,              \
                                                cast_bf2half>;               \
  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>,              \
                                                cast_bf2half>;               \
  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;

INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)

INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)

#endif  // USE_ROCM
```
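For orientation, here is a minimal per-rank sketch of how these entry points chain together: init, IPC-handle exchange, allreduce, destroy. This is not code from the PR; it assumes it sits in the same translation unit as the file above, `exchange_handles` is a hypothetical placeholder for whatever transport the caller uses to ship the handle tensors between ranks, and passing `quant_level = 0` for full precision is an assumption (the real codec-to-level mapping lives in quickreduce/quick_reduce.h).

```cpp
// Hypothetical per-rank driver; a sketch of the C++ contract, not part of the PR.
#include <optional>
#include <vector>
#include <torch/all.h>

// Assumed to be provided by the launcher: gathers every rank's handle tensor.
std::vector<torch::Tensor> exchange_handles(const torch::Tensor& mine);

void run_quick_allreduce(int64_t rank, int64_t world_size) {
  // One communicator per rank; std::nullopt keeps the default 2 GiB buffer cap
  // reported by qr_max_size().
  quickreduce::fptr_t fa = init_custom_qr(rank, world_size, std::nullopt);

  // Publish this rank's hipIpcMemHandle_t (a CPU uint8 tensor) and open every
  // peer's handle so ranks can read each other's buffers directly.
  torch::Tensor my_handle = qr_get_handle(fa);
  qr_open_handles(fa, exchange_handles(my_handle));

  // fp16 allreduce; quant_level selects among the FP/Q4/Q6/Q8 codecs
  // instantiated above (0 = full precision is assumed here).
  auto opts =
      torch::TensorOptions().dtype(torch::kHalf).device(torch::kCUDA, rank);
  torch::Tensor inp = torch::ones({1 << 20}, opts);
  torch::Tensor out = torch::empty_like(inp);
  qr_all_reduce(fa, inp, out, /*quant_level=*/0, /*cast_bf2half=*/false);

  qr_destroy(fa);
}
```

In the PR itself this sequencing is driven from vLLM's Python side through the custom-op bindings; the sketch only makes the per-rank call order explicit.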