Commit f484fd3
fix: negative zero by type trait --> binary value (#1136)
## 📌 Description

Fix `has_neg_zero`: detect negative zero by its binary (bit) representation rather than by floating-point comparison against the `neg_zero` type-trait value, since IEEE-754 `==` treats `-0.0` and `+0.0` as equal.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
1 parent 2f01a9a commit f484fd3
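For context, a minimal host-side C++ sketch (not part of the patch; the helper name `is_negative_zero_bits` is hypothetical) of why equality against a negative-zero constant cannot work: IEEE-754 defines `-0.0f == +0.0f` as true, so only the bit pattern tells the two zeros apart.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical host-side helper mirroring the idea of the patch:
// reinterpret the float's bits and compare against the pattern with
// only the sign bit set (0x80000000).
static bool is_negative_zero_bits(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return bits == 0x80000000u;
}

int main() {
  std::printf("%d\n", -0.0f == 0.0f);                 // 1: '==' cannot tell the zeros apart
  std::printf("%d\n", is_negative_zero_bits(-0.0f));  // 1
  std::printf("%d\n", is_negative_zero_bits(0.0f));   // 0
}
```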

2 files changed: 42 additions and 43 deletions
include/flashinfer/comm/trtllm_allreduce.cuh

Lines changed: 32 additions & 5 deletions
```diff
@@ -188,11 +188,40 @@ struct neg_zero<nv_bfloat16> {
 template <typename T>
 __device__ static constexpr T neg_zero_v = neg_zero<T>::value;
 
+template <typename T>
+__device__ bool is_negative_zero(T) {
+  return false;
+}
+
+// float specialization
+template <>
+__device__ bool is_negative_zero<float>(float x) {
+  return (__float_as_int(x) == 0x80000000);
+}
+
+// double specialization
+template <>
+__device__ bool is_negative_zero<double>(double x) {
+  return (__double_as_longlong(x) == 0x8000000000000000ULL);
+}
+
+// __half specialization
+template <>
+__device__ bool is_negative_zero<__half>(__half x) {
+  return (__half_as_ushort(x) == 0x8000);
+}
+
+// __nv_bfloat16 specialization
+template <>
+__device__ bool is_negative_zero<__nv_bfloat16>(__nv_bfloat16 x) {
+  return (__bfloat16_as_ushort(x) == 0x8000);
+}
+
 template <typename T, uint32_t VEC_SIZE>
 __device__ __forceinline__ bool has_neg_zero(const vec_t<T, VEC_SIZE>& vec) {
 #pragma unroll
   for (int i = 0; i < VEC_SIZE; ++i) {
-    if (vec[i] == neg_zero_v<T>) {
+    if (is_negative_zero(vec[i])) {
       return true;
     }
   }
@@ -203,7 +232,7 @@ template <typename T, uint32_t VEC_SIZE>
 __device__ __forceinline__ void remove_neg_zero(vec_t<T, VEC_SIZE>& vec) {
 #pragma unroll
   for (int i = 0; i < VEC_SIZE; ++i) {
-    vec[i] = (vec[i] == neg_zero_v<T>) ? static_cast<T>(0.f) : vec[i];
+    vec[i] = (is_negative_zero(vec[i])) ? static_cast<T>(0.f) : vec[i];
   }
 }
```
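The two call sites were affected differently by the old `==` comparison: in `remove_neg_zero` it merely rewrote `+0` to `+0` (a harmless no-op), but in `has_neg_zero` it reported a false positive for any vector containing an ordinary `+0`, since IEEE-754 equality cannot distinguish the two zeros. The bit-pattern check fixes both call sites consistently.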
```diff
@@ -1694,10 +1723,8 @@ cudaError_t lamportInitializeAll(void* buffer_0, void* buffer_1, void* buffer_2,
   status = lamportInitialize<T>(buffer_2, size / sizeof(T), stream);
   FLASHINFER_CHECK(status == cudaSuccess, "lamportInitialize failed with error code " +
                                               std::string(cudaGetErrorString(status)));
-
+  cudaDeviceSynchronize();
   return cudaSuccess;
-  // todo(zihao): we can skip sycn with stream as below?
-  // cudaDeviceSynchronize();
 }
 
 }  // namespace trtllm_allreduce
```
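The resolved TODO raised the question of a stream-scoped wait. If that route were ever taken, a hypothetical variant (an assumption, not part of this patch) could block on `stream` alone inside `lamportInitializeAll`, in place of the device-wide call above:

```cpp
// Hypothetical stream-scoped alternative to cudaDeviceSynchronize();
// it waits only for work queued on `stream` to complete.
cudaError_t sync_status = cudaStreamSynchronize(stream);
FLASHINFER_CHECK(sync_status == cudaSuccess,
                 "cudaStreamSynchronize failed with error code " +
                     std::string(cudaGetErrorString(sync_status)));
```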

tests/test_trtllm_allreduce.py

Lines changed: 10 additions & 38 deletions
```diff
@@ -50,7 +50,7 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
 
     # below are the recommended hidden sizes for custom all-reduce in trtllm test
     # hidden_size should be in range [256, 8192], and maxHiddenSize should be 8192
-    hidden_sizes = [1024, 2048, 4096]
+    hidden_sizes = [1024, 4096]
     config_codes = [
         0,
         comm.AllReduceStrategyConfig.USE_MEMCPY,
@@ -79,7 +79,7 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
                 group=group,
             )
 
-            test_loop = 1  # could be any number
+            test_loop = 2  # could be any number
 
             # NOTE: the barrier flag should be initialized to 1, and incremented by 1 for each AR
             flag_value = 1
@@ -165,20 +165,12 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
                             )
                             dist.all_reduce(inp1_ref, group=group)
 
-                            tolerance = 1e-2 if dtype == torch.float16 else 5e-2
+                            tolerance = 1e-2 if dtype == torch.float16 else 8e-2
 
                             if fusion_op_code == comm.AllReduceFusionOp.NONE:
-                                if not torch.allclose(
+                                torch.testing.assert_close(
                                     out1, inp1_ref, atol=tolerance, rtol=3e-2
-                                ):
-                                    print(
-                                        f"test RANK {rank}: {world_size}-{dtype}-{strategy_code}-{config_code}-{fusion_op_code}-{launch_with_pdl}-{hidden_size} failed"
-                                    )
-                                    print(f"out1: {out1}")
-                                    print(f"inp1_ref: {inp1_ref}")
-                                    print(f"tolerance: {tolerance}")
-                                    print(f"rtol: {3e-2}")
-                                    pass_flag = False
+                                )
                             elif (
                                 fusion_op_code
                                 == comm.AllReduceFusionOp.RESIDUAL_RMS_NORM
@@ -198,21 +190,12 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
                                         + bias_float[i % hidden_size]
                                     )
                                 ref_half = ref_float.to(dtype)
-
-                                if not torch.allclose(
+                                torch.testing.assert_close(
                                     inter_buffer,
                                     ref_half,
                                     atol=tolerance,
                                     rtol=3e-2,
-                                ):
-                                    print(
-                                        f"test RANK {rank}: {world_size}-{dtype}-{strategy_code}-{config_code}-{fusion_op_code}-{launch_with_pdl}-{hidden_size} failed"
-                                    )
-                                    print(f"inter_buffer: {inter_buffer}")
-                                    print(f"ref_half: {ref_half}")
-                                    print(f"tolerance: {tolerance}")
-                                    print(f"rtol: {3e-2}")
-                                    pass_flag = False
+                                )
 
                                 # RMSNorm over hidden size
                                 ref_float = ref_float.view(
@@ -229,23 +212,12 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
                                     torch.float32
                                 )
                                 normed_half = normed_float.to(dtype)
-
-                                if not torch.allclose(
+                                torch.testing.assert_close(
                                     out1,
                                     normed_half.view(-1),
                                     atol=tolerance,
                                     rtol=3e-2,
-                                ):
-                                    print(
-                                        f"test RANK {rank}: {world_size}-{dtype}-{strategy_code}-{config_code}-{fusion_op_code}-{launch_with_pdl}-{hidden_size} failed"
-                                    )
-                                    print(f"out1: {out1}")
-                                    print(
-                                        f"normed_half.view(-1): {normed_half.view(-1)}"
-                                    )
-                                    print(f"tolerance: {tolerance}")
-                                    print(f"rtol: {3e-2}")
-                                    pass_flag = False
+                                )
 
                             elif (
                                 fusion_op_code
@@ -259,7 +231,7 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
             print(
                 f"test RANK {rank}: {world_size}-{dtype}-{strategy_code}-{config_code}-{fusion_op_code}-{launch_with_pdl}-{hidden_size} passed"
             )
-            # dist.barrier(group=group)
+            # torch.cuda.synchronize()
             # # you might want to enable this barrier for a better log output, but it's not mandatory across allReduce calls
     finally:
         dist.barrier(group=group)
```
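A note on the test change: `torch.testing.assert_close` raises an `AssertionError` whose message already reports the number of mismatched elements and the greatest absolute and relative differences, so the hand-rolled `torch.allclose` branches with their `print` diagnostics and `pass_flag` bookkeeping become redundant.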
