Skip to content

Commit db573bc

Browse files
loislo and tensorflower-gardener
authored and committed
[XLA:GPU] Add explicit rounding of the F32 arguments of dot to TF32 if the dot algorithm is set to TF32.
Triton lowers the tf32 dot to an mma instruction that does not have an explicit rounding attribute for tf32 inputs. As a result, the precision of the tf32 dot is even worse than the BF16_BF16_F32 algorithm. Let's round the arguments explicitly when we have this execution sequence. PiperOrigin-RevId: 741459476
1 parent f0c813d commit db573bc

File tree

7 files changed

+242
-0
lines changed

7 files changed

+242
-0
lines changed

third_party/xla/xla/backends/gpu/codegen/triton/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,7 @@ xla_test(
561561
"//xla/service/gpu/tests:gpu_codegen_test",
562562
"//xla/stream_executor:device_description",
563563
"//xla/stream_executor/cuda:cuda_compute_capability",
564+
"//xla/tests:test_utils",
564565
"//xla/tests:xla_internal_test_main", # fixdeps: keep
565566
"//xla/tsl/lib/core:status_test_util",
566567
"//xla/tsl/platform:statusor",
@@ -571,6 +572,7 @@ xla_test(
571572
"@com_google_absl//absl/status:statusor",
572573
"@com_google_absl//absl/strings",
573574
"@com_google_absl//absl/strings:str_format",
575+
"@com_google_absl//absl/types:span",
574576
"@com_google_googletest//:gtest",
575577
"@local_tsl//tsl/platform:path",
576578
],

third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ absl::Status CreateTritonPipeline(mlir::OpPassManager* pm,
5353
const int ccAsInt = cc.major * 10 + cc.minor;
5454
const int threadsPerWarp = 32;
5555

56+
pm->addPass(mt_xla::CreateRoundF32ToTF32ForTf32DotRewritePass());
5657
if (is_xla_fusion) {
5758
pm->addPass(mt_xla::CreateInt4ToPackedInt4RewritePass());
5859
}

third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,16 @@ limitations under the License.
1515

1616
#include <algorithm>
1717
#include <cstddef>
18+
#include <cstdint>
19+
#include <cstdlib>
1820
#include <initializer_list>
21+
#include <iomanip>
22+
#include <ios>
1923
#include <iterator>
2024
#include <limits>
2125
#include <memory>
2226
#include <string>
27+
#include <unordered_map>
2328
#include <utility>
2429
#include <variant>
2530
#include <vector>
@@ -36,6 +41,7 @@ limitations under the License.
3641
#include "absl/strings/str_format.h"
3742
#include "absl/strings/str_replace.h"
3843
#include "absl/strings/string_view.h"
44+
#include "absl/types/span.h"
3945
#include "xla/autotuning.pb.h"
4046
#include "xla/backends/gpu/codegen/triton/kernel_name_tracer.h"
4147
#include "xla/backends/gpu/codegen/triton/test_utils.h"
@@ -53,6 +59,7 @@ limitations under the License.
5359
#include "xla/service/hlo_module_config.h"
5460
#include "xla/stream_executor/cuda/cuda_compute_capability.h"
5561
#include "xla/stream_executor/device_description.h"
62+
#include "xla/tests/test_utils.h"
5663
#include "xla/tsl/lib/core/status_test_util.h"
5764
#include "xla/tsl/platform/statusor.h"
5865
#include "xla/xla.pb.h"
@@ -1474,6 +1481,129 @@ INSTANTIATE_TEST_SUITE_P(
14741481
PC::ALG_DOT_TF32_TF32_F32_X3, PC::ALG_DOT_F64_F64_F64, PC::ALG_UNSET}),
14751482
AlgorithmTestParamToString);
14761483

1484+
class PrecisionTestsForTriton : public TritonAlgorithmTest,
1485+
public NumericTestsArguments,
1486+
public WithParamInterface<PC::Algorithm> {
1487+
public:
1488+
PrecisionTestsForTriton() : TritonAlgorithmTest() {
1489+
algorithm_ = AlgorithmToString(GetParam());
1490+
}
1491+
1492+
std::string test_hlo_text() const {
1493+
return absl::StrReplaceAll(kHloText, {{"${test_name}", HloModuleTestName()},
1494+
{"${algorithm}", algorithm_}});
1495+
}
1496+
std::string reference_hlo_text() const {
1497+
return absl::StrReplaceAll(kHloText, {{"${test_name}", HloModuleTestName()},
1498+
{"${algorithm}", "dot_f32_f32_f32"}});
1499+
}
1500+
1501+
absl::string_view algorithm() const { return algorithm_; }
1502+
1503+
static constexpr absl::string_view kPattern = R"(CHECK: __triton_gemm)";
1504+
1505+
absl::StatusOr<std::unique_ptr<HloModule>> GetModule(
1506+
const std::string& hlo_text) {
1507+
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
1508+
GetOptimizedModule(hlo_text));
1509+
auto module_text = module->ToString();
1510+
TF_ASSIGN_OR_RETURN(auto ok, RunFileCheck(module_text, kPattern));
1511+
if (!ok) {
1512+
return absl::InternalError(
1513+
"The module does not contain the pattern __triton_gemm.");
1514+
}
1515+
return module;
1516+
}
1517+
1518+
private:
1519+
static constexpr absl::string_view kHloText = R"(
1520+
HloModule ${test_name}
1521+
1522+
ENTRY main {
1523+
p0 = f32[1024,1024]{1,0} parameter(0)
1524+
p1 = f32[1024,1024]{1,0} parameter(1)
1525+
ROOT %dot = f32[1024,1024]{1,0} dot(p0, p1),
1526+
lhs_contracting_dims={1},
1527+
rhs_contracting_dims={0},
1528+
algorithm=${algorithm}
1529+
}
1530+
)";
1531+
std::string algorithm_;
1532+
};
1533+
1534+
// Compares the mean relative error of the algorithm under test (vs. an F32
// reference dot) against a per-algorithm upper bound.
TEST_P(PrecisionTestsForTriton, PrecisionCheck) {
  if (std::holds_alternative<se::RocmComputeCapability>(GpuComputeComp())) {
    GTEST_SKIP() << "Precision tests is unknown for ROCM.";
  }

  TF_ASSERT_OK_AND_ASSIGN(auto test_module, GetModule(test_hlo_text()));
  TF_ASSERT_OK_AND_ASSIGN(auto ref_module, GetModule(reference_hlo_text()));

  // Prepare deterministic (fixed-seed) arguments shared by both runs.
  absl::StatusOr<std::vector<Literal>> fake_arguments = MakeFakeArguments(
      test_module.get(), /*pseudo_random=*/true, /*use_large_range=*/false,
      /*treat_gte_as_data_formatting=*/false, 23);
  // gtest-friendly failure instead of CHECK_OK, which aborts the whole
  // test binary.
  TF_ASSERT_OK(fake_arguments.status());

  // Make all inputs non-negative so the reference outputs stay away from
  // zero and the relative error below is well defined.
  for (Literal& literal : *fake_arguments) {
    literal.MutableEachCell<float>(
        [](absl::Span<const int64_t> /*indices*/, float value) {
          return std::abs(value);
        });
  }
  // The literals are already mutable; no const_cast needed.
  std::vector<Literal*> fake_argument_ptrs;
  absl::c_transform(*fake_arguments, std::back_inserter(fake_argument_ptrs),
                    [](Literal& literal) { return &literal; });

  // Run the test and reference modules on identical inputs.
  TF_ASSERT_OK_AND_ASSIGN(
      auto test_result,
      test_runner().Execute(std::move(test_module), fake_argument_ptrs, false));
  TF_ASSERT_OK_AND_ASSIGN(
      auto ref_result,
      test_runner().Execute(std::move(ref_module), fake_argument_ptrs, false));

  // Mean absolute and mean relative errors over all output elements.
  absl::Span<const float> test_data = test_result.data<float>();
  absl::Span<const float> ref_data = ref_result.data<float>();
  float abs_error = 0.0f;
  float rel_error = 0.0f;
  for (size_t i = 0; i < test_data.size(); ++i) {
    abs_error += std::abs(test_data[i] - ref_data[i]);
    rel_error += std::abs((test_data[i] - ref_data[i]) / ref_data[i]);
  }
  abs_error /= test_data.size();
  rel_error /= test_data.size();

  // Per-algorithm upper bounds for the mean relative error.
  static const std::unordered_map<PC::Algorithm, float> kMaxMeanRelError = {
      {PC::ALG_DOT_BF16_BF16_F32, 6e-5},
      {PC::ALG_DOT_TF32_TF32_F32, 2e-5},
      {PC::ALG_DOT_BF16_BF16_F32_X3, 2e-5},
      {PC::ALG_DOT_BF16_BF16_F32_X6, 4e-7},
      {PC::ALG_DOT_BF16_BF16_F32_X9, 4e-7},
      {PC::ALG_DOT_TF32_TF32_F32_X3, 5e-7}};

  // Look the bound up with find(), and do it BEFORE any logging: the original
  // code logged max_mean_rel_error[GetParam()] first, and operator[]
  // default-inserted a 0.0f bound for an unknown algorithm, which made the
  // "No precision test" assertion below unreachable.
  const auto bound_it = kMaxMeanRelError.find(GetParam());
  ASSERT_TRUE(bound_it != kMaxMeanRelError.end())
      << "No precision test for algorithm " << algorithm();

  // Informational output belongs at INFO, not ERROR.
  LOG(INFO) << "mean(abs_error): " << abs_error;
  LOG(INFO) << "mean(rel_error): " << std::fixed << std::setprecision(9)
            << rel_error;
  LOG(INFO) << "max_mean_rel_error: " << std::fixed << std::setprecision(9)
            << bound_it->second;

  EXPECT_LT(rel_error, bound_it->second) << "mean(rel_error) is too high.";
}
1597+
1598+
// Instantiate the precision suite for every algorithm that has a bound in
// the test above. ::testing::Values(...) is equivalent to ValuesIn({...}).
INSTANTIATE_TEST_SUITE_P(
    PrecisionTestsForTriton, PrecisionTestsForTriton,
    ::testing::Values(PC::ALG_DOT_TF32_TF32_F32, PC::ALG_DOT_TF32_TF32_F32_X3,
                      PC::ALG_DOT_BF16_BF16_F32, PC::ALG_DOT_BF16_BF16_F32_X3,
                      PC::ALG_DOT_BF16_BF16_F32_X6,
                      PC::ALG_DOT_BF16_BF16_F32_X9),
    AlgorithmTestParamToString);
1606+
14771607
} // namespace
14781608
} // namespace gpu
14791609
} // namespace xla

third_party/xla/xla/backends/gpu/codegen/triton/transforms/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ cc_library(
3737
"generalize_kernel_signature.cc",
3838
"int4_passes.cc",
3939
"prevent_mmav3_loop_unrolling_pass.cc",
40+
"round_f32_to_tf32_for_tf32_dot_pass.cc",
4041
"triton_xla_extract_insert_to_triton_pass.cc",
4142
],
4243
hdrs = ["passes.h"],

third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ std::unique_ptr<mlir::Pass> CreateTritonXLAExtractInsertToTritonPass(
3838
std::unique_ptr<mlir::Pass> CreateGeneralizeKernelSignaturePass();
3939
std::unique_ptr<mlir::Pass> CreatePreventMmaV3LoopUnrollingPass();
4040
std::unique_ptr<mlir::Pass> CreateInt4ToPackedInt4RewritePass();
41+
std::unique_ptr<mlir::Pass> CreateRoundF32ToTF32ForTf32DotRewritePass();
4142

4243
// Returns true if the `op` contains an operation in it's regions that satisfies
4344
// the `fn`.

third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,16 @@ def LoadInt4RewritePass
7070
let constructor = "CreateInt4ToPackedInt4RewritePass()";
7171
}
7272

73+
def RoundF32ToTF32ForTf32DotRewritePass
74+
: Pass<"round-f32-to-tf32-for-tf32-dot-rewrite", "mlir::ModuleOp"> {
75+
let summary = "dot with tf32 algorithm requires explicit rounding.";
76+
let description = [{
77+
This pass adds explicit rounding from f32 to tf32 for the dot with tf32 algorithm.
78+
This is required because mma instruction does not have explicit rounding and
79+
by default does truncation. As a result, the dot with tf32 algorithm has too
80+
small precision. It is even less than for the dot with BF16 arguments.
81+
}];
82+
let constructor = "CreateRoundF32ToTF32ForTf32DotRewritePass()";
83+
}
84+
7385
#endif // XLA_BACKENDS_GPU_CODEGEN_TRITON_PASSES_TD_
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/* Copyright 2025 The OpenXLA Authors.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include <memory>
17+
#include <utility>
18+
19+
#include "mlir/IR/BuiltinAttributes.h"
20+
#include "mlir/IR/BuiltinTypeInterfaces.h"
21+
#include "mlir/IR/BuiltinTypes.h"
22+
#include "mlir/IR/Operation.h"
23+
#include "mlir/IR/OperationSupport.h"
24+
#include "mlir/IR/PatternMatch.h"
25+
#include "mlir/IR/Types.h"
26+
#include "mlir/IR/Value.h"
27+
#include "mlir/Pass/Pass.h"
28+
#include "mlir/Support/LLVM.h"
29+
#include "mlir/Support/LogicalResult.h"
30+
#include "mlir/Transforms/DialectConversion.h"
31+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
32+
#include "triton/Dialect/Triton/IR/Dialect.h"
33+
34+
namespace mlir::triton::xla {
35+
36+
namespace mt = ::mlir::triton;
37+
38+
#define GEN_PASS_DEF_ROUNDF32TOTF32FORTF32DOTREWRITEPASS
39+
#include "xla/backends/gpu/codegen/triton/transforms/passes.h.inc"
40+
41+
namespace {
42+
43+
// Rewrites tt.dot ops that run in TF32 precision with F32 operands so that
// both operands are explicitly rounded to TF32 (round-to-nearest via the PTX
// `cvt.rna.tf32.f32` instruction) before the dot. Without this, the mma
// lowering truncates F32 -> TF32, losing precision.
class Tf32DotPattern : public OpRewritePattern<mt::DotOp> {
 public:
  // The inheriting constructor is sufficient; the original also declared a
  // redundant explicit MLIRContext* constructor.
  using OpRewritePattern<mt::DotOp>::OpRewritePattern;

  mlir::LogicalResult matchAndRewrite(
      mt::DotOp op, PatternRewriter &rewriter) const override {
    // Marker attribute set on the rewritten dot. The replacement dot still
    // matches this pattern, so without the marker the greedy driver would
    // rewrite it again and never terminate.
    constexpr auto kTf32ArgsRounded = "tf32_arguments_rounded";
    if (op.getInputPrecision() != mt::InputPrecision::TF32) return failure();
    if (!op.getA().getType().getElementType().isF32()) return failure();
    if (!op.getB().getType().getElementType().isF32()) return failure();
    if (op->hasAttr(kTf32ArgsRounded)) return failure();

    // Emits inline PTX performing a round-to-nearest F32 -> TF32 conversion;
    // the result keeps the F32 storage type expected by the dot.
    auto f32_to_tf32 = [&](Value value) -> Value {
      return rewriter
          .create<ElementwiseInlineAsmOp>(
              op.getLoc(), value.getType(), "cvt.rna.tf32.f32 $0, $1;", "=r,r",
              /*isPure=*/true, /*pack=*/1, ArrayRef<Value>{value})
          ->getResult(0);
    };
    Value lhs = f32_to_tf32(op.getA());
    Value rhs = f32_to_tf32(op.getB());
    auto dot = rewriter.replaceOpWithNewOp<mt::DotOp>(
        op, op.getC().getType(), lhs, rhs, op.getC(), mt::InputPrecision::TF32,
        /*maxNumImpreciseAcc=*/0);
    dot->setAttr(kTf32ArgsRounded, rewriter.getUnitAttr());

    return success();
  }
};
75+
76+
struct RoundF32ToTF32ForTf32DotRewritePass
77+
: public impl::RoundF32ToTF32ForTf32DotRewritePassBase<
78+
RoundF32ToTF32ForTf32DotRewritePass> {
79+
void runOnOperation() override {
80+
auto module = getOperation();
81+
RewritePatternSet patterns(&getContext(),
82+
std::make_unique<Tf32DotPattern>(&getContext()));
83+
if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
84+
signalPassFailure();
85+
}
86+
}
87+
};
88+
89+
} // namespace
90+
91+
std::unique_ptr<Pass> CreateRoundF32ToTF32ForTf32DotRewritePass() {
92+
return std::make_unique<RoundF32ToTF32ForTf32DotRewritePass>();
93+
}
94+
95+
} // namespace mlir::triton::xla

0 commit comments

Comments
 (0)