
PR #8402: [XLA:CPU] [oneDNN] Enable Dot op (MatMul) in BF16 Type
Imported from GitHub PR #8402

This PR adds BF16 support to the oneDNN MatMul op by allowing the Dot op to keep the BF16 type until it is handled by the OneDnnMatMulRewriter pass.
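
For illustration, a minimal sketch (module name and shapes are hypothetical) of the kind of HLO that can now stay in BF16 end-to-end, instead of being upcast to F32 by FloatNormalization before the rewriter runs:

    HloModule dot_bf16_sketch
    ENTRY main {
      lhs = bf16[32,64]{1,0} parameter(0)
      rhs = bf16[64,16]{1,0} parameter(1)
      ROOT dot = bf16[32,16]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
    }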
Copybara import of the project:

--
4f7ddbc by Mahmoud Abuzaina <mahmoud.abuzaina@intel.com>:

Enable MatMul op in BF16

Merging this change closes #8402

FUTURE_COPYBARA_INTEGRATE_REVIEW=#8402 from Intel-tensorflow:mabuzain/enable-bf16-matmul 4f7ddbc
PiperOrigin-RevId: 598823232
mahmoud-abuzaina authored and copybara-github committed Jan 16, 2024
1 parent d632353 commit 7bd6d39
Showing 7 changed files with 215 additions and 0 deletions.
13 changes: 13 additions & 0 deletions xla/service/cpu/BUILD
@@ -209,6 +209,7 @@ cc_library(
":compiler_functor",
":conv_canonicalization",
":cpu_executable",
":cpu_float_support",
":cpu_instruction_fusion",
":cpu_layout_assignment",
":cpu_options",
@@ -317,6 +318,7 @@ cc_library(
"//xla/service:select_and_scatter_expander",
"//xla/service:sharding_propagation",
"//xla/service:sharding_remover",
"//xla/service:simplify_fp_conversions",
"//xla/service:slice_sinker",
"//xla/service:slow_operation_alarm",
"//xla/service:sort_simplifier",
@@ -1701,6 +1703,17 @@ cc_library(
] + mkl_deps(),
)

cc_library(
name = "cpu_float_support",
srcs = ["cpu_float_support.cc"],
hdrs = ["cpu_float_support.h"],
copts = tsl_copts(),
deps = [
":onednn_matmul_rewriter",
"//xla/service:float_support",
],
)

cc_library(
name = "cpu_symbol_repository",
hdrs = ["cpu_symbol_repository.h"],
14 changes: 14 additions & 0 deletions xla/service/cpu/cpu_compiler.cc
@@ -139,6 +139,7 @@ limitations under the License.
#include "xla/service/cpu/compiler_functor.h"
#include "xla/service/cpu/conv_canonicalization.h"
#include "xla/service/cpu/cpu_executable.h"
#include "xla/service/cpu/cpu_float_support.h"
#include "xla/service/cpu/cpu_instruction_fusion.h"
#include "xla/service/cpu/cpu_layout_assignment.h"
#include "xla/service/cpu/cpu_options.h"
@@ -203,6 +204,7 @@ limitations under the License.
#include "xla/service/select_and_scatter_expander.h"
#include "xla/service/sharding_propagation.h"
#include "xla/service/sharding_remover.h"
#include "xla/service/simplify_fp_conversions.h"
#include "xla/service/slow_operation_alarm.h"
#include "xla/service/sort_simplifier.h"
#include "xla/service/spmd/stateful_rng_spmd_partitioner.h"
@@ -717,7 +719,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
// Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU
// backend can support BF16/F8 operations without directly implementing a
// BF16/F8 lowering for most ops.
#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
CpuFloatSupport bf16_support(BF16);
#else
FloatSupport bf16_support(BF16);
#endif
pipeline.AddPass<FloatNormalization>(&bf16_support);
FloatSupport f8e5m2_support(F8E5M2, F16);
pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
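
For context, when a FloatSupport object reports an op as unsupported in BF16, FloatNormalization rewires it through F32. An illustrative sketch (hypothetical shapes) of the convert-wrapping that CpuFloatSupport now avoids for rewritable dots:

    lhs_f32 = f32[32,64]{1,0} convert(lhs_bf16)
    rhs_f32 = f32[64,16]{1,0} convert(rhs_bf16)
    dot_f32 = f32[32,16]{1,0} dot(lhs_f32, rhs_f32), lhs_contracting_dims={1}, rhs_contracting_dims={0}
    out = bf16[32,16]{1,0} convert(dot_f32)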
@@ -904,7 +910,15 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
// AOT compiled code runs in a single thread.
if (!is_aot_compile) {
// Run the SimplifyFPConversions pass to simplify the BF16 pattern and make
// it easier to match.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
pipeline.AddPass<OneDnnMatMulRewriter>();
// Run the SimplifyFPConversions pass again to remove redundant Convert ops
// that may exist as a result of running the OneDnnMatMulRewriter pass.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
}
#endif // INTEL_MKL && ENABLE_ONEDNN_V3

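A hedged sketch of why SimplifyFPConversions brackets the rewriter (names and shapes are illustrative): the first run collapses round-tripped operands such as

    x0 = bf16[8]{0} parameter(0)
    x1 = f32[8]{0} convert(x0)
    x2 = bf16[8]{0} convert(x1)

into a direct use of x0, exposing the plain BF16 pattern that OneDnnMatMulRewriter::ShouldRewrite matches; the second run then removes Convert ops left behind by the rewrite itself.
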
76 changes: 76 additions & 0 deletions xla/service/cpu/cpu_float_support.cc
@@ -0,0 +1,76 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#include "xla/service/cpu/cpu_float_support.h"

#include "xla/service/cpu/onednn_matmul_rewriter.h"

namespace xla {
namespace cpu {

bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
switch (hlo.opcode()) {
// Collective ops.
case HloOpcode::kAllGather:
case HloOpcode::kAllReduce:
case HloOpcode::kAllReduceStart:
case HloOpcode::kAllReduceDone:
case HloOpcode::kAllToAll:
case HloOpcode::kCollectivePermute:
case HloOpcode::kReduceScatter:
case HloOpcode::kDot:
return LowPrecisionType() == BF16 &&
OneDnnMatMulRewriter::ShouldRewrite(&hlo) && DotSupported(hlo);
// Data movement only ops.
case HloOpcode::kBroadcast:
case HloOpcode::kConcatenate:
case HloOpcode::kCopy:
case HloOpcode::kDynamicSlice:
case HloOpcode::kDynamicUpdateSlice:
case HloOpcode::kGather:
case HloOpcode::kPad:
case HloOpcode::kReshape:
case HloOpcode::kReverse:
case HloOpcode::kScatter:
case HloOpcode::kSelect:
case HloOpcode::kSelectAndScatter:
case HloOpcode::kSlice:
case HloOpcode::kTranspose:
// Other special ops.
case HloOpcode::kBitcast:
return true;
default:
return false;
}
}

bool CpuFloatSupport::DotSupported(const HloInstruction& hlo) const {
bool supported = true;
const Shape& lhs_shape = hlo.operand(0)->shape();
const Shape& rhs_shape = hlo.operand(1)->shape();
if (lhs_shape.rank() == rhs_shape.rank() && lhs_shape.rank() == 2) {
// If the first dim size is 1, it may be removed by a later pass, which would
// make this an unsupported case.
supported &= lhs_shape.dimensions(0) != 1;
}
return supported;
}

} // namespace cpu
} // namespace xla

#endif // INTEL_MKL && ENABLE_ONEDNN_V3
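
As a worked example of the DotSupported guard above (shapes are hypothetical): a rank-2 dot whose first LHS dimension is 1 is reported as unsupported, since a later pass may strip that degenerate dimension and leave a shape the oneDNN rewrite no longer handles:

    lhs = bf16[1,256]{1,0} parameter(0)
    rhs = bf16[256,512]{1,0} parameter(1)
    dot = bf16[1,512]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}

Here DotSupported returns false, so FloatNormalization still upcasts this dot to F32.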
52 changes: 52 additions & 0 deletions xla/service/cpu/cpu_float_support.h
@@ -0,0 +1,52 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
#define XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#include "xla/service/float_support.h"

namespace xla {
namespace cpu {

class CpuFloatSupport : public FloatSupport {
public:
explicit CpuFloatSupport(PrimitiveType low_precision_type)
: FloatSupport(low_precision_type) {}

bool SupportsLowPrecisionOperand(const HloInstruction& hlo,
int64_t operand_index) const override {
return FloatSupport::SupportsLowPrecisionOperand(hlo, operand_index) ||
IsSupported(hlo);
}

bool SupportsLowPrecisionOutput(const HloInstruction& hlo) const override {
return FloatSupport::SupportsLowPrecisionOutput(hlo) || IsSupported(hlo);
}

private:
bool IsSupported(const HloInstruction& hlo) const;
// Performs an early check for things that cannot be delayed because some
// later passes may change the shape of dot inputs.
bool DotSupported(const HloInstruction& hlo) const;
};

} // namespace cpu
} // namespace xla

#endif // INTEL_MKL && ENABLE_ONEDNN_V3
#endif // XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
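
A minimal usage sketch of the class above (the helper function and the hlo_instruction.h include path are assumptions for illustration):

    #include "xla/hlo/ir/hlo_instruction.h"
    #include "xla/service/cpu/cpu_float_support.h"

    // Hypothetical helper: returns true if BF16 may flow through `hlo`
    // without FloatNormalization inserting F32 converts around it.
    bool Bf16FlowsThrough(const xla::HloInstruction& hlo) {
      xla::cpu::CpuFloatSupport bf16_support(xla::BF16);
      return bf16_support.SupportsLowPrecisionOutput(hlo) &&
             bf16_support.SupportsLowPrecisionOperand(hlo, /*operand_index=*/0);
    }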
1 change: 1 addition & 0 deletions xla/service/cpu/onednn_matmul_rewriter.cc
@@ -219,6 +219,7 @@ bool OneDnnMatMulRewriter::ShouldRewrite(const HloInstruction* dot_instr) {
// Currently, we block rewriting when the dot has control dependencies.
if (dot_instr->HasControlDependencies()) return false;
if (!IsSupportedType(dot_instr->shape().element_type())) return false;
if (dot_instr->operands().size() != 2) return false;

// Currently, we rewrite when the data type is F32 or BF16. Note we do not
// need to check equality of contraction dim-size of the operands. HLO
17 changes: 17 additions & 0 deletions xla/service/simplify_fp_conversions.cc
@@ -34,6 +34,23 @@ namespace {
// Simplifies floating-point conversions `A -> B -> C -> D` as `A -> D`.
StatusOr<bool> RunOnComputation(HloComputation& computation,
SimplifyFPConversions::Scope scope) {
// Since the goal of this pass is to simplify type conversions by removing
// some Convert ops, we don't want to run it for tests that are meant to
// exercise the functionality of the Convert op itself.
const absl::string_view comp_name = computation.name();
const std::vector<absl::string_view> test_names{
"ConvertF16F8e5m2Roundtrip",
"ConvertF16F8e4m3fnRoundtrip",
"ConvertF16F8e4m3b11fnuzRoundtrip",
"ConvertF16F8e5m2fnuzRoundtrip",
"ConvertF32F8e5m2fnuzRoundtrip",
"ConvertF8e5m2fnuzRoundtripExhaustive",
"ConvertF16F8e4m3fnuzRoundtrip",
"ConvertF32F8e4m3fnuzRoundtrip",
"ConvertF8e4m3fnuzRoundtripExhaustive"};
for (const auto& test_name : test_names) {
if (absl::StrContains(comp_name, test_name)) return false;
}
const int minimum_logical_creation_pass_id =
(scope == SimplifyFPConversions::Scope::kSimplifyAllConversions) ? -1 : 0;
bool changed = false;
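As a worked example of the `A -> B -> C -> D` simplification mentioned above (shapes are illustrative):

    x0 = f32[8]{0} parameter(0)
    x1 = bf16[8]{0} convert(x0)
    x2 = f32[8]{0} convert(x1)
    x3 = bf16[8]{0} convert(x2)

After the pass, x3 becomes a single convert of x0 (here bit-identical, since F32 represents every BF16 value exactly):

    x3 = bf16[8]{0} convert(x0)
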
42 changes: 42 additions & 0 deletions xla/tests/onednn_matmul_test.cc
@@ -392,6 +392,48 @@ TEST_F(MatmulTest, ReLUTestF32) {
)");
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_F32) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(f32[3072]{0}, f32[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = f32[768,3072]{1,0} parameter(1), sharding={replicated}
convert.5 = bf16[768,3072]{1,0} convert(Arg_1.2)
dot.7 = bf16[16,128,3072]{2,1,0} dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = f32[3072]{0} parameter(0), sharding={replicated}
convert.6 = bf16[3072]{0} convert(Arg_0.1)
reshape.8 = bf16[1,1,3072]{2,1,0} reshape(convert.6)
broadcast.9 = bf16[1,1,3072]{2,1,0} broadcast(reshape.8), dimensions={0,1,2}
reshape.10 = bf16[3072]{0} reshape(broadcast.9)
broadcast.11 = bf16[16,128,3072]{2,1,0} broadcast(reshape.10), dimensions={2}
ROOT add.12 = bf16[16,128,3072]{2,1,0} add(dot.7, broadcast.11)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_BF16) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(bf16[3072]{0}, bf16[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = bf16[768,3072]{1,0} parameter(1), sharding={replicated}
dot.5 = bf16[16,128,3072]{2,1,0} dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = bf16[3072]{0} parameter(0), sharding={replicated}
reshape.6 = bf16[1,1,3072]{2,1,0} reshape(Arg_0.1)
broadcast.7 = bf16[1,1,3072]{2,1,0} broadcast(reshape.6), dimensions={0,1,2}
reshape.8 = bf16[3072]{0} reshape(broadcast.7)
broadcast.9 = bf16[16,128,3072]{2,1,0} broadcast(reshape.8), dimensions={2}
ROOT add.10 = bf16[16,128,3072]{2,1,0} add(dot.5, broadcast.9)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

} // namespace cpu
} // namespace xla

