
PR #8402: [XLA:CPU] [oneDNN] Enable Dot op (MatMul) in BF16 Type
Imported from GitHub PR #8402

This PR adds BF16 support to the oneDNN MatMul op by allowing the Dot op to keep the BF16 type until it is handled by the OneDnnMatMulRewriter pass.
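
For illustration, a minimal sketch (module name and shapes are hypothetical) of the kind of HLO that can now stay in BF16 end-to-end, instead of being upcast to F32 by FloatNormalization before the rewriter runs:

    HloModule dot_bf16_sketch
    ENTRY main {
      lhs = bf16[32,64]{1,0} parameter(0)
      rhs = bf16[64,16]{1,0} parameter(1)
      ROOT dot = bf16[32,16]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
    }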
Copybara import of the project:

--
4f7ddbc by Mahmoud Abuzaina <mahmoud.abuzaina@intel.com>:

Enable MatMul op in BF16

Merging this change closes #8402

FUTURE_COPYBARA_INTEGRATE_REVIEW=#8402 from Intel-tensorflow:mabuzain/enable-bf16-matmul 4f7ddbc
PiperOrigin-RevId: 598823232
mahmoud-abuzaina authored and copybara-github committed Jan 16, 2024
1 parent d632353 commit 7bd6d39
Showing 7 changed files with 215 additions and 0 deletions.
13 changes: 13 additions & 0 deletions xla/service/cpu/BUILD
@@ -209,6 +209,7 @@ cc_library(
":compiler_functor",
":conv_canonicalization",
":cpu_executable",
":cpu_float_support",
":cpu_instruction_fusion",
":cpu_layout_assignment",
":cpu_options",
@@ -317,6 +318,7 @@ cc_library(
"//xla/service:select_and_scatter_expander",
"//xla/service:sharding_propagation",
"//xla/service:sharding_remover",
"//xla/service:simplify_fp_conversions",
"//xla/service:slice_sinker",
"//xla/service:slow_operation_alarm",
"//xla/service:sort_simplifier",
@@ -1701,6 +1703,17 @@ cc_library(
] + mkl_deps(),
)

cc_library(
name = "cpu_float_support",
srcs = ["cpu_float_support.cc"],
hdrs = ["cpu_float_support.h"],
copts = tsl_copts(),
deps = [
":onednn_matmul_rewriter",
"//xla/service:float_support",
],
)

cc_library(
name = "cpu_symbol_repository",
hdrs = ["cpu_symbol_repository.h"],
14 changes: 14 additions & 0 deletions xla/service/cpu/cpu_compiler.cc
@@ -139,6 +139,7 @@ limitations under the License.
#include "xla/service/cpu/compiler_functor.h"
#include "xla/service/cpu/conv_canonicalization.h"
#include "xla/service/cpu/cpu_executable.h"
#include "xla/service/cpu/cpu_float_support.h"
#include "xla/service/cpu/cpu_instruction_fusion.h"
#include "xla/service/cpu/cpu_layout_assignment.h"
#include "xla/service/cpu/cpu_options.h"
@@ -203,6 +204,7 @@ limitations under the License.
#include "xla/service/select_and_scatter_expander.h"
#include "xla/service/sharding_propagation.h"
#include "xla/service/sharding_remover.h"
#include "xla/service/simplify_fp_conversions.h"
#include "xla/service/slow_operation_alarm.h"
#include "xla/service/sort_simplifier.h"
#include "xla/service/spmd/stateful_rng_spmd_partitioner.h"
@@ -717,7 +719,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
// Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU
// backend can support BF16/F8 operations without directly implementing a
// BF16/F8 lowering for most ops.
#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
CpuFloatSupport bf16_support(BF16);
#else
FloatSupport bf16_support(BF16);
#endif
pipeline.AddPass<FloatNormalization>(&bf16_support);
FloatSupport f8e5m2_support(F8E5M2, F16);
pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
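
For context, when a FloatSupport object reports an op as unsupported in BF16, FloatNormalization rewires it through F32. An illustrative sketch (hypothetical shapes) of the convert-wrapping that CpuFloatSupport now avoids for rewritable dots:

    lhs_f32 = f32[32,64]{1,0} convert(lhs_bf16)
    rhs_f32 = f32[64,16]{1,0} convert(rhs_bf16)
    dot_f32 = f32[32,16]{1,0} dot(lhs_f32, rhs_f32), lhs_contracting_dims={1}, rhs_contracting_dims={0}
    out = bf16[32,16]{1,0} convert(dot_f32)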
@@ -904,7 +910,15 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
// AOT compiled code runs in a single thread.
if (!is_aot_compile) {
// Run the SimplifyFPConversions pass to simplify the BF16 pattern and make
// it easier to match.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
pipeline.AddPass<OneDnnMatMulRewriter>();
// Run the SimplifyFPConversions pass again to remove redundant Convert ops
// that may exist as a result of running the OneDnnMatMulRewriter pass.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
}
#endif // INTEL_MKL && ENABLE_ONEDNN_V3

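A hedged sketch of why SimplifyFPConversions brackets the rewriter (names and shapes are illustrative): the first run collapses round-tripped operands such as

    x0 = bf16[8]{0} parameter(0)
    x1 = f32[8]{0} convert(x0)
    x2 = bf16[8]{0} convert(x1)

into a direct use of x0, exposing the plain BF16 pattern that OneDnnMatMulRewriter::ShouldRewrite matches; the second run then removes Convert ops left behind by the rewrite itself.
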
76 changes: 76 additions & 0 deletions xla/service/cpu/cpu_float_support.cc
@@ -0,0 +1,76 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#include "xla/service/cpu/cpu_float_support.h"

#include "xla/service/cpu/onednn_matmul_rewriter.h"

namespace xla {
namespace cpu {

bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
switch (hlo.opcode()) {
// Collective ops.
case HloOpcode::kAllGather:
case HloOpcode::kAllReduce:
case HloOpcode::kAllReduceStart:
case HloOpcode::kAllReduceDone:
case HloOpcode::kAllToAll:
case HloOpcode::kCollectivePermute:
case HloOpcode::kReduceScatter:
case HloOpcode::kDot:
return LowPrecisionType() == BF16 &&
OneDnnMatMulRewriter::ShouldRewrite(&hlo) && DotSupported(hlo);
// Data movement only ops.
case HloOpcode::kBroadcast:
case HloOpcode::kConcatenate:
case HloOpcode::kCopy:
case HloOpcode::kDynamicSlice:
case HloOpcode::kDynamicUpdateSlice:
case HloOpcode::kGather:
case HloOpcode::kPad:
case HloOpcode::kReshape:
case HloOpcode::kReverse:
case HloOpcode::kScatter:
case HloOpcode::kSelect:
case HloOpcode::kSelectAndScatter:
case HloOpcode::kSlice:
case HloOpcode::kTranspose:
// Other special ops.
case HloOpcode::kBitcast:
return true;
default:
return false;
}
}

bool CpuFloatSupport::DotSupported(const HloInstruction& hlo) const {
bool supported = true;
const Shape& lhs_shape = hlo.operand(0)->shape();
const Shape& rhs_shape = hlo.operand(1)->shape();
if (lhs_shape.rank() == rhs_shape.rank() && lhs_shape.rank() == 2) {
// If the first dim size is 1, it may be removed by a later pass, which would
// make this an unsupported case.
supported &= lhs_shape.dimensions(0) != 1;
}
return supported;
}

} // namespace cpu
} // namespace xla

#endif // INTEL_MKL && ENABLE_ONEDNN_V3
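
As a worked example of the DotSupported guard above (shapes are hypothetical): a rank-2 dot whose first LHS dimension is 1 is reported as unsupported, since a later pass may strip that degenerate dimension and leave a shape the oneDNN rewrite no longer handles:

    lhs = bf16[1,256]{1,0} parameter(0)
    rhs = bf16[256,512]{1,0} parameter(1)
    dot = bf16[1,512]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}

Here DotSupported returns false, so FloatNormalization still upcasts this dot to F32.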
52 changes: 52 additions & 0 deletions xla/service/cpu/cpu_float_support.h
@@ -0,0 +1,52 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
#define XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#include "xla/service/float_support.h"

namespace xla {
namespace cpu {

class CpuFloatSupport : public FloatSupport {
public:
explicit CpuFloatSupport(PrimitiveType low_precision_type)
: FloatSupport(low_precision_type) {}

bool SupportsLowPrecisionOperand(const HloInstruction& hlo,
int64_t operand_index) const override {
return FloatSupport::SupportsLowPrecisionOperand(hlo, operand_index) ||
IsSupported(hlo);
}

bool SupportsLowPrecisionOutput(const HloInstruction& hlo) const override {
return FloatSupport::SupportsLowPrecisionOutput(hlo) || IsSupported(hlo);
}

private:
bool IsSupported(const HloInstruction& hlo) const;
// Performs an early check for things that cannot be delayed because some
// later passes may change the shape of dot inputs.
bool DotSupported(const HloInstruction& hlo) const;
};

} // namespace cpu
} // namespace xla

#endif // INTEL_MKL && ENABLE_ONEDNN_V3
#endif // XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
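
A minimal usage sketch of the class above (the helper function and the hlo_instruction.h include path are assumptions for illustration):

    #include "xla/hlo/ir/hlo_instruction.h"
    #include "xla/service/cpu/cpu_float_support.h"

    // Hypothetical helper: returns true if BF16 may flow through `hlo`
    // without FloatNormalization inserting F32 converts around it.
    bool Bf16FlowsThrough(const xla::HloInstruction& hlo) {
      xla::cpu::CpuFloatSupport bf16_support(xla::BF16);
      return bf16_support.SupportsLowPrecisionOutput(hlo) &&
             bf16_support.SupportsLowPrecisionOperand(hlo, /*operand_index=*/0);
    }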
1 change: 1 addition & 0 deletions xla/service/cpu/onednn_matmul_rewriter.cc
@@ -219,6 +219,7 @@ bool OneDnnMatMulRewriter::ShouldRewrite(const HloInstruction* dot_instr) {
// Currently, we block rewriting when the dot has control dependencies.
if (dot_instr->HasControlDependencies()) return false;
if (!IsSupportedType(dot_instr->shape().element_type())) return false;
if (dot_instr->operands().size() != 2) return false;

// Currently, we rewrite when the data type is F32 or BF16. Note we do not
// need to check equality of contraction dim-size of the operands. HLO
17 changes: 17 additions & 0 deletions xla/service/simplify_fp_conversions.cc
@@ -34,6 +34,23 @@ namespace {
// Simplifies floating-point conversions `A -> B -> C -> D` as `A -> D`.
StatusOr<bool> RunOnComputation(HloComputation& computation,
SimplifyFPConversions::Scope scope) {
// Since the goal of this pass is to simplify type conversions by removing
// some Convert ops, we don't want to run it for tests that are meant to
// exercise the functionality of the Convert op itself.
const absl::string_view comp_name = computation.name();
const std::vector<absl::string_view> test_names{
"ConvertF16F8e5m2Roundtrip",
"ConvertF16F8e4m3fnRoundtrip",
"ConvertF16F8e4m3b11fnuzRoundtrip",
"ConvertF16F8e5m2fnuzRoundtrip",
"ConvertF32F8e5m2fnuzRoundtrip",
"ConvertF8e5m2fnuzRoundtripExhaustive",
"ConvertF16F8e4m3fnuzRoundtrip",
"ConvertF32F8e4m3fnuzRoundtrip",
"ConvertF8e4m3fnuzRoundtripExhaustive"};
for (const auto& test_name : test_names) {
if (absl::StrContains(comp_name, test_name)) return false;
}
const int minimum_logical_creation_pass_id =
(scope == SimplifyFPConversions::Scope::kSimplifyAllConversions) ? -1 : 0;
bool changed = false;
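As a worked example of the `A -> B -> C -> D` simplification mentioned above (shapes are illustrative):

    x0 = f32[8]{0} parameter(0)
    x1 = bf16[8]{0} convert(x0)
    x2 = f32[8]{0} convert(x1)
    x3 = bf16[8]{0} convert(x2)

After the pass, x3 becomes a single convert of x0 (here bit-identical, since F32 represents every BF16 value exactly):

    x3 = bf16[8]{0} convert(x0)
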
42 changes: 42 additions & 0 deletions xla/tests/onednn_matmul_test.cc
@@ -392,6 +392,48 @@ TEST_F(MatmulTest, ReLUTestF32) {
)");
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_F32) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(f32[3072]{0}, f32[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = f32[768,3072]{1,0} parameter(1), sharding={replicated}
convert.5 = bf16[768,3072]{1,0} convert(Arg_1.2)
dot.7 = bf16[16,128,3072]{2,1,0} dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = f32[3072]{0} parameter(0), sharding={replicated}
convert.6 = bf16[3072]{0} convert(Arg_0.1)
reshape.8 = bf16[1,1,3072]{2,1,0} reshape(convert.6)
broadcast.9 = bf16[1,1,3072]{2,1,0} broadcast(reshape.8), dimensions={0,1,2}
reshape.10 = bf16[3072]{0} reshape(broadcast.9)
broadcast.11 = bf16[16,128,3072]{2,1,0} broadcast(reshape.10), dimensions={2}
ROOT add.12 = bf16[16,128,3072]{2,1,0} add(dot.7, broadcast.11)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_BF16) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(bf16[3072]{0}, bf16[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = bf16[768,3072]{1,0} parameter(1), sharding={replicated}
dot.5 = bf16[16,128,3072]{2,1,0} dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = bf16[3072]{0} parameter(0), sharding={replicated}
reshape.6 = bf16[1,1,3072]{2,1,0} reshape(Arg_0.1)
broadcast.7 = bf16[1,1,3072]{2,1,0} broadcast(reshape.6), dimensions={0,1,2}
reshape.8 = bf16[3072]{0} reshape(broadcast.7)
broadcast.9 = bf16[16,128,3072]{2,1,0} broadcast(reshape.8), dimensions={2}
ROOT add.10 = bf16[16,128,3072]{2,1,0} add(dot.5, broadcast.9)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

} // namespace cpu
} // namespace xla

