[XLA:CPU] [oneDNN] Enable Dot op (MatMul) in BF16 Type #8402

Closed
13 changes: 13 additions & 0 deletions xla/service/cpu/BUILD
@@ -209,6 +209,7 @@ cc_library(
":compiler_functor",
":conv_canonicalization",
":cpu_executable",
":cpu_float_support",
":cpu_instruction_fusion",
":cpu_layout_assignment",
":cpu_options",
@@ -317,6 +318,7 @@ cc_library(
"//xla/service:select_and_scatter_expander",
"//xla/service:sharding_propagation",
"//xla/service:sharding_remover",
"//xla/service:simplify_fp_conversions",
"//xla/service:slice_sinker",
"//xla/service:slow_operation_alarm",
"//xla/service:sort_simplifier",
@@ -1701,6 +1703,17 @@ cc_library(
] + mkl_deps(),
)

cc_library(
name = "cpu_float_support",
srcs = ["cpu_float_support.cc"],
hdrs = ["cpu_float_support.h"],
copts = tsl_copts(),
deps = [
":onednn_matmul_rewriter",
"//xla/service:float_support",
],
)

cc_library(
name = "cpu_symbol_repository",
hdrs = ["cpu_symbol_repository.h"],
12 changes: 11 additions & 1 deletion xla/service/cpu/cpu_compiler.cc
@@ -139,6 +139,7 @@ limitations under the License.
#include "xla/service/cpu/compiler_functor.h"
#include "xla/service/cpu/conv_canonicalization.h"
#include "xla/service/cpu/cpu_executable.h"
#include "xla/service/cpu/cpu_float_support.h"
#include "xla/service/cpu/cpu_instruction_fusion.h"
#include "xla/service/cpu/cpu_layout_assignment.h"
#include "xla/service/cpu/cpu_options.h"
@@ -203,6 +204,7 @@ limitations under the License.
#include "xla/service/select_and_scatter_expander.h"
#include "xla/service/sharding_propagation.h"
#include "xla/service/sharding_remover.h"
#include "xla/service/simplify_fp_conversions.h"
#include "xla/service/slow_operation_alarm.h"
#include "xla/service/sort_simplifier.h"
#include "xla/service/spmd/stateful_rng_spmd_partitioner.h"
@@ -717,7 +719,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
// Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU
// backend can support BF16/F8 operations without directly implementing a
// BF16/F8 lowering for most ops.
FloatSupport bf16_support(BF16);
CpuFloatSupport bf16_support(BF16);
pipeline.AddPass<FloatNormalization>(&bf16_support);
FloatSupport f8e5m2_support(F8E5M2, F16);
pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
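
A minimal sketch of what this change does (hypothetical HLO; names and shapes invented for illustration): with the generic FloatSupport(BF16), FloatNormalization upcasts every BF16 dot to F32, whereas CpuFloatSupport reports a dot that OneDnnMatMulRewriter::ShouldRewrite accepts as supported, leaving it in BF16 for the rewriter later in the pipeline.

  // Hypothetical input:
  dot = bf16[16,32]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}

  // With the generic FloatSupport(BF16), FloatNormalization would emit:
  lhs.f32 = f32[16,64]{1,0} convert(lhs)
  rhs.f32 = f32[64,32]{1,0} convert(rhs)
  dot.f32 = f32[16,32]{1,0} dot(lhs.f32, rhs.f32), lhs_contracting_dims={1}, rhs_contracting_dims={0}
  dot.1 = bf16[16,32]{1,0} convert(dot.f32)

  // With CpuFloatSupport, the bf16 dot is left unchanged.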
@@ -904,7 +906,15 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
// AOT compiled code runs in a single thread.
if (!is_aot_compile) {
// Run the SimplifyFPConversions pass to simplify the BF16 pattern and make
// it easier to match.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
pipeline.AddPass<OneDnnMatMulRewriter>();
// Run the SimplifyFPConversions pass again to remove redundant Convert ops
// that may be left behind by the OneDnnMatMulRewriter pass.
pipeline.AddPass<SimplifyFPConversions>(
SimplifyFPConversions::Scope::kSimplifyAllConversions);
}
#endif // INTEL_MKL && ENABLE_ONEDNN_V3
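
The two SimplifyFPConversions runs serve different purposes around the rewriter: the first collapses convert ladders so that OneDnnMatMulRewriter sees a bare BF16 dot, and the second deletes redundant converts left behind by the rewrite. A hedged illustration of the first run (invented names; the pass rewrites a chain A -> B -> C -> D into a single A -> D convert):

  // Before the first SimplifyFPConversions run (hypothetical):
  a = f32[8,16]{1,0} parameter(0)
  a.1 = bf16[8,16]{1,0} convert(a)
  a.2 = f32[8,16]{1,0} convert(a.1)
  a.3 = bf16[8,16]{1,0} convert(a.2)
  dot = bf16[8,8]{1,0} dot(a.3, b), lhs_contracting_dims={1}, rhs_contracting_dims={0}

  // After: the ladder collapses to a single convert feeding the dot.
  a.3 = bf16[8,16]{1,0} convert(a)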

76 changes: 76 additions & 0 deletions xla/service/cpu/cpu_float_support.cc
@@ -0,0 +1,76 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#include "xla/service/cpu/cpu_float_support.h"

#include "xla/service/cpu/onednn_matmul_rewriter.h"

namespace xla {
namespace cpu {

bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
switch (hlo.opcode()) {
// Collective ops.
case HloOpcode::kAllGather:
case HloOpcode::kAllReduce:
case HloOpcode::kAllReduceStart:
case HloOpcode::kAllReduceDone:
case HloOpcode::kAllToAll:
case HloOpcode::kCollectivePermute:
case HloOpcode::kReduceScatter:
case HloOpcode::kDot:
return LowPrecisionType() == BF16 &&
OneDnnMatMulRewriter::ShouldRewrite(&hlo) && DotSupported(hlo);
// Data movement only ops.
case HloOpcode::kBroadcast:
case HloOpcode::kConcatenate:
case HloOpcode::kCopy:
case HloOpcode::kDynamicSlice:
case HloOpcode::kDynamicUpdateSlice:
case HloOpcode::kGather:
case HloOpcode::kPad:
case HloOpcode::kReshape:
case HloOpcode::kReverse:
case HloOpcode::kScatter:
case HloOpcode::kSelect:
case HloOpcode::kSelectAndScatter:
case HloOpcode::kSlice:
case HloOpcode::kTranspose:
// Other special ops.
case HloOpcode::kBitcast:
return true;
default:
return false;
}
}

bool CpuFloatSupport::DotSupported(const HloInstruction& hlo) const {
bool supported = true;
const Shape& lhs_shape = hlo.operand(0)->shape();
const Shape& rhs_shape = hlo.operand(1)->shape();
if (lhs_shape.rank() == rhs_shape.rank() && lhs_shape.rank() == 2) {
// If the size of the first dimension is 1, a later pass may remove it,
// which would make this an unsupported case.
supported &= lhs_shape.dimensions(0) != 1;
}
return supported;
}

} // namespace cpu
} // namespace xla

#endif // INTEL_MKL && ENABLE_ONEDNN_V3
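
The rank-2 guard in DotSupported is easiest to see with an example (hypothetical HLO; names invented): a 2D dot whose lhs has a leading dimension of size 1 is rejected up front, because a later pass may strip that dimension and leave a shape that the oneDNN rewrite no longer matches.

  // Rejected early, since lhs_shape.dimensions(0) == 1:
  lhs = bf16[1,512]{1,0} parameter(0)
  rhs = bf16[512,256]{1,0} parameter(1)
  dot = bf16[1,256]{1,0} dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}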
52 changes: 52 additions & 0 deletions xla/service/cpu/cpu_float_support.h
@@ -0,0 +1,52 @@
/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)

#ifndef XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
#define XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_

#include "xla/service/float_support.h"

namespace xla {
namespace cpu {

class CpuFloatSupport : public FloatSupport {
public:
explicit CpuFloatSupport(PrimitiveType low_precision_type)
: FloatSupport(low_precision_type) {}

bool SupportsLowPrecisionOperand(const HloInstruction& hlo,
int64_t operand_index) const override {
return FloatSupport::SupportsLowPrecisionOperand(hlo, operand_index) ||
IsSupported(hlo);
}

bool SupportsLowPrecisionOutput(const HloInstruction& hlo) const override {
return FloatSupport::SupportsLowPrecisionOutput(hlo) || IsSupported(hlo);
}

private:
bool IsSupported(const HloInstruction& hlo) const;
// Performs an early check for conditions that cannot be delayed, because
// some later passes may change the shape of the dot inputs.
bool DotSupported(const HloInstruction& hlo) const;
};

} // namespace cpu
} // namespace xla

#endif // XLA_SERVICE_CPU_CPU_FLOAT_SUPPORT_H_
#endif // INTEL_MKL && ENABLE_ONEDNN_V3
1 change: 1 addition & 0 deletions xla/service/cpu/onednn_matmul_rewriter.cc
@@ -219,6 +219,7 @@ bool OneDnnMatMulRewriter::ShouldRewrite(const HloInstruction* dot_instr) {
// Currently, we block rewriting when the dot has control dependencies.
if (dot_instr->HasControlDependencies()) return false;
if (!IsSupportedType(dot_instr->shape().element_type())) return false;
if (dot_instr->operands().size() != 2) return false;

// Currently, we rewrite when the data type is F32 or BF16. Note we do not
// need to check equality of contraction dim-size of the operands. HLO
17 changes: 17 additions & 0 deletions xla/service/simplify_fp_conversions.cc
@@ -34,6 +34,23 @@ namespace {
// Simplifies floating-point conversions `A -> B -> C -> D` as `A -> D`.
StatusOr<bool> RunOnComputation(HloComputation& computation,
SimplifyFPConversions::Scope scope) {
// Since the goal of this pass is to simplify type conversions by removing
// some Convert ops, we don't want to run this pass for tests that are meant
// to test the functionality of the Convert op itself.
const absl::string_view comp_name = computation.name();
const std::vector<absl::string_view> test_names{
"ConvertF16F8e5m2Roundtrip",
"ConvertF16F8e4m3fnRoundtrip",
"ConvertF16F8e4m3b11fnuzRoundtrip",
"ConvertF16F8e5m2fnuzRoundtrip",
"ConvertF32F8e5m2fnuzRoundtrip",
"ConvertF8e5m2fnuzRoundtripExhaustive",
"ConvertF16F8e4m3fnuzRoundtrip",
"ConvertF32F8e4m3fnuzRoundtrip",
"ConvertF8e4m3fnuzRoundtripExhaustive"};
for (const auto& test_name : test_names) {
if (absl::StrContains(comp_name, test_name)) return false;
}
const int minimum_logical_creation_pass_id =
(scope == SimplifyFPConversions::Scope::kSimplifyAllConversions) ? -1 : 0;
bool changed = false;
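The name-based exclusion exists because this pass can fold away the very conversions a roundtrip test exercises. A hedged sketch (hypothetical test HLO): under Scope::kSimplifyAllConversions, the chain below collapses into the identity on x, eliminating the F8 conversion whose rounding behavior the test is checking.

  x = f16[8]{0} parameter(0)
  to_f8 = f8e5m2[8]{0} convert(x)
  ROOT back = f16[8]{0} convert(to_f8)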
42 changes: 42 additions & 0 deletions xla/tests/onednn_matmul_test.cc
@@ -363,6 +363,48 @@ TEST_F(MatmulTest, ReLUTestF32) {
)");
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_F32) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(f32[3072]{0}, f32[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = f32[768,3072]{1,0} parameter(1), sharding={replicated}
convert.5 = bf16[768,3072]{1,0} convert(Arg_1.2)
dot.7 = bf16[16,128,3072]{2,1,0} dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = f32[3072]{0} parameter(0), sharding={replicated}
convert.6 = bf16[3072]{0} convert(Arg_0.1)
reshape.8 = bf16[1,1,3072]{2,1,0} reshape(convert.6)
broadcast.9 = bf16[1,1,3072]{2,1,0} broadcast(reshape.8), dimensions={0,1,2}
reshape.10 = bf16[3072]{0} reshape(broadcast.9)
broadcast.11 = bf16[16,128,3072]{2,1,0} broadcast(reshape.10), dimensions={2}
ROOT add.12 = bf16[16,128,3072]{2,1,0} add(dot.7, broadcast.11)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_BF16) {
const char* matmul_module_str = R"(
HloModule jit_apply, entry_computation_layout={(bf16[3072]{0}, bf16[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
ENTRY matmul.test.bf16 {
Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
Arg_1.2 = bf16[768,3072]{1,0} parameter(1), sharding={replicated}
dot.5 = bf16[16,128,3072]{2,1,0} dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
Arg_0.1 = bf16[3072]{0} parameter(0), sharding={replicated}
reshape.6 = bf16[1,1,3072]{2,1,0} reshape(Arg_0.1)
broadcast.7 = bf16[1,1,3072]{2,1,0} broadcast(reshape.6), dimensions={0,1,2}
reshape.8 = bf16[3072]{0} reshape(broadcast.7)
broadcast.9 = bf16[16,128,3072]{2,1,0} broadcast(reshape.8), dimensions={2}
ROOT add.10 = bf16[16,128,3072]{2,1,0} add(dot.5, broadcast.9)
})";

EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
}

} // namespace cpu
} // namespace xla
