Skip to content

Commit

Permalink
[XLA:GPU] Rename the debug flag xla_gpu_enable_address_computation_fu…
Browse files Browse the repository at this point in the history
…sion to xla_gpu_enable_dynamic_slice_fusion for consistency.

"AddressComputation" is a confusing name: the pass simply fuses dynamic-slice (and dynamic-update-slice) operations into other thunks via buffer assignment tricks.

PiperOrigin-RevId: 657304729
  • Loading branch information
sgerrard authored and copybara-github committed Jul 29, 2024
1 parent e6ef8a6 commit a517228
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 32 deletions.
9 changes: 4 additions & 5 deletions xla/debug_options_flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_enable_dumping(true);

opts.set_xla_gpu_enable_custom_fusions(false);
opts.set_xla_gpu_enable_address_computation_fusion(true);
opts.set_xla_gpu_enable_dynamic_slice_fusion(true);
opts.set_xla_gpu_nccl_termination_timeout_seconds(-1);
opts.set_xla_gpu_enable_shared_constants(true);
opts.set_xla_gpu_enable_nccl_user_buffers(false);
Expand Down Expand Up @@ -1298,10 +1298,9 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
"expression. Default is all custom fusions registerered in a current "
"process."));
flag_list->push_back(tsl::Flag(
"xla_gpu_enable_address_computation_fusion",
bool_setter_for(
&DebugOptions::set_xla_gpu_enable_address_computation_fusion),
debug_options->xla_gpu_enable_address_computation_fusion(),
"xla_gpu_enable_dynamic_slice_fusion",
bool_setter_for(&DebugOptions::set_xla_gpu_enable_dynamic_slice_fusion),
debug_options->xla_gpu_enable_dynamic_slice_fusion(),
"Whether to enable XLA address computation fusion"));
flag_list->push_back(tsl::Flag(
"xla_gpu_nccl_termination_timeout_seconds",
Expand Down
10 changes: 5 additions & 5 deletions xla/service/gpu/dynamic_slice_fusion_rewriter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ TEST_F(DynamicSliceFusionRewriterTest, SimpleCustomCall) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -990,7 +990,7 @@ TEST_F(DynamicSliceFusionRewriterTest, SimpleCustomCallLegacy) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1050,7 +1050,7 @@ TEST_F(DynamicSliceFusionRewriterTest, TupleSliceCustomCallLegacy) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1122,7 +1122,7 @@ TEST_F(DynamicSliceFusionRewriterTest, TupledOutputCustomCallLegacy) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1183,7 +1183,7 @@ TEST_F(DynamicSliceFusionRewriterTest, UnalignedSlice) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down
32 changes: 16 additions & 16 deletions xla/service/gpu/fusions/dynamic_slice_fusion_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -867,7 +867,7 @@ TEST_F(DynamicSliceFusionTest, CustomCallSimple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1010,12 +1010,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallWithTuple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/true);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1059,12 +1059,12 @@ TEST_F(DynamicSliceFusionTest, NilTuple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1103,12 +1103,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallLegacyAPI) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -1141,12 +1141,12 @@ TEST_F(DynamicSliceFusionTest, NilTupleLegacyAPI) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -2460,7 +2460,7 @@ TEST_F(DynamicSliceFusionTest, DynamicCustomCallSimple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -2532,12 +2532,12 @@ TEST_F(DynamicSliceFusionTest, DynamicCustomCallWithTuple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/true);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -2639,12 +2639,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallDUS) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down Expand Up @@ -2735,12 +2735,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallDUSTuple) {
xla::ProgramShape(computation.proto().host_program_shape()),
/*ignore_layouts=*/false);
DebugOptions debug_options = GetDebugOptionsForTest();
debug_options.set_xla_gpu_enable_address_computation_fusion(false);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));

debug_options.set_xla_gpu_enable_address_computation_fusion(true);
debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true);
hlo_config.set_debug_options(debug_options);
TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
computation.proto(), hlo_config));
Expand Down
2 changes: 1 addition & 1 deletion xla/service/gpu/gpu_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1255,7 +1255,7 @@ absl::Status GpuCompiler::OptimizeHloModule(
// This is a "low effort, high impact" fusion that should be run first.
if (hlo_module->config()
.debug_options()
.xla_gpu_enable_address_computation_fusion()) {
.xla_gpu_enable_dynamic_slice_fusion()) {
HloPassPipeline pipeline("dynamic-slice");
TF_ASSIGN_OR_RETURN(se::Platform * platform,
se::PlatformManager::PlatformWithId(PlatformId()));
Expand Down
6 changes: 2 additions & 4 deletions xla/service/gpu/gpu_compiler_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -428,8 +428,7 @@ ENTRY main {

HloModuleConfig config;
DebugOptions triton_enabled_debug_options = GetDebugOptionsForTest();
triton_enabled_debug_options.set_xla_gpu_enable_address_computation_fusion(
false);
triton_enabled_debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
triton_enabled_debug_options
.set_xla_gpu_require_complete_aot_autotune_results(true);
config.set_debug_options(triton_enabled_debug_options);
Expand All @@ -448,8 +447,7 @@ ENTRY main {
GetOptimizedModule(std::move(module)));
AutotunerUtil::ClearAutotuneResults();
DebugOptions triton_disabled_debug_options = GetDebugOptionsForTest();
triton_disabled_debug_options.set_xla_gpu_enable_address_computation_fusion(
false);
triton_disabled_debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false);
triton_disabled_debug_options.set_xla_gpu_enable_triton_gemm(false);
config.set_debug_options(triton_disabled_debug_options);
TF_ASSERT_OK_AND_ASSIGN(module,
Expand Down
2 changes: 1 addition & 1 deletion xla/xla.proto
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ message DebugOptions {

// Enables dynamic slice fusion (formerly called "address computation
// fusion") to optimize dynamic-slice and dynamic-update-slice operations
// around library calls.
bool xla_gpu_enable_address_computation_fusion = 105;
bool xla_gpu_enable_dynamic_slice_fusion = 105;

reserved 233; // was xla_gpu_enable_gpu2_runtime
reserved 234; // was xla_gpu_enable_gpu2_hal
Expand Down

0 comments on commit a517228

Please sign in to comment.