From a517228de8d0ff179ff77c0ecb7acddb65ae24d6 Mon Sep 17 00:00:00 2001 From: Sara Smoot Date: Mon, 29 Jul 2024 13:21:24 -0700 Subject: [PATCH] [XLA:GPU] Rename the debug flag xla_gpu_enable_address_computation_fusion to xla_gpu_enable_dynamic_slice_fusion for consistency. "AddressComputation" is confusing; it simply fuses dynamic-slice (and dynamic-update-slice) operations into other thunks via buffer assignment tricks. PiperOrigin-RevId: 657304729 --- xla/debug_options_flags.cc | 9 +++--- .../gpu/dynamic_slice_fusion_rewriter_test.cc | 10 +++--- .../gpu/fusions/dynamic_slice_fusion_test.cc | 32 +++++++++---------- xla/service/gpu/gpu_compiler.cc | 2 +- xla/service/gpu/gpu_compiler_test.cc | 6 ++-- xla/xla.proto | 2 +- 6 files changed, 29 insertions(+), 32 deletions(-) diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc index c35ea757728c8..1b9a00e7078ad 100644 --- a/xla/debug_options_flags.cc +++ b/xla/debug_options_flags.cc @@ -144,7 +144,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_enable_dumping(true); opts.set_xla_gpu_enable_custom_fusions(false); - opts.set_xla_gpu_enable_address_computation_fusion(true); + opts.set_xla_gpu_enable_dynamic_slice_fusion(true); opts.set_xla_gpu_nccl_termination_timeout_seconds(-1); opts.set_xla_gpu_enable_shared_constants(true); opts.set_xla_gpu_enable_nccl_user_buffers(false); @@ -1298,10 +1298,9 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "expression. 
Default is all custom fusions registerered in a current " "process.")); flag_list->push_back(tsl::Flag( - "xla_gpu_enable_address_computation_fusion", - bool_setter_for( - &DebugOptions::set_xla_gpu_enable_address_computation_fusion), - debug_options->xla_gpu_enable_address_computation_fusion(), + "xla_gpu_enable_dynamic_slice_fusion", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_dynamic_slice_fusion), + debug_options->xla_gpu_enable_dynamic_slice_fusion(), "Whether to enable XLA address computation fusion")); flag_list->push_back(tsl::Flag( "xla_gpu_nccl_termination_timeout_seconds", diff --git a/xla/service/gpu/dynamic_slice_fusion_rewriter_test.cc b/xla/service/gpu/dynamic_slice_fusion_rewriter_test.cc index 3d3eef1e4a368..a539fb5e6ca5c 100644 --- a/xla/service/gpu/dynamic_slice_fusion_rewriter_test.cc +++ b/xla/service/gpu/dynamic_slice_fusion_rewriter_test.cc @@ -942,7 +942,7 @@ TEST_F(DynamicSliceFusionRewriterTest, SimpleCustomCall) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -990,7 +990,7 @@ TEST_F(DynamicSliceFusionRewriterTest, SimpleCustomCallLegacy) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1050,7 +1050,7 @@ TEST_F(DynamicSliceFusionRewriterTest, TupleSliceCustomCallLegacy) { 
xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1122,7 +1122,7 @@ TEST_F(DynamicSliceFusionRewriterTest, TupledOutputCustomCallLegacy) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1183,7 +1183,7 @@ TEST_F(DynamicSliceFusionRewriterTest, UnalignedSlice) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); diff --git a/xla/service/gpu/fusions/dynamic_slice_fusion_test.cc b/xla/service/gpu/fusions/dynamic_slice_fusion_test.cc index 954d4b656acb0..f53dc13077729 100644 --- a/xla/service/gpu/fusions/dynamic_slice_fusion_test.cc +++ b/xla/service/gpu/fusions/dynamic_slice_fusion_test.cc @@ -867,7 +867,7 @@ TEST_F(DynamicSliceFusionTest, CustomCallSimple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - 
debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1010,12 +1010,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallWithTuple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/true); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1059,12 +1059,12 @@ TEST_F(DynamicSliceFusionTest, NilTuple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1103,12 +1103,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallLegacyAPI) { xla::ProgramShape(computation.proto().host_program_shape()), 
/*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -1141,12 +1141,12 @@ TEST_F(DynamicSliceFusionTest, NilTupleLegacyAPI) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -2460,7 +2460,7 @@ TEST_F(DynamicSliceFusionTest, DynamicCustomCallSimple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -2532,12 +2532,12 @@ TEST_F(DynamicSliceFusionTest, 
DynamicCustomCallWithTuple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/true); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -2639,12 +2639,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallDUS) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); @@ -2735,12 +2735,12 @@ TEST_F(DynamicSliceFusionTest, CustomCallDUSTuple) { xla::ProgramShape(computation.proto().host_program_shape()), /*ignore_layouts=*/false); DebugOptions debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_enable_address_computation_fusion(false); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto( 
computation.proto(), hlo_config)); - debug_options.set_xla_gpu_enable_address_computation_fusion(true); + debug_options.set_xla_gpu_enable_dynamic_slice_fusion(true); hlo_config.set_debug_options(debug_options); TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto( computation.proto(), hlo_config)); diff --git a/xla/service/gpu/gpu_compiler.cc b/xla/service/gpu/gpu_compiler.cc index 15debb4020b85..73c3556019d6c 100644 --- a/xla/service/gpu/gpu_compiler.cc +++ b/xla/service/gpu/gpu_compiler.cc @@ -1255,7 +1255,7 @@ absl::Status GpuCompiler::OptimizeHloModule( // This is a "low effort, high impact" fusion that should be run first. if (hlo_module->config() .debug_options() - .xla_gpu_enable_address_computation_fusion()) { + .xla_gpu_enable_dynamic_slice_fusion()) { HloPassPipeline pipeline("dynamic-slice"); TF_ASSIGN_OR_RETURN(se::Platform * platform, se::PlatformManager::PlatformWithId(PlatformId())); diff --git a/xla/service/gpu/gpu_compiler_test.cc b/xla/service/gpu/gpu_compiler_test.cc index d95b411c7ac70..7e9e3a419890d 100644 --- a/xla/service/gpu/gpu_compiler_test.cc +++ b/xla/service/gpu/gpu_compiler_test.cc @@ -428,8 +428,7 @@ ENTRY main { HloModuleConfig config; DebugOptions triton_enabled_debug_options = GetDebugOptionsForTest(); - triton_enabled_debug_options.set_xla_gpu_enable_address_computation_fusion( - false); + triton_enabled_debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); triton_enabled_debug_options .set_xla_gpu_require_complete_aot_autotune_results(true); config.set_debug_options(triton_enabled_debug_options); @@ -448,8 +447,7 @@ ENTRY main { GetOptimizedModule(std::move(module))); AutotunerUtil::ClearAutotuneResults(); DebugOptions triton_disabled_debug_options = GetDebugOptionsForTest(); - triton_disabled_debug_options.set_xla_gpu_enable_address_computation_fusion( - false); + triton_disabled_debug_options.set_xla_gpu_enable_dynamic_slice_fusion(false); 
triton_disabled_debug_options.set_xla_gpu_enable_triton_gemm(false); config.set_debug_options(triton_disabled_debug_options); TF_ASSERT_OK_AND_ASSIGN(module, diff --git a/xla/xla.proto b/xla/xla.proto index 35e795b1680df..c5e81a932147e 100644 --- a/xla/xla.proto +++ b/xla/xla.proto @@ -476,7 +476,7 @@ message DebugOptions { // Enables address computation fusion to optimize dynamic-slice and // dynamic-update-slice operations around library calls. - bool xla_gpu_enable_address_computation_fusion = 105; + bool xla_gpu_enable_dynamic_slice_fusion = 105; reserved 233; // was xla_gpu_enable_gpu2_runtime reserved 234; // was xla_gpu_enable_gpu2_hal