diff --git a/third_party/xla/xla/client/executable_build_options.cc b/third_party/xla/xla/client/executable_build_options.cc index 14ed0d45665e7f..68a7bd2dc90ea7 100644 --- a/third_party/xla/xla/client/executable_build_options.cc +++ b/third_party/xla/xla/client/executable_build_options.cc @@ -48,16 +48,6 @@ se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const { return device_allocator_; } -ExecutableBuildOptions& ExecutableBuildOptions::set_compute_stream( - se::Stream* stream) { - compute_stream_ = stream; - return *this; -} - -se::Stream* ExecutableBuildOptions::compute_stream() const { - return compute_stream_; -} - ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( int device_ordinal) { CHECK_GE(device_ordinal, 0); diff --git a/third_party/xla/xla/client/executable_build_options.h b/third_party/xla/xla/client/executable_build_options.h index 45a6d39143cd4f..e73d9d763102c6 100644 --- a/third_party/xla/xla/client/executable_build_options.h +++ b/third_party/xla/xla/client/executable_build_options.h @@ -40,7 +40,6 @@ namespace stream_executor { // Forward-declared to avoid StreamExecutor dependency. class DeviceMemoryAllocator; -class Stream; } // namespace stream_executor @@ -92,10 +91,6 @@ class ExecutableBuildOptions { se::DeviceMemoryAllocator* allocator); se::DeviceMemoryAllocator* device_allocator() const; - // If set, this specifies a stream that can be used for autotuning. - ExecutableBuildOptions& set_compute_stream(se::Stream* stream); - se::Stream* compute_stream() const; - // The number of replicas of this computation that are to be executed. // Defaults to 1. int num_replicas() const { return num_replicas_; } @@ -292,7 +287,6 @@ class ExecutableBuildOptions { std::optional comp_envs_; std::optional debug_options_; se::DeviceMemoryAllocator* device_allocator_ = nullptr; - se::Stream* compute_stream_ = nullptr; int num_replicas_ = 1; int num_partitions_ = 1; bool use_spmd_partitioning_ = false; diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc index cd8abbf3dd91b0..672d76adb125bb 100644 --- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc +++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc @@ -3482,8 +3482,6 @@ PjRtStreamExecutorClient::GetExecutableExtras(CompileOptions* options) { build_options.set_device_ordinal( addressable_devices.front()->local_hardware_id().value()); } - build_options.set_compute_stream( - device_state(build_options.device_ordinal()).compute_stream()); } return extras; } diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD index ee9a60acb5d5c3..1bfbe9bca51b36 100644 --- a/third_party/xla/xla/service/BUILD +++ b/third_party/xla/xla/service/BUILD @@ -1457,7 +1457,6 @@ cc_library( "//xla/hlo/ir:hlo_module_group", "//xla/pjrt/distributed:key_value_store_interface", "//xla/stream_executor:dnn", - "//xla/stream_executor:stream", "//xla/stream_executor:stream_executor_h", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status:statusor", diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h index 9cf435418bcd16..45dc7298c4e8d4 100644 --- a/third_party/xla/xla/service/compiler.h +++ b/third_party/xla/xla/service/compiler.h @@ -40,7 +40,6 @@ limitations under the License. #include "xla/service/executable.h" #include "xla/service/hlo_module_config.h" #include "xla/service/metrics_hook_interface.h" -#include "xla/stream_executor/stream.h" #include "xla/stream_executor/stream_executor.h" #include "tsl/platform/protobuf.h" #include "tsl/platform/threadpool.h" @@ -159,10 +158,6 @@ class Compiler { std::optional target_config; MultiProcessKeyValueStore key_value_store; - - // If compute_stream is set, this is the stream used for all autotuning - // during compilation. - se::Stream* compute_stream = nullptr; }; virtual ~Compiler() = default; diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD index 13e79c8c8af1e3..2e21ea0b3cd02c 100644 --- a/third_party/xla/xla/service/gpu/autotuning/BUILD +++ b/third_party/xla/xla/service/gpu/autotuning/BUILD @@ -145,7 +145,6 @@ xla_test( "//xla/stream_executor:device_description", "//xla/stream_executor:device_description_proto_cc", "//xla/stream_executor:semantic_version", - "//xla/stream_executor:stream", "//xla/stream_executor:stream_executor_h", "//xla/tests:filecheck", "//xla/tests:hlo_test_base", @@ -303,8 +302,8 @@ xla_test( "//xla/hlo/ir:hlo", "//xla/service:platform_util", "//xla/stream_executor:platform", - "//xla/stream_executor:stream", "//xla/tests:hlo_test_base", + "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/platform:statusor", @@ -332,7 +331,6 @@ xla_test( "//xla/stream_executor:device_description", "//xla/stream_executor:platform", "//xla/stream_executor:semantic_version", - "//xla/stream_executor:stream", "//xla/tests:hlo_test_base", "//xla/tsl/lib/core:status_test_util", "//xla/tsl/protobuf:dnn_proto_cc", @@ -439,7 +437,6 @@ xla_test( "//xla/stream_executor:device_description", "//xla/stream_executor:dnn", "//xla/stream_executor:platform", - "//xla/stream_executor:stream", "//xla/tests:hlo_test_base", "//xla/tsl/lib/core:status_test_util", "@com_google_absl//absl/strings:string_view", @@ -500,12 +497,10 @@ xla_test( "//xla:xla_proto_cc", "//xla/hlo/ir:hlo", "//xla/hlo/pass:hlo_pass_pipeline", - "//xla/stream_executor:stream", "//xla/tests:hlo_test_base", "//xla/tests:xla_internal_test_main", # fixdeps: keep "@com_google_googletest//:gtest", "@local_tsl//tsl/platform:path", - "@local_tsl//tsl/platform:statusor", "@local_tsl//tsl/platform:test", ], ) diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc index 28489d63573e6c..a8b959482ebba0 100644 --- a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc +++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc @@ -15,7 +15,6 @@ limitations under the License. #include "xla/service/gpu/autotuning/autotuner_compile_util.h" -#include #include #include @@ -24,7 +23,6 @@ limitations under the License. #include "xla/service/gpu/autotuning/autotuner_util.h" #include "xla/service/platform_util.h" #include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream.h" #include "xla/tests/hlo_test_base.h" #include "tsl/platform/statusor.h" @@ -49,12 +47,9 @@ ENTRY main { se::Platform* platform = PlatformUtil::GetDefaultPlatform().value(); TF_ASSERT_OK_AND_ASSIGN(std::vector executors, PlatformUtil::GetStreamExecutors(platform)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - executors.at(0)->CreateStream()); - AutotuneConfig autotune_config{ - DeviceConfig{executors.at(0), nullptr, stream.get()}, - GetDebugOptionsForTest()}; + AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr}, + GetDebugOptionsForTest()}; auto& root = *module->entry_computation()->root_instruction(); @@ -106,11 +101,8 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(std::vector executors, PlatformUtil::GetStreamExecutors(platform)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - executors.at(0)->CreateStream()); - AutotuneConfig autotune_config{ - DeviceConfig{executors.at(0), nullptr, stream.get()}, - GetDebugOptionsForTest()}; + AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr}, + GetDebugOptionsForTest()}; auto& root = *module->entry_computation()->root_instruction(); @@ -162,11 +154,8 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(std::vector executors, PlatformUtil::GetStreamExecutors(platform)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - executors.at(0)->CreateStream()); - AutotuneConfig autotune_config{ - DeviceConfig{executors.at(0), nullptr, stream.get()}, - GetDebugOptionsForTest()}; + AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr}, + GetDebugOptionsForTest()}; auto& root = *module->entry_computation()->root_instruction(); diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h b/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h index 1fc28b8c1db719..48bb3e3b291442 100644 --- a/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h +++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h @@ -50,8 +50,6 @@ struct DeviceConfig { // memory while timing the various convolution algorithms. If it's null, // we'll use the default allocator on the StreamExecutor. se::DeviceMemoryAllocator* allocator = nullptr; // may be null - - se::Stream* compute_stream = nullptr; }; struct DevicelessConfig { @@ -179,8 +177,7 @@ class AutotuneConfig { absl::StatusOr GetStream() const { CHECK(std::holds_alternative(config_)); - se::Stream* stream = std::get(config_).compute_stream; - return stream; + return GetAllocator()->GetStream(GetExecutor()->device_ordinal()); } const se::GpuComputeCapability& GetGpuComputeCapability() const { diff --git a/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc index ea31846a6fbcc8..aaa502f600f90e 100644 --- a/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc +++ b/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include "xla/service/gpu/autotuning/conv_algorithm_picker.h" #include -#include #include #include @@ -36,7 +35,6 @@ limitations under the License. #include "xla/stream_executor/device_description.h" #include "xla/stream_executor/dnn.h" #include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream.h" #include "xla/tests/hlo_test_base.h" #include "xla/tsl/lib/core/status_test_util.h" #include "xla/xla.pb.h" @@ -80,8 +78,6 @@ ENTRY main { PlatformUtil::GetStreamExecutors(platform)); ASSERT_GT(executors.size(), 0); se::StreamExecutor* stream_exec = executors[0]; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec->CreateStream()); const se::GpuComputeCapability& cc = backend() .default_stream_executor() @@ -92,7 +88,7 @@ ENTRY main { changed = false; DebugOptions opts = DefaultDebugOptionsIgnoringFlags(); - AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts}; TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(GpuConvAlgorithmPicker(cfg), m.get())); ASSERT_TRUE(changed); @@ -204,9 +200,7 @@ ENTRY main { ASSERT_TRUE(changed); DebugOptions opts = DefaultDebugOptionsIgnoringFlags(); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec->CreateStream()); - AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts}; TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(GpuConvAlgorithmPicker(cfg), m.get())); ASSERT_TRUE(changed); diff --git a/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc index 44a08788b49451..3a214ffeb56d5a 100644 --- a/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc +++ b/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc @@ -23,10 +23,8 @@ limitations under the License. #include "xla/hlo/ir/hlo_module.h" #include "xla/hlo/pass/hlo_pass_pipeline.h" #include "xla/service/gpu/autotuning/autotuner_util.h" -#include "xla/stream_executor/stream.h" #include "xla/tests/hlo_test_base.h" #include "xla/xla.pb.h" -#include "tsl/platform/statusor.h" #include "tsl/platform/test.h" namespace xla { @@ -68,12 +66,9 @@ TEST_F(CustomKernelFusionAutotunerTest, DontRunOnNonCustomFusions) { HloPassPipeline pipeline("custom_kernel_fusion_autotuner"); DebugOptions debug_options; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - backend().default_stream_executor()->CreateStream()); - AutotuneConfig autotune_config = AutotuneConfig{DeviceConfig{backend().default_stream_executor(), - backend().memory_allocator(), stream.get()}, + backend().memory_allocator()}, debug_options}; pipeline.AddPass(autotune_config); @@ -105,11 +100,9 @@ TEST_F(CustomKernelFusionAutotunerTest, HloPassPipeline pipeline("custom_kernel_fusion_autotuner"); DebugOptions debug_options; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - backend().default_stream_executor()->CreateStream()); AutotuneConfig autotune_config = AutotuneConfig{DeviceConfig{backend().default_stream_executor(), - backend().memory_allocator(), stream.get()}, + backend().memory_allocator()}, debug_options}; pipeline.AddPass(autotune_config); ASSERT_TRUE(pipeline.Run(hlo_module.get()).ok()); @@ -138,11 +131,9 @@ TEST_F(CustomKernelFusionAutotunerTest, HloPassPipeline pipeline("custom_kernel_fusion_autotuner"); DebugOptions debug_options; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - backend().default_stream_executor()->CreateStream()); AutotuneConfig autotune_config = AutotuneConfig{DeviceConfig{backend().default_stream_executor(), - backend().memory_allocator(), stream.get()}, + backend().memory_allocator()}, debug_options}; pipeline.AddPass(autotune_config); diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc index cca62d35976c96..6526e3338fb6c5 100644 --- a/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc +++ b/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc @@ -17,7 +17,6 @@ limitations under the License. #include #include -#include #include #include @@ -34,7 +33,6 @@ limitations under the License. #include "xla/stream_executor/device_description.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/semantic_version.h" -#include "xla/stream_executor/stream.h" #include "xla/tests/hlo_test_base.h" #include "xla/tsl/lib/core/status_test_util.h" #include "xla/tsl/protobuf/dnn.pb.h" @@ -137,10 +135,7 @@ ENTRY main { /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}), module.get())); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec()->CreateStream()); - AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, - debug_opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts}; GemmAlgorithmPicker gpicker(cfg); // Note that, we do not care if the algorithm index has been changed: // the thing matters is the # of algorithms left after sorting out. @@ -180,10 +175,7 @@ ENTRY main { /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}), module.get())); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec()->CreateStream()); - AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, - debug_opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts}; GemmAlgorithmPicker gpicker(cfg); TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(gpicker, module.get())); num_left2 = gpicker.num_algorithms_left(); @@ -216,9 +208,7 @@ ENTRY main { m.get())); changed = false; DebugOptions opts; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec()->CreateStream()); - AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts}; TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(GemmAlgorithmPicker(cfg), m.get())); ASSERT_TRUE(changed); @@ -283,9 +273,7 @@ ENTRY main { changed = false; DebugOptions opts; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - stream_exec()->CreateStream()); - AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts}; + AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts}; TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(GemmAlgorithmPicker(cfg), m.get())); diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc index 47c25a26a574cb..a7b79d36e549e2 100644 --- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc +++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc @@ -53,7 +53,6 @@ limitations under the License. #include "xla/stream_executor/device_description.h" #include "xla/stream_executor/device_description.pb.h" #include "xla/stream_executor/semantic_version.h" -#include "xla/stream_executor/stream.h" #include "xla/stream_executor/stream_executor.h" #include "xla/tests/filecheck.h" #include "xla/tests/hlo_test_base.h" @@ -193,10 +192,8 @@ class StatelessAutotunerTest : public HloTestBase { ccc->set_major(compute_capability.major); ccc->set_minor(compute_capability.minor); - static se::Stream* stream = - backend().default_stream_executor()->CreateStream().value().release(); DeviceConfig test_config{backend().default_stream_executor(), - backend().memory_allocator(), stream}; + backend().memory_allocator()}; AutotuneConfig autotune_config{test_config, debug_options}; GemmFusionAutotunerImpl autotuner(autotune_config, toolkit_version, debug_options, nullptr); @@ -213,12 +210,8 @@ class StatelessAutotunerTest : public HloTestBase { // Returns the config for the current device. absl::StatusOr> GetPossibleMatmulAutotuneConfigs(const HloModule& module) { - static se::Stream* stream = - backend().default_stream_executor()->CreateStream().value().release(); - DeviceConfig device_config{backend().default_stream_executor(), backend().memory_allocator()}; - device_config.compute_stream = stream; AutotuneConfig autotune_config{device_config, GetDebugOptionsForTest()}; GemmFusionAutotunerImpl autotuner(autotune_config, GetToolkitVersion(), GetDebugOptionsForTest(), nullptr); @@ -324,14 +317,11 @@ class GemmFusionAutotunerTest : public StatelessAutotunerTest { tsl::port::MaxParallelism()); DebugOptions opts; MultiProcessKeyValueStore key_value_store; - static se::Stream* stream = - backend().default_stream_executor()->CreateStream().value().release(); - DeviceConfig device_config{backend().default_stream_executor(), - backend().memory_allocator()}; - device_config.compute_stream = stream; - pipeline.AddPass(AutotuneConfig{device_config, opts}, - GetToolkitVersion(), &thread_pool, - key_value_store); + pipeline.AddPass( + AutotuneConfig{DeviceConfig{backend().default_stream_executor(), + backend().memory_allocator()}, + opts}, + GetToolkitVersion(), &thread_pool, key_value_store); RunAndFilecheckHloRewrite( hlo, std::move(pipeline), expected, [](const HloModule* m) { @@ -713,12 +703,9 @@ ENTRY main { ParseAndReturnVerifiedModule(kHloText)); DebugOptions opts; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - backend().default_stream_executor()->CreateStream()); - AutotuneConfig autotune_config{ DeviceConfig{backend().default_stream_executor(), - backend().memory_allocator(), stream.get()}, + backend().memory_allocator()}, opts}; AutotuneCacheKey cache_key(autotune_config.GetModelStr(), *module->entry_computation()->root_instruction()); @@ -1267,12 +1254,11 @@ TEST_F(GemmFusionAutotunerTest, RewritesGemmFusionToCustomKernelFusion) { std::unique_ptr module = ParseAndReturnVerifiedModule(kHlo).value(); - static se::Stream* stream = - backend().default_stream_executor()->CreateStream().value().release(); DebugOptions opts; - DeviceConfig device_config{backend().default_stream_executor(), - backend().memory_allocator(), stream}; - AutotuneConfig autotune_config{device_config, opts}; + AutotuneConfig autotune_config{ + DeviceConfig{backend().default_stream_executor(), + backend().memory_allocator()}, + opts}; AutotuneCacheKey cache_key(autotune_config.GetModelStr(), *module->entry_computation()->root_instruction()); TF_ASSERT_OK_AND_ASSIGN(AutotuneResults autotune_results_override, diff --git a/third_party/xla/xla/service/gpu/determinism_test.cc b/third_party/xla/xla/service/gpu/determinism_test.cc index 47ec3460ee3cb3..19a3d1390fff1d 100644 --- a/third_party/xla/xla/service/gpu/determinism_test.cc +++ b/third_party/xla/xla/service/gpu/determinism_test.cc @@ -111,9 +111,6 @@ class DeterminismTest : public GpuCodegenTest { TF_ASSERT_OK_AND_ASSIGN(stream_executor::Platform * default_platform, PlatformUtil::GetDefaultPlatform()); stream_executor::gpu::MockGpuExecutor executor(default_platform, 0); - EXPECT_CALL(executor, CreateStream).WillRepeatedly([&]() { - return backend().default_stream_executor()->CreateStream(); - }); EXPECT_CALL(executor, CreateEventBasedTimer).Times(0); EXPECT_CALL(executor, GetDeviceDescription) .WillRepeatedly([this]() -> const se::DeviceDescription& { diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc index 1e52b4bcbf28ba..83b332a1ce4ffe 100755 --- a/third_party/xla/xla/service/gpu/gpu_compiler.cc +++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc @@ -307,6 +307,18 @@ MaybeOwningThreadPool CreateMaybeOwningThreadPool( } } +absl::StatusOr GetAutotuneConfig( + se::StreamExecutor* stream_exec, const DebugOptions& debug_options, + const GpuCompiler::CompileOptions& options, + const Compiler::TargetConfig& gpu_target_config) { + if (stream_exec) { + return AutotuneConfig{DeviceConfig{stream_exec, options.device_allocator}, + debug_options}; + } + return AutotuneConfig{DevicelessConfig{gpu_target_config.device_description}, + debug_options}; +} + se::GpuComputeCapability GetGpuVersion(const se::StreamExecutor* stream_exec) { return stream_exec->GetDeviceDescription().gpu_compute_capability(); } @@ -464,24 +476,6 @@ GpuCompiler::GpuCompiler(se::Platform::Id platform_id, pointer_size_(llvm::DataLayout(data_layout) .getPointerSize(0 /* default address space */)) {} -absl::StatusOr GpuCompiler::GetAutotuneConfig( - se::StreamExecutor* stream_exec, const DebugOptions& debug_options, - const GpuCompiler::CompileOptions& options, - const Compiler::TargetConfig& gpu_target_config) { - if (stream_exec) { - if ((options.compute_stream == nullptr) && (compute_stream_ == nullptr)) { - TF_ASSIGN_OR_RETURN(compute_stream_, stream_exec->CreateStream()); - } - return AutotuneConfig{ - DeviceConfig{stream_exec, options.device_allocator, - options.compute_stream != nullptr ? options.compute_stream - : compute_stream_.get()}, - debug_options}; - } - return AutotuneConfig{DevicelessConfig{gpu_target_config.device_description}, - debug_options}; -} - namespace { // Adds the HloVerifier for GPU to the given pipeline. void AddHloVerifier(HloPassPipeline* pipeline, @@ -1201,6 +1195,26 @@ absl::Status RunPostFusionSimplificationPasses( return pipeline.Run(hlo_module).status(); } +absl::Status RunPostFusionVerificationPasses( + HloModule* hlo_module, se::StreamExecutor* stream_exec, + const GpuCompiler::CompileOptions& options, + const Compiler::TargetConfig& gpu_target_config) { + HloPassPipeline pipeline("post-fusion-verification-pipeline optimization"); + + if (hlo_module->config() + .debug_options() + .xla_gpu_verify_triton_fusion_numerics()) { + TF_ASSIGN_OR_RETURN( + AutotuneConfig autotune_config, + GetAutotuneConfig(stream_exec, hlo_module->config().debug_options(), + options, gpu_target_config)); + + pipeline.AddPass(autotune_config); + } + + return pipeline.Run(hlo_module).status(); +} + absl::Status RunLayoutNormalizationPasses( HloModule* hlo_module, const se::GpuComputeCapability& gpu_version) { HloPassPipeline layout_normalization_pipeline("layout normalization"); @@ -1279,26 +1293,6 @@ absl::Status GpuCompiler::RunCollectiveScheduleLinearizerPasses( return pipeline.Run(hlo_module).status(); } -absl::Status GpuCompiler::RunPostFusionVerificationPasses( - HloModule* hlo_module, se::StreamExecutor* stream_exec, - const GpuCompiler::CompileOptions& options, - const Compiler::TargetConfig& gpu_target_config) { - HloPassPipeline pipeline("post-fusion-verification-pipeline optimization"); - - if (hlo_module->config() - .debug_options() - .xla_gpu_verify_triton_fusion_numerics()) { - TF_ASSIGN_OR_RETURN( - AutotuneConfig autotune_config, - GetAutotuneConfig(stream_exec, hlo_module->config().debug_options(), - options, gpu_target_config)); - - pipeline.AddPass(autotune_config); - } - - return pipeline.Run(hlo_module).status(); -} - // Runs optimization passes on the given HLO module. absl::Status GpuCompiler::OptimizeHloModule( HloModule* hlo_module, se::StreamExecutor* stream_exec, diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h index 85e75a46390256..b8fe422dbe1fac 100644 --- a/third_party/xla/xla/service/gpu/gpu_compiler.h +++ b/third_party/xla/xla/service/gpu/gpu_compiler.h @@ -245,18 +245,6 @@ class GpuCompiler : public LLVMCompiler { return Unimplemented("LinkModules is not implemented."); } - // Creates an AutotuneConfig for the given options. - absl::StatusOr GetAutotuneConfig( - se::StreamExecutor* stream_exec, const DebugOptions& debug_options, - const GpuCompiler::CompileOptions& options, - const Compiler::TargetConfig& gpu_target_config); - - // Runs verification passes after fusion. - absl::Status RunPostFusionVerificationPasses( - HloModule* hlo_module, se::StreamExecutor* stream_exec, - const GpuCompiler::CompileOptions& options, - const Compiler::TargetConfig& gpu_target_config); - se::Platform::Id platform_id_; // The triple that represents our target. @@ -268,9 +256,6 @@ class GpuCompiler : public LLVMCompiler { // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. const int64_t pointer_size_; - // A stream to use for autotuning if none is provided. - std::unique_ptr compute_stream_; - GpuCompiler(const GpuCompiler&) = delete; GpuCompiler& operator=(const GpuCompiler&) = delete; }; diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD index 7e2eabfb92b41c..3c2d885e0b13b1 100644 --- a/third_party/xla/xla/service/gpu/transforms/BUILD +++ b/third_party/xla/xla/service/gpu/transforms/BUILD @@ -3223,6 +3223,7 @@ xla_test( deps = [ ":triton_fusion_numerics_verifier", "//xla:shape_util", + "//xla:test_helpers", "//xla:xla_data_proto_cc", "//xla:xla_proto_cc", "//xla/hlo/ir:hlo", @@ -3231,7 +3232,6 @@ xla_test( "//xla/service/gpu/autotuning:autotuner_compile_util", "//xla/service/gpu/autotuning:autotuner_util", "//xla/stream_executor:platform", - "//xla/stream_executor:stream", "//xla/tests:hlo_test_base", "//xla/tsl/lib/core:status_test_util", "@com_google_absl//absl/status", @@ -3239,7 +3239,6 @@ xla_test( "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/platform:status_matchers", - "@local_tsl//tsl/platform:statusor", ], ) diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc index ea6fd9cbb25357..2762166278cdbf 100644 --- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc +++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc @@ -32,13 +32,11 @@ limitations under the License. #include "xla/service/gpu/autotuning/autotuner_util.h" #include "xla/service/platform_util.h" #include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream.h" #include "xla/tests/hlo_test_base.h" #include "xla/tsl/lib/core/status_test_util.h" #include "xla/xla.pb.h" #include "xla/xla_data.pb.h" #include "tsl/platform/status_matchers.h" -#include "tsl/platform/statusor.h" namespace xla::gpu { namespace { @@ -81,9 +79,7 @@ class TritonFusionNumericsVerifierTest se::Platform* platform = PlatformUtil::GetDefaultPlatform().value(); auto executors_or = PlatformUtil::GetStreamExecutors(platform); TF_EXPECT_OK(executors_or); - static se::Stream* stream = - executors_or->at(0)->CreateStream().value().release(); - return AutotuneConfig{DeviceConfig{executors_or->at(0), nullptr, stream}, + return AutotuneConfig{DeviceConfig{executors_or->at(0), nullptr}, GetDebugOptionsForTest()}; } @@ -298,11 +294,8 @@ ENTRY main { std::unique_ptr module = *ParseAndReturnVerifiedModule(hlo_text, GetModuleConfigForTest()); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr stream, - backend().default_stream_executor()->CreateStream()); AutotuneConfig autotune_config{ - DeviceConfig{backend().default_stream_executor(), GetAllocator(), - stream.get()}, + DeviceConfig{backend().default_stream_executor(), GetAllocator()}, module->config().debug_options()}; TritonFusionNumericsVerifier verifier(autotune_config); TF_EXPECT_OK(RunHloPass(verifier, module.get())); diff --git a/third_party/xla/xla/service/local_service.cc b/third_party/xla/xla/service/local_service.cc index d4c175a31effb2..557fb216582695 100644 --- a/third_party/xla/xla/service/local_service.cc +++ b/third_party/xla/xla/service/local_service.cc @@ -94,8 +94,7 @@ LocalService::CompileExecutables( false, {}, {build_options.key_value_store(), build_options.process_index(), - build_options.process_count()}, - build_options.compute_stream()}; + build_options.process_count()}}; if (build_options.num_partitions() == 1) { TF_ASSIGN_OR_RETURN( std::unique_ptr executable, @@ -140,12 +139,12 @@ LocalService::CompileAotResults( // cores per module, but otherwise only uses the first executor. std::vector executors(build_options.num_partitions(), executor); - Compiler::CompileOptions compile_options{build_options.device_allocator(), - build_options.compile_thread_pool()}; - compile_options.compute_stream = build_options.compute_stream(); + return BuildAotResults( /*module_protos=*/{&computation.proto()}, std::move(module_configs), - execute_backend_.get(), {executors}, compile_options, + execute_backend_.get(), {executors}, + Compiler::CompileOptions{build_options.device_allocator(), + build_options.compile_thread_pool()}, build_options.run_backend_only()); }