Reverts 58dc89e

PiperOrigin-RevId: 689884643
ROCm · Oct 25, 2024 · 2749345
1 parent 7f37924
commit 2749345
Show file tree

Hide file tree

Showing 18 changed files with 68 additions and 185 deletions.
diff --git a/third_party/xla/xla/client/executable_build_options.cc b/third_party/xla/xla/client/executable_build_options.cc
@@ -48,16 +48,6 @@ se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_compute_stream(
-    se::Stream* stream) {
-  compute_stream_ = stream;
-  return *this;
-}
-
-se::Stream* ExecutableBuildOptions::compute_stream() const {
-  return compute_stream_;
-}
-
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
     int device_ordinal) {
   CHECK_GE(device_ordinal, 0);

diff --git a/third_party/xla/xla/client/executable_build_options.h b/third_party/xla/xla/client/executable_build_options.h
@@ -40,7 +40,6 @@ namespace stream_executor {
 
 // Forward-declared to avoid StreamExecutor dependency.
 class DeviceMemoryAllocator;
-class Stream;
 
 }  // namespace stream_executor
 
@@ -92,10 +91,6 @@ class ExecutableBuildOptions {
       se::DeviceMemoryAllocator* allocator);
   se::DeviceMemoryAllocator* device_allocator() const;
 
-  // If set, this specifies a stream that can be used for autotuning.
-  ExecutableBuildOptions& set_compute_stream(se::Stream* stream);
-  se::Stream* compute_stream() const;
-
   // The number of replicas of this computation that are to be executed.
   // Defaults to 1.
   int num_replicas() const { return num_replicas_; }
@@ -292,7 +287,6 @@ class ExecutableBuildOptions {
   std::optional<CompilationEnvironments> comp_envs_;
   std::optional<DebugOptions> debug_options_;
   se::DeviceMemoryAllocator* device_allocator_ = nullptr;
-  se::Stream* compute_stream_ = nullptr;
   int num_replicas_ = 1;
   int num_partitions_ = 1;
   bool use_spmd_partitioning_ = false;

diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -3482,8 +3482,6 @@ PjRtStreamExecutorClient::GetExecutableExtras(CompileOptions* options) {
       build_options.set_device_ordinal(
           addressable_devices.front()->local_hardware_id().value());
     }
-    build_options.set_compute_stream(
-        device_state(build_options.device_ordinal()).compute_stream());
   }
   return extras;
 }

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
@@ -1457,7 +1457,6 @@ cc_library(
         "//xla/hlo/ir:hlo_module_group",
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status:statusor",

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/metrics_hook_interface.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/protobuf.h"
 #include "tsl/platform/threadpool.h"
@@ -159,10 +158,6 @@ class Compiler {
     std::optional<TargetConfig> target_config;
 
     MultiProcessKeyValueStore key_value_store;
-
-    // If compute_stream is set, this is the stream used for all autotuning
-    // during compilation.
-    se::Stream* compute_stream = nullptr;
   };
 
   virtual ~Compiler() = default;

diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -145,7 +145,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor:semantic_version",
-        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tests:filecheck",
         "//xla/tests:hlo_test_base",
@@ -303,8 +302,8 @@ xla_test(
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:statusor",
@@ -332,7 +331,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:semantic_version",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/protobuf:dnn_proto_cc",
@@ -439,7 +437,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/lib/core:status_test_util",
         "@com_google_absl//absl/strings:string_view",
@@ -500,12 +497,10 @@ xla_test(
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test",
     ],
 )

diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/service/gpu/autotuning/autotuner_compile_util.h"
 
-#include <memory>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -24,7 +23,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/autotuner_util.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/statusor.h"
 
@@ -49,12 +47,9 @@ ENTRY main {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
 
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 
@@ -106,11 +101,8 @@ ENTRY main {
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 
@@ -162,11 +154,8 @@ ENTRY main {
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 

diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h b/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h
@@ -50,8 +50,6 @@ struct DeviceConfig {
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
   se::DeviceMemoryAllocator* allocator = nullptr;  // may be null
-
-  se::Stream* compute_stream = nullptr;
 };
 
 struct DevicelessConfig {
@@ -179,8 +177,7 @@ class AutotuneConfig {
 
   absl::StatusOr<se::Stream*> GetStream() const {
     CHECK(std::holds_alternative<DeviceConfig>(config_));
-    se::Stream* stream = std::get<DeviceConfig>(config_).compute_stream;
-    return stream;
+    return GetAllocator()->GetStream(GetExecutor()->device_ordinal());
   }
 
   const se::GpuComputeCapability& GetGpuComputeCapability() const {

diff --git a/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/conv_algorithm_picker.h"
 
 #include <cstdint>
-#include <memory>
 #include <variant>
 #include <vector>
 
@@ -36,7 +35,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/xla.pb.h"
@@ -80,8 +78,6 @@ ENTRY main {
                           PlatformUtil::GetStreamExecutors(platform));
   ASSERT_GT(executors.size(), 0);
   se::StreamExecutor* stream_exec = executors[0];
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec->CreateStream());
 
   const se::GpuComputeCapability& cc = backend()
                                            .default_stream_executor()
@@ -92,7 +88,7 @@ ENTRY main {
   changed = false;
   DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
 
-  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);
@@ -204,9 +200,7 @@ ENTRY main {
   ASSERT_TRUE(changed);
 
   DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);

diff --git a/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc
@@ -23,10 +23,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/service/gpu/autotuning/autotuner_util.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/xla.pb.h"
-#include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
 namespace xla {
@@ -68,12 +66,9 @@ TEST_F(CustomKernelFusionAutotunerTest, DontRunOnNonCustomFusions) {
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
-
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
 
@@ -105,11 +100,9 @@ TEST_F(CustomKernelFusionAutotunerTest,
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
   ASSERT_TRUE(pipeline.Run(hlo_module.get()).ok());
@@ -138,11 +131,9 @@ TEST_F(CustomKernelFusionAutotunerTest,
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
 

diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
-#include <memory>
 #include <string>
 #include <variant>
 
@@ -34,7 +33,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/semantic_version.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/protobuf/dnn.pb.h"
@@ -137,10 +135,7 @@ ENTRY main {
                 /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
             module.get()));
 
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                            stream_exec()->CreateStream());
-    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
-                       debug_opts};
+    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
     GemmAlgorithmPicker gpicker(cfg);
     // Note that, we do not care if the algorithm index has been changed:
     // the thing matters is the # of algorithms left after sorting out.
@@ -180,10 +175,7 @@ ENTRY main {
                 /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
             module.get()));
 
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                            stream_exec()->CreateStream());
-    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
-                       debug_opts};
+    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
     GemmAlgorithmPicker gpicker(cfg);
     TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(gpicker, module.get()));
     num_left2 = gpicker.num_algorithms_left();
@@ -216,9 +208,7 @@ ENTRY main {
           m.get()));
   changed = false;
   DebugOptions opts;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec()->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GemmAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);
@@ -283,9 +273,7 @@ ENTRY main {
   changed = false;
 
   DebugOptions opts;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec()->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
 
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GemmAlgorithmPicker(cfg), m.get()));