From 2749345bcb0bf1f0b40db52575271eecc4087184 Mon Sep 17 00:00:00 2001
From: Kyle Lucke <klucke@google.com>
Date: Fri, 25 Oct 2024 12:59:42 -0700
Subject: [PATCH] Reverts 58dc89eb749220752e8a6cfb970e5bcd1552d05c

PiperOrigin-RevId: 689884643
---
 .../xla/client/executable_build_options.cc    | 10 ---
 .../xla/xla/client/executable_build_options.h |  6 --
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  2 -
 third_party/xla/xla/service/BUILD             |  1 -
 third_party/xla/xla/service/compiler.h        |  5 --
 .../xla/xla/service/gpu/autotuning/BUILD      |  7 +-
 .../autotuning/autotuner_compile_util_test.cc | 23 ++----
 .../service/gpu/autotuning/autotuner_util.h   |  5 +-
 .../autotuning/conv_algorithm_picker_test.cc  | 10 +--
 .../custom_kernel_fusion_autotuner_test.cc    | 15 +---
 .../autotuning/gemm_algorithm_picker_test.cc  | 20 ++----
 .../autotuning/gemm_fusion_autotuner_test.cc  | 36 +++-------
 .../xla/xla/service/gpu/determinism_test.cc   |  3 -
 .../xla/xla/service/gpu/gpu_compiler.cc       | 70 +++++++++----------
 .../xla/xla/service/gpu/gpu_compiler.h        | 15 ----
 .../xla/xla/service/gpu/transforms/BUILD      |  3 +-
 .../triton_fusion_numerics_verifier_test.cc   | 11 +--
 third_party/xla/xla/service/local_service.cc  | 11 ++-
 18 files changed, 68 insertions(+), 185 deletions(-)
diff --git a/third_party/xla/xla/client/executable_build_options.cc b/third_party/xla/xla/client/executable_build_options.cc
index 14ed0d45665e7f..68a7bd2dc90ea7 100644
--- a/third_party/xla/xla/client/executable_build_options.cc
+++ b/third_party/xla/xla/client/executable_build_options.cc
@@ -48,16 +48,6 @@ se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_compute_stream(
-    se::Stream* stream) {
-  compute_stream_ = stream;
-  return *this;
-}
-
-se::Stream* ExecutableBuildOptions::compute_stream() const {
-  return compute_stream_;
-}
-
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
     int device_ordinal) {
   CHECK_GE(device_ordinal, 0);
diff --git a/third_party/xla/xla/client/executable_build_options.h b/third_party/xla/xla/client/executable_build_options.h
index 45a6d39143cd4f..e73d9d763102c6 100644
--- a/third_party/xla/xla/client/executable_build_options.h
+++ b/third_party/xla/xla/client/executable_build_options.h
@@ -40,7 +40,6 @@ namespace stream_executor {
 
 // Forward-declared to avoid StreamExecutor dependency.
 class DeviceMemoryAllocator;
-class Stream;
 
 }  // namespace stream_executor
 
@@ -92,10 +91,6 @@ class ExecutableBuildOptions {
       se::DeviceMemoryAllocator* allocator);
   se::DeviceMemoryAllocator* device_allocator() const;
 
-  // If set, this specifies a stream that can be used for autotuning.
-  ExecutableBuildOptions& set_compute_stream(se::Stream* stream);
-  se::Stream* compute_stream() const;
-
   // The number of replicas of this computation that are to be executed.
   // Defaults to 1.
   int num_replicas() const { return num_replicas_; }
@@ -292,7 +287,6 @@ class ExecutableBuildOptions {
   std::optional<CompilationEnvironments> comp_envs_;
   std::optional<DebugOptions> debug_options_;
   se::DeviceMemoryAllocator* device_allocator_ = nullptr;
-  se::Stream* compute_stream_ = nullptr;
   int num_replicas_ = 1;
   int num_partitions_ = 1;
   bool use_spmd_partitioning_ = false;
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index cd8abbf3dd91b0..672d76adb125bb 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -3482,8 +3482,6 @@ PjRtStreamExecutorClient::GetExecutableExtras(CompileOptions* options) {
       build_options.set_device_ordinal(
           addressable_devices.front()->local_hardware_id().value());
     }
-    build_options.set_compute_stream(
-        device_state(build_options.device_ordinal()).compute_stream());
   }
   return extras;
 }
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index ee9a60acb5d5c3..1bfbe9bca51b36 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1457,7 +1457,6 @@ cc_library(
         "//xla/hlo/ir:hlo_module_group",
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index 9cf435418bcd16..45dc7298c4e8d4 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/metrics_hook_interface.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/protobuf.h"
 #include "tsl/platform/threadpool.h"
@@ -159,10 +158,6 @@ class Compiler {
     std::optional<TargetConfig> target_config;
 
     MultiProcessKeyValueStore key_value_store;
-
-    // If compute_stream is set, this is the stream used for all autotuning
-    // during compilation.
-    se::Stream* compute_stream = nullptr;
   };
 
   virtual ~Compiler() = default;
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 13e79c8c8af1e3..2e21ea0b3cd02c 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -145,7 +145,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor:semantic_version",
-        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tests:filecheck",
         "//xla/tests:hlo_test_base",
@@ -303,8 +302,8 @@ xla_test(
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:statusor",
@@ -332,7 +331,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:semantic_version",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/protobuf:dnn_proto_cc",
@@ -439,7 +437,6 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/lib/core:status_test_util",
         "@com_google_absl//absl/strings:string_view",
@@ -500,12 +497,10 @@ xla_test(
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc
index 28489d63573e6c..a8b959482ebba0 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/service/gpu/autotuning/autotuner_compile_util.h"
 
-#include <memory>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -24,7 +23,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/autotuner_util.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/statusor.h"
 
@@ -49,12 +47,9 @@ ENTRY main {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
 
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 
@@ -106,11 +101,8 @@ ENTRY main {
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 
@@ -162,11 +154,8 @@ ENTRY main {
   TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
                           PlatformUtil::GetStreamExecutors(platform));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          executors.at(0)->CreateStream());
-  AutotuneConfig autotune_config{
-      DeviceConfig{executors.at(0), nullptr, stream.get()},
-      GetDebugOptionsForTest()};
+  AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
+                                 GetDebugOptionsForTest()};
 
   auto& root = *module->entry_computation()->root_instruction();
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h b/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h
index 1fc28b8c1db719..48bb3e3b291442 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_util.h
@@ -50,8 +50,6 @@ struct DeviceConfig {
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
   se::DeviceMemoryAllocator* allocator = nullptr;  // may be null
-
-  se::Stream* compute_stream = nullptr;
 };
 
 struct DevicelessConfig {
@@ -179,8 +177,7 @@ class AutotuneConfig {
 
   absl::StatusOr<se::Stream*> GetStream() const {
     CHECK(std::holds_alternative<DeviceConfig>(config_));
-    se::Stream* stream = std::get<DeviceConfig>(config_).compute_stream;
-    return stream;
+    return GetAllocator()->GetStream(GetExecutor()->device_ordinal());
   }
 
   const se::GpuComputeCapability& GetGpuComputeCapability() const {
diff --git a/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc
index ea31846a6fbcc8..aaa502f600f90e 100644
--- a/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/conv_algorithm_picker_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/conv_algorithm_picker.h"
 
 #include <cstdint>
-#include <memory>
 #include <variant>
 #include <vector>
 
@@ -36,7 +35,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/xla.pb.h"
@@ -80,8 +78,6 @@ ENTRY main {
                           PlatformUtil::GetStreamExecutors(platform));
   ASSERT_GT(executors.size(), 0);
   se::StreamExecutor* stream_exec = executors[0];
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec->CreateStream());
 
   const se::GpuComputeCapability& cc = backend()
                                            .default_stream_executor()
@@ -92,7 +88,7 @@ ENTRY main {
   changed = false;
   DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
 
-  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);
@@ -204,9 +200,7 @@ ENTRY main {
   ASSERT_TRUE(changed);
 
   DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);
diff --git a/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc
index 44a08788b49451..3a214ffeb56d5a 100644
--- a/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc
@@ -23,10 +23,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/service/gpu/autotuning/autotuner_util.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/xla.pb.h"
-#include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
 namespace xla {
@@ -68,12 +66,9 @@ TEST_F(CustomKernelFusionAutotunerTest, DontRunOnNonCustomFusions) {
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
-
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
 
@@ -105,11 +100,9 @@ TEST_F(CustomKernelFusionAutotunerTest,
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
   ASSERT_TRUE(pipeline.Run(hlo_module.get()).ok());
@@ -138,11 +131,9 @@ TEST_F(CustomKernelFusionAutotunerTest,
 
   HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
   DebugOptions debug_options;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
   AutotuneConfig autotune_config =
       AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
-                                  backend().memory_allocator(), stream.get()},
+                                  backend().memory_allocator()},
                      debug_options};
   pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc
index cca62d35976c96..6526e3338fb6c5 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
-#include <memory>
 #include <string>
 #include <variant>
 
@@ -34,7 +33,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/semantic_version.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/protobuf/dnn.pb.h"
@@ -137,10 +135,7 @@ ENTRY main {
                 /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
             module.get()));
 
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                            stream_exec()->CreateStream());
-    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
-                       debug_opts};
+    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
     GemmAlgorithmPicker gpicker(cfg);
     // Note that, we do not care if the algorithm index has been changed:
     // the thing matters is the # of algorithms left after sorting out.
@@ -180,10 +175,7 @@ ENTRY main {
                 /*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
             module.get()));
 
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                            stream_exec()->CreateStream());
-    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
-                       debug_opts};
+    AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
     GemmAlgorithmPicker gpicker(cfg);
     TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(gpicker, module.get()));
     num_left2 = gpicker.num_algorithms_left();
@@ -216,9 +208,7 @@ ENTRY main {
           m.get()));
   changed = false;
   DebugOptions opts;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec()->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GemmAlgorithmPicker(cfg), m.get()));
   ASSERT_TRUE(changed);
@@ -283,9 +273,7 @@ ENTRY main {
   changed = false;
 
   DebugOptions opts;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          stream_exec()->CreateStream());
-  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};
+  AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
 
   TF_ASSERT_OK_AND_ASSIGN(changed,
                           RunHloPass(GemmAlgorithmPicker(cfg), m.get()));
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
index 47c25a26a574cb..a7b79d36e549e2 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
@@ -53,7 +53,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/semantic_version.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tests/filecheck.h"
 #include "xla/tests/hlo_test_base.h"
@@ -193,10 +192,8 @@ class StatelessAutotunerTest : public HloTestBase {
     ccc->set_major(compute_capability.major);
     ccc->set_minor(compute_capability.minor);
 
-    static se::Stream* stream =
-        backend().default_stream_executor()->CreateStream().value().release();
     DeviceConfig test_config{backend().default_stream_executor(),
-                             backend().memory_allocator(), stream};
+                             backend().memory_allocator()};
     AutotuneConfig autotune_config{test_config, debug_options};
     GemmFusionAutotunerImpl autotuner(autotune_config, toolkit_version,
                                       debug_options, nullptr);
@@ -213,12 +210,8 @@ class StatelessAutotunerTest : public HloTestBase {
   // Returns the config for the current device.
   absl::StatusOr<std::vector<GemmFusionAutotunerImpl::BackendConfig>>
   GetPossibleMatmulAutotuneConfigs(const HloModule& module) {
-    static se::Stream* stream =
-        backend().default_stream_executor()->CreateStream().value().release();
-
     DeviceConfig device_config{backend().default_stream_executor(),
                                backend().memory_allocator()};
-    device_config.compute_stream = stream;
     AutotuneConfig autotune_config{device_config, GetDebugOptionsForTest()};
     GemmFusionAutotunerImpl autotuner(autotune_config, GetToolkitVersion(),
                                       GetDebugOptionsForTest(), nullptr);
@@ -324,14 +317,11 @@ class GemmFusionAutotunerTest : public StatelessAutotunerTest {
                                         tsl::port::MaxParallelism());
     DebugOptions opts;
     MultiProcessKeyValueStore key_value_store;
-    static se::Stream* stream =
-        backend().default_stream_executor()->CreateStream().value().release();
-    DeviceConfig device_config{backend().default_stream_executor(),
-                               backend().memory_allocator()};
-    device_config.compute_stream = stream;
-    pipeline.AddPass<GemmFusionAutotuner>(AutotuneConfig{device_config, opts},
-                                          GetToolkitVersion(), &thread_pool,
-                                          key_value_store);
+    pipeline.AddPass<GemmFusionAutotuner>(
+        AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
+                                    backend().memory_allocator()},
+                       opts},
+        GetToolkitVersion(), &thread_pool, key_value_store);
 
     RunAndFilecheckHloRewrite(
         hlo, std::move(pipeline), expected, [](const HloModule* m) {
@@ -713,12 +703,9 @@ ENTRY main {
                           ParseAndReturnVerifiedModule(kHloText));
 
   DebugOptions opts;
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
-
   AutotuneConfig autotune_config{
       DeviceConfig{backend().default_stream_executor(),
-                   backend().memory_allocator(), stream.get()},
+                   backend().memory_allocator()},
       opts};
   AutotuneCacheKey cache_key(autotune_config.GetModelStr(),
                              *module->entry_computation()->root_instruction());
@@ -1267,12 +1254,11 @@ TEST_F(GemmFusionAutotunerTest, RewritesGemmFusionToCustomKernelFusion) {
   std::unique_ptr<VerifiedHloModule> module =
       ParseAndReturnVerifiedModule(kHlo).value();
 
-  static se::Stream* stream =
-      backend().default_stream_executor()->CreateStream().value().release();
   DebugOptions opts;
-  DeviceConfig device_config{backend().default_stream_executor(),
-                             backend().memory_allocator(), stream};
-  AutotuneConfig autotune_config{device_config, opts};
+  AutotuneConfig autotune_config{
+      DeviceConfig{backend().default_stream_executor(),
+                   backend().memory_allocator()},
+      opts};
   AutotuneCacheKey cache_key(autotune_config.GetModelStr(),
                              *module->entry_computation()->root_instruction());
   TF_ASSERT_OK_AND_ASSIGN(AutotuneResults autotune_results_override,
diff --git a/third_party/xla/xla/service/gpu/determinism_test.cc b/third_party/xla/xla/service/gpu/determinism_test.cc
index 47ec3460ee3cb3..19a3d1390fff1d 100644
--- a/third_party/xla/xla/service/gpu/determinism_test.cc
+++ b/third_party/xla/xla/service/gpu/determinism_test.cc
@@ -111,9 +111,6 @@ class DeterminismTest : public GpuCodegenTest {
     TF_ASSERT_OK_AND_ASSIGN(stream_executor::Platform * default_platform,
                             PlatformUtil::GetDefaultPlatform());
     stream_executor::gpu::MockGpuExecutor executor(default_platform, 0);
-    EXPECT_CALL(executor, CreateStream).WillRepeatedly([&]() {
-      return backend().default_stream_executor()->CreateStream();
-    });
     EXPECT_CALL(executor, CreateEventBasedTimer).Times(0);
     EXPECT_CALL(executor, GetDeviceDescription)
         .WillRepeatedly([this]() -> const se::DeviceDescription& {
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 1e52b4bcbf28ba..83b332a1ce4ffe 100755
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -307,6 +307,18 @@ MaybeOwningThreadPool CreateMaybeOwningThreadPool(
   }
 }
 
+absl::StatusOr<AutotuneConfig> GetAutotuneConfig(
+    se::StreamExecutor* stream_exec, const DebugOptions& debug_options,
+    const GpuCompiler::CompileOptions& options,
+    const Compiler::TargetConfig& gpu_target_config) {
+  if (stream_exec) {
+    return AutotuneConfig{DeviceConfig{stream_exec, options.device_allocator},
+                          debug_options};
+  }
+  return AutotuneConfig{DevicelessConfig{gpu_target_config.device_description},
+                        debug_options};
+}
+
 se::GpuComputeCapability GetGpuVersion(const se::StreamExecutor* stream_exec) {
   return stream_exec->GetDeviceDescription().gpu_compute_capability();
 }
@@ -464,24 +476,6 @@ GpuCompiler::GpuCompiler(se::Platform::Id platform_id,
       pointer_size_(llvm::DataLayout(data_layout)
                         .getPointerSize(0 /* default address space */)) {}
 
-absl::StatusOr<AutotuneConfig> GpuCompiler::GetAutotuneConfig(
-    se::StreamExecutor* stream_exec, const DebugOptions& debug_options,
-    const GpuCompiler::CompileOptions& options,
-    const Compiler::TargetConfig& gpu_target_config) {
-  if (stream_exec) {
-    if ((options.compute_stream == nullptr) && (compute_stream_ == nullptr)) {
-      TF_ASSIGN_OR_RETURN(compute_stream_, stream_exec->CreateStream());
-    }
-    return AutotuneConfig{
-        DeviceConfig{stream_exec, options.device_allocator,
-                     options.compute_stream != nullptr ? options.compute_stream
-                                                       : compute_stream_.get()},
-        debug_options};
-  }
-  return AutotuneConfig{DevicelessConfig{gpu_target_config.device_description},
-                        debug_options};
-}
-
 namespace {
 // Adds the HloVerifier for GPU to the given pipeline.
 void AddHloVerifier(HloPassPipeline* pipeline,
@@ -1201,6 +1195,26 @@ absl::Status RunPostFusionSimplificationPasses(
   return pipeline.Run(hlo_module).status();
 }
 
+absl::Status RunPostFusionVerificationPasses(
+    HloModule* hlo_module, se::StreamExecutor* stream_exec,
+    const GpuCompiler::CompileOptions& options,
+    const Compiler::TargetConfig& gpu_target_config) {
+  HloPassPipeline pipeline("post-fusion-verification-pipeline optimization");
+
+  if (hlo_module->config()
+          .debug_options()
+          .xla_gpu_verify_triton_fusion_numerics()) {
+    TF_ASSIGN_OR_RETURN(
+        AutotuneConfig autotune_config,
+        GetAutotuneConfig(stream_exec, hlo_module->config().debug_options(),
+                          options, gpu_target_config));
+
+    pipeline.AddPass<TritonFusionNumericsVerifier>(autotune_config);
+  }
+
+  return pipeline.Run(hlo_module).status();
+}
+
 absl::Status RunLayoutNormalizationPasses(
     HloModule* hlo_module, const se::GpuComputeCapability& gpu_version) {
   HloPassPipeline layout_normalization_pipeline("layout normalization");
@@ -1279,26 +1293,6 @@ absl::Status GpuCompiler::RunCollectiveScheduleLinearizerPasses(
   return pipeline.Run(hlo_module).status();
 }
 
-absl::Status GpuCompiler::RunPostFusionVerificationPasses(
-    HloModule* hlo_module, se::StreamExecutor* stream_exec,
-    const GpuCompiler::CompileOptions& options,
-    const Compiler::TargetConfig& gpu_target_config) {
-  HloPassPipeline pipeline("post-fusion-verification-pipeline optimization");
-
-  if (hlo_module->config()
-          .debug_options()
-          .xla_gpu_verify_triton_fusion_numerics()) {
-    TF_ASSIGN_OR_RETURN(
-        AutotuneConfig autotune_config,
-        GetAutotuneConfig(stream_exec, hlo_module->config().debug_options(),
-                          options, gpu_target_config));
-
-    pipeline.AddPass<TritonFusionNumericsVerifier>(autotune_config);
-  }
-
-  return pipeline.Run(hlo_module).status();
-}
-
 // Runs optimization passes on the given HLO module.
 absl::Status GpuCompiler::OptimizeHloModule(
     HloModule* hlo_module, se::StreamExecutor* stream_exec,
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 85e75a46390256..b8fe422dbe1fac 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -245,18 +245,6 @@ class GpuCompiler : public LLVMCompiler {
     return Unimplemented("LinkModules is not implemented.");
   }
 
-  // Creates an AutotuneConfig for the given options.
-  absl::StatusOr<AutotuneConfig> GetAutotuneConfig(
-      se::StreamExecutor* stream_exec, const DebugOptions& debug_options,
-      const GpuCompiler::CompileOptions& options,
-      const Compiler::TargetConfig& gpu_target_config);
-
-  // Runs verification passes after fusion.
-  absl::Status RunPostFusionVerificationPasses(
-      HloModule* hlo_module, se::StreamExecutor* stream_exec,
-      const GpuCompiler::CompileOptions& options,
-      const Compiler::TargetConfig& gpu_target_config);
-
   se::Platform::Id platform_id_;
 
   // The triple that represents our target.
@@ -268,9 +256,6 @@ class GpuCompiler : public LLVMCompiler {
   // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
   const int64_t pointer_size_;
 
-  // A stream to use for autotuning if none is provided.
-  std::unique_ptr<se::Stream> compute_stream_;
-
   GpuCompiler(const GpuCompiler&) = delete;
   GpuCompiler& operator=(const GpuCompiler&) = delete;
 };
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 7e2eabfb92b41c..3c2d885e0b13b1 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -3223,6 +3223,7 @@ xla_test(
     deps = [
         ":triton_fusion_numerics_verifier",
         "//xla:shape_util",
+        "//xla:test_helpers",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
@@ -3231,7 +3232,6 @@ xla_test(
         "//xla/service/gpu/autotuning:autotuner_compile_util",
         "//xla/service/gpu/autotuning:autotuner_util",
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/lib/core:status_test_util",
         "@com_google_absl//absl/status",
@@ -3239,7 +3239,6 @@ xla_test(
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:status_matchers",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
index ea6fd9cbb25357..2762166278cdbf 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
@@ -32,13 +32,11 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/autotuner_util.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/status_matchers.h"
-#include "tsl/platform/statusor.h"
 
 namespace xla::gpu {
 namespace {
@@ -81,9 +79,7 @@ class TritonFusionNumericsVerifierTest
     se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
     auto executors_or = PlatformUtil::GetStreamExecutors(platform);
     TF_EXPECT_OK(executors_or);
-    static se::Stream* stream =
-        executors_or->at(0)->CreateStream().value().release();
-    return AutotuneConfig{DeviceConfig{executors_or->at(0), nullptr, stream},
+    return AutotuneConfig{DeviceConfig{executors_or->at(0), nullptr},
                           GetDebugOptionsForTest()};
   }
 
@@ -298,11 +294,8 @@ ENTRY main {
 
   std::unique_ptr<HloModule> module =
       *ParseAndReturnVerifiedModule(hlo_text, GetModuleConfigForTest());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
-                          backend().default_stream_executor()->CreateStream());
   AutotuneConfig autotune_config{
-      DeviceConfig{backend().default_stream_executor(), GetAllocator(),
-                   stream.get()},
+      DeviceConfig{backend().default_stream_executor(), GetAllocator()},
       module->config().debug_options()};
   TritonFusionNumericsVerifier verifier(autotune_config);
   TF_EXPECT_OK(RunHloPass(verifier, module.get()));
diff --git a/third_party/xla/xla/service/local_service.cc b/third_party/xla/xla/service/local_service.cc
index d4c175a31effb2..557fb216582695 100644
--- a/third_party/xla/xla/service/local_service.cc
+++ b/third_party/xla/xla/service/local_service.cc
@@ -94,8 +94,7 @@ LocalService::CompileExecutables(
       false,
       {},
       {build_options.key_value_store(), build_options.process_index(),
-       build_options.process_count()},
-      build_options.compute_stream()};
+       build_options.process_count()}};
   if (build_options.num_partitions() == 1) {
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<Executable> executable,
@@ -140,12 +139,12 @@ LocalService::CompileAotResults(
   // cores per module, but otherwise only uses the first executor.
   std::vector<se::StreamExecutor*> executors(build_options.num_partitions(),
                                              executor);
-  Compiler::CompileOptions compile_options{build_options.device_allocator(),
-                                           build_options.compile_thread_pool()};
-  compile_options.compute_stream = build_options.compute_stream();
+
   return BuildAotResults(
       /*module_protos=*/{&computation.proto()}, std::move(module_configs),
-      execute_backend_.get(), {executors}, compile_options,
+      execute_backend_.get(), {executors},
+      Compiler::CompileOptions{build_options.device_allocator(),
+                               build_options.compile_thread_pool()},
       build_options.run_backend_only());
 }