Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explicitly tell the GpuCompiler which stream to use from PJRT during the build step. #18705

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions xla/client/executable_build_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
return device_allocator_;
}

// Registers the stream the compiler should use (e.g. for autotuning) during
// the build step. Ownership is NOT transferred; the caller must keep the
// stream alive for the duration of compilation. Returns *this so the setter
// can be chained in the usual builder style.
ExecutableBuildOptions& ExecutableBuildOptions::set_compute_stream(se::Stream* stream) {
  this->compute_stream_ = stream;
  return *this;
}

// Returns the stream previously registered via set_compute_stream(), or
// nullptr when no stream has been set.
se::Stream* ExecutableBuildOptions::compute_stream() const { return compute_stream_; }

ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
int device_ordinal) {
CHECK_GE(device_ordinal, 0);
Expand Down
6 changes: 6 additions & 0 deletions xla/client/executable_build_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace stream_executor {

// Forward-declared to avoid StreamExecutor dependency.
class DeviceMemoryAllocator;
class Stream;

} // namespace stream_executor

Expand Down Expand Up @@ -91,6 +92,10 @@ class ExecutableBuildOptions {
se::DeviceMemoryAllocator* allocator);
se::DeviceMemoryAllocator* device_allocator() const;

// If set, this specifies a stream that can be used for autotuning.
ExecutableBuildOptions& set_compute_stream(se::Stream* stream);
se::Stream* compute_stream() const;

// The number of replicas of this computation that are to be executed.
// Defaults to 1.
int num_replicas() const { return num_replicas_; }
Expand Down Expand Up @@ -287,6 +292,7 @@ class ExecutableBuildOptions {
std::optional<CompilationEnvironments> comp_envs_;
std::optional<DebugOptions> debug_options_;
se::DeviceMemoryAllocator* device_allocator_ = nullptr;
se::Stream* compute_stream_ = nullptr;
int num_replicas_ = 1;
int num_partitions_ = 1;
bool use_spmd_partitioning_ = false;
Expand Down
2 changes: 2 additions & 0 deletions xla/pjrt/pjrt_stream_executor_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3482,6 +3482,8 @@ PjRtStreamExecutorClient::GetExecutableExtras(CompileOptions* options) {
build_options.set_device_ordinal(
addressable_devices.front()->local_hardware_id().value());
}
build_options.set_compute_stream(
device_state(build_options.device_ordinal()).compute_stream());
}
return extras;
}
Expand Down
1 change: 1 addition & 0 deletions xla/service/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,7 @@ cc_library(
"//xla/hlo/ir:hlo_module_group",
"//xla/pjrt/distributed:key_value_store_interface",
"//xla/stream_executor:dnn",
"//xla/stream_executor:stream",
"//xla/stream_executor:stream_executor_h",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/status:statusor",
Expand Down
5 changes: 5 additions & 0 deletions xla/service/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ limitations under the License.
#include "xla/service/executable.h"
#include "xla/service/hlo_module_config.h"
#include "xla/service/metrics_hook_interface.h"
#include "xla/stream_executor/stream.h"
#include "xla/stream_executor/stream_executor.h"
#include "tsl/platform/protobuf.h"
#include "tsl/platform/threadpool.h"
Expand Down Expand Up @@ -158,6 +159,10 @@ class Compiler {
std::optional<TargetConfig> target_config;

MultiProcessKeyValueStore key_value_store;

// If compute_stream is set, this is the stream used for all autotuning
// during compilation.
se::Stream* compute_stream = nullptr;
};

virtual ~Compiler() = default;
Expand Down
1 change: 1 addition & 0 deletions xla/service/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2983,6 +2983,7 @@ xla_test(
"//xla/service/gpu/autotuning:autotuner_util",
"//xla/service/gpu/tests:gpu_codegen_test",
"//xla/stream_executor:device_description",
"//xla/stream_executor:mock_stream",
"//xla/stream_executor:platform",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor/gpu:mock_gpu_executor",
Expand Down
1 change: 1 addition & 0 deletions xla/service/gpu/autotuning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ xla_test(
"//xla/stream_executor:device_description",
"//xla/stream_executor:device_description_proto_cc",
"//xla/stream_executor:semantic_version",
"//xla/stream_executor:stream",
"//xla/stream_executor:stream_executor_h",
"//xla/tests:filecheck",
"//xla/tests:hlo_test_base",
Expand Down
21 changes: 15 additions & 6 deletions xla/service/gpu/autotuning/autotuner_compile_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,12 @@ ENTRY main {
se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
PlatformUtil::GetStreamExecutors(platform));
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
executors.at(0)->CreateStream());

AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
GetDebugOptionsForTest()};
AutotuneConfig autotune_config{
DeviceConfig{executors.at(0), nullptr, stream.get()},
GetDebugOptionsForTest()};

auto& root = *module->entry_computation()->root_instruction();

Expand Down Expand Up @@ -101,8 +104,11 @@ ENTRY main {
TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
PlatformUtil::GetStreamExecutors(platform));

AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
GetDebugOptionsForTest()};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
executors.at(0)->CreateStream());
AutotuneConfig autotune_config{
DeviceConfig{executors.at(0), nullptr, stream.get()},
GetDebugOptionsForTest()};

auto& root = *module->entry_computation()->root_instruction();

Expand Down Expand Up @@ -154,8 +160,11 @@ ENTRY main {
TF_ASSERT_OK_AND_ASSIGN(std::vector<se::StreamExecutor*> executors,
PlatformUtil::GetStreamExecutors(platform));

AutotuneConfig autotune_config{DeviceConfig{executors.at(0), nullptr},
GetDebugOptionsForTest()};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
executors.at(0)->CreateStream());
AutotuneConfig autotune_config{
DeviceConfig{executors.at(0), nullptr, stream.get()},
GetDebugOptionsForTest()};

auto& root = *module->entry_computation()->root_instruction();

Expand Down
5 changes: 4 additions & 1 deletion xla/service/gpu/autotuning/autotuner_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ struct DeviceConfig {
// memory while timing the various convolution algorithms. If it's null,
// we'll use the default allocator on the StreamExecutor.
se::DeviceMemoryAllocator* allocator = nullptr; // may be null

se::Stream* compute_stream = nullptr;
};

struct DevicelessConfig {
Expand Down Expand Up @@ -177,7 +179,8 @@ class AutotuneConfig {

// Returns the stream to be used for autotuning measurements.
// NOTE(review): this span is a unified-diff hunk, not final source — the
// GetAllocator() line below is the PRE-change code being removed by this PR,
// and the two lines after it are its replacement, which reads the stream
// directly from the DeviceConfig instead of deriving it from the allocator.
absl::StatusOr<se::Stream*> GetStream() const {
// Only valid for a device-backed config; DevicelessConfig has no stream.
CHECK(std::holds_alternative<DeviceConfig>(config_));
// (removed by this PR:)
return GetAllocator()->GetStream(GetExecutor()->device_ordinal());
// (added by this PR:) may be nullptr if the caller never set compute_stream
// on the DeviceConfig — TODO confirm callers handle that.
se::Stream* stream = std::get<DeviceConfig>(config_).compute_stream;
return stream;
}

const se::GpuComputeCapability& GetGpuComputeCapability() const {
Expand Down
8 changes: 6 additions & 2 deletions xla/service/gpu/autotuning/conv_algorithm_picker_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ ENTRY main {
PlatformUtil::GetStreamExecutors(platform));
ASSERT_GT(executors.size(), 0);
se::StreamExecutor* stream_exec = executors[0];
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec->CreateStream());

const se::GpuComputeCapability& cc = backend()
.default_stream_executor()
Expand All @@ -88,7 +90,7 @@ ENTRY main {
changed = false;
DebugOptions opts = DefaultDebugOptionsIgnoringFlags();

AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
TF_ASSERT_OK_AND_ASSIGN(changed,
RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
ASSERT_TRUE(changed);
Expand Down Expand Up @@ -200,7 +202,9 @@ ENTRY main {
ASSERT_TRUE(changed);

DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr}, opts};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec->CreateStream());
AutotuneConfig cfg{DeviceConfig{stream_exec, nullptr, stream.get()}, opts};
TF_ASSERT_OK_AND_ASSIGN(changed,
RunHloPass(GpuConvAlgorithmPicker(cfg), m.get()));
ASSERT_TRUE(changed);
Expand Down
13 changes: 10 additions & 3 deletions xla/service/gpu/autotuning/custom_kernel_fusion_autotuner_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,12 @@ TEST_F(CustomKernelFusionAutotunerTest, DontRunOnNonCustomFusions) {

HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
DebugOptions debug_options;
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
backend().default_stream_executor()->CreateStream());

AutotuneConfig autotune_config =
AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
backend().memory_allocator(), stream.get()},
debug_options};
pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);

Expand Down Expand Up @@ -100,9 +103,11 @@ TEST_F(CustomKernelFusionAutotunerTest,

HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
DebugOptions debug_options;
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
backend().default_stream_executor()->CreateStream());
AutotuneConfig autotune_config =
AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
backend().memory_allocator(), stream.get()},
debug_options};
pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);
ASSERT_TRUE(pipeline.Run(hlo_module.get()).ok());
Expand Down Expand Up @@ -131,9 +136,11 @@ TEST_F(CustomKernelFusionAutotunerTest,

HloPassPipeline pipeline("custom_kernel_fusion_autotuner");
DebugOptions debug_options;
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
backend().default_stream_executor()->CreateStream());
AutotuneConfig autotune_config =
AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
backend().memory_allocator(), stream.get()},
debug_options};
pipeline.AddPass<CustomKernelFusionAutotuner>(autotune_config);

Expand Down
18 changes: 14 additions & 4 deletions xla/service/gpu/autotuning/gemm_algorithm_picker_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ ENTRY main {
/*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
module.get()));

AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec()->CreateStream());
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
debug_opts};
GemmAlgorithmPicker gpicker(cfg);
// Note that, we do not care if the algorithm index has been changed:
// the thing matters is the # of algorithms left after sorting out.
Expand Down Expand Up @@ -175,7 +178,10 @@ ENTRY main {
/*toolkit_version=*/stream_executor::SemanticVersion{12, 4, 0}),
module.get()));

AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, debug_opts};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec()->CreateStream());
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()},
debug_opts};
GemmAlgorithmPicker gpicker(cfg);
TF_ASSERT_OK_AND_ASSIGN(changed, RunHloPass(gpicker, module.get()));
num_left2 = gpicker.num_algorithms_left();
Expand Down Expand Up @@ -208,7 +214,9 @@ ENTRY main {
m.get()));
changed = false;
DebugOptions opts;
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec()->CreateStream());
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};
TF_ASSERT_OK_AND_ASSIGN(changed,
RunHloPass(GemmAlgorithmPicker(cfg), m.get()));
ASSERT_TRUE(changed);
Expand Down Expand Up @@ -273,7 +281,9 @@ ENTRY main {
changed = false;

DebugOptions opts;
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr}, opts};
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
stream_exec()->CreateStream());
AutotuneConfig cfg{DeviceConfig{stream_exec(), nullptr, stream.get()}, opts};

TF_ASSERT_OK_AND_ASSIGN(changed,
RunHloPass(GemmAlgorithmPicker(cfg), m.get()));
Expand Down
36 changes: 25 additions & 11 deletions xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ limitations under the License.
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/device_description.pb.h"
#include "xla/stream_executor/semantic_version.h"
#include "xla/stream_executor/stream.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/tests/filecheck.h"
#include "xla/tests/hlo_test_base.h"
Expand Down Expand Up @@ -192,8 +193,10 @@ class StatelessAutotunerTest : public HloTestBase {
ccc->set_major(compute_capability.major);
ccc->set_minor(compute_capability.minor);

static se::Stream* stream =
backend().default_stream_executor()->CreateStream().value().release();
DeviceConfig test_config{backend().default_stream_executor(),
backend().memory_allocator()};
backend().memory_allocator(), stream};
AutotuneConfig autotune_config{test_config, debug_options};
GemmFusionAutotunerImpl autotuner(autotune_config, toolkit_version,
debug_options, nullptr);
Expand All @@ -210,8 +213,12 @@ class StatelessAutotunerTest : public HloTestBase {
// Returns the config for the current device.
absl::StatusOr<std::vector<GemmFusionAutotunerImpl::BackendConfig>>
GetPossibleMatmulAutotuneConfigs(const HloModule& module) {
static se::Stream* stream =
backend().default_stream_executor()->CreateStream().value().release();

DeviceConfig device_config{backend().default_stream_executor(),
backend().memory_allocator()};
device_config.compute_stream = stream;
AutotuneConfig autotune_config{device_config, GetDebugOptionsForTest()};
GemmFusionAutotunerImpl autotuner(autotune_config, GetToolkitVersion(),
GetDebugOptionsForTest(), nullptr);
Expand Down Expand Up @@ -317,11 +324,14 @@ class GemmFusionAutotunerTest : public StatelessAutotunerTest {
tsl::port::MaxParallelism());
DebugOptions opts;
MultiProcessKeyValueStore key_value_store;
pipeline.AddPass<GemmFusionAutotuner>(
AutotuneConfig{DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
opts},
GetToolkitVersion(), &thread_pool, key_value_store);
static se::Stream* stream =
backend().default_stream_executor()->CreateStream().value().release();
DeviceConfig device_config{backend().default_stream_executor(),
backend().memory_allocator()};
device_config.compute_stream = stream;
pipeline.AddPass<GemmFusionAutotuner>(AutotuneConfig{device_config, opts},
GetToolkitVersion(), &thread_pool,
key_value_store);

RunAndFilecheckHloRewrite(
hlo, std::move(pipeline), expected, [](const HloModule* m) {
Expand Down Expand Up @@ -703,9 +713,12 @@ ENTRY main {
ParseAndReturnVerifiedModule(kHloText));

DebugOptions opts;
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
backend().default_stream_executor()->CreateStream());

AutotuneConfig autotune_config{
DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
backend().memory_allocator(), stream.get()},
opts};
AutotuneCacheKey cache_key(autotune_config.GetModelStr(),
*module->entry_computation()->root_instruction());
Expand Down Expand Up @@ -1254,11 +1267,12 @@ TEST_F(GemmFusionAutotunerTest, RewritesGemmFusionToCustomKernelFusion) {
std::unique_ptr<VerifiedHloModule> module =
ParseAndReturnVerifiedModule(kHlo).value();

static se::Stream* stream =
backend().default_stream_executor()->CreateStream().value().release();
DebugOptions opts;
AutotuneConfig autotune_config{
DeviceConfig{backend().default_stream_executor(),
backend().memory_allocator()},
opts};
DeviceConfig device_config{backend().default_stream_executor(),
backend().memory_allocator(), stream};
AutotuneConfig autotune_config{device_config, opts};
AutotuneCacheKey cache_key(autotune_config.GetModelStr(),
*module->entry_computation()->root_instruction());
TF_ASSERT_OK_AND_ASSIGN(AutotuneResults autotune_results_override,
Expand Down
4 changes: 4 additions & 0 deletions xla/service/gpu/determinism_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ limitations under the License.
#include "xla/service/platform_util.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/gpu/mock_gpu_executor.h"
#include "xla/stream_executor/mock_stream.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/tests/filecheck.h"
Expand Down Expand Up @@ -111,6 +112,9 @@ class DeterminismTest : public GpuCodegenTest {
TF_ASSERT_OK_AND_ASSIGN(stream_executor::Platform * default_platform,
PlatformUtil::GetDefaultPlatform());
stream_executor::gpu::MockGpuExecutor executor(default_platform, 0);
EXPECT_CALL(executor, CreateStream).WillRepeatedly([&]() {
return backend().default_stream_executor()->CreateStream();
});
EXPECT_CALL(executor, CreateEventBasedTimer).Times(0);
EXPECT_CALL(executor, GetDeviceDescription)
.WillRepeatedly([this]() -> const se::DeviceDescription& {
Expand Down
Loading
Loading