Move NvshmemApi to NvshmemCollectives and register it in CollectivesRegistry

trevor-m · trevor-m · commit f2d2f617d6d7 · 2025-01-14T12:52:18.000-08:00
diff --git a/xla/backends/gpu/collectives/BUILD b/xla/backends/gpu/collectives/BUILD
@@ -260,3 +260,30 @@ cc_library(
         "@local_config_rocm//rocm:rccl",
     ]),
 )
+
+cc_library(
+    name = "nvshmem_collectives",
+    srcs = if_cuda_is_configured(["nvshmem_collectives.cc"]),  # CUDA-only: includes nvshmem.h and cuda.h
+    hdrs = if_gpu_is_configured(["nvshmem_collectives.h"]),
+    local_defines = if_cuda_is_configured([
+        "GOOGLE_CUDA=1",
+    ]) + if_rocm_is_configured([
+        "TENSORFLOW_USE_ROCM=1",
+    ]),
+    deps = [
+        "//xla/core/collectives",
+        "//xla/core/collectives:collectives_registry",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_googlesource_code_re2//:re2",
+        "@tsl//tsl/platform:errors",
+        "@tsl//tsl/platform:numbers",
+        "@tsl//tsl/platform:logging",
+        "@tsl//tsl/platform:statusor",
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@nvshmem//:nvshmem",
+    ]),
+    alwayslink = True,  # registers collectives implementation
+)
diff --git a/xla/backends/gpu/collectives/nvshmem_collectives.cc b/xla/backends/gpu/collectives/nvshmem_collectives.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The OpenXLA Authors.
+/* Copyright 2025 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/gpu/runtime/nvshmem_api.h"
+#include "xla/backends/gpu/collectives/nvshmem_collectives.h"
 
 #include "absl/strings/str_format.h"
 #include "tsl/platform/logging.h"
@@ -22,6 +22,9 @@ limitations under the License.
 #include "tsl/platform/statusor.h"
 #include "third_party/nvshmem/nvshmem.h"
 #include "third_party/nvshmem/nvshmemx.h"
+#include "xla/core/collectives/collectives_registry.h"
+
+#include <cuda.h>
 
 namespace xla::gpu {
 
@@ -60,20 +63,24 @@ static absl::Status NvshmemToStatus(int s, const char* file, int64_t line,
 
 #define XLA_NVSHMEM_CHECK(expr) CHECK(XLA_NVSHMEM_STATUS(expr).ok())
 
-int NvshmemApi::process_id_ = -1;
-size_t NvshmemApi::num_processes_ = 0;
-size_t NvshmemApi::device_count_per_process_ = 0;
-std::function<absl::StatusOr<std::string>(std::string_view)>
-    NvshmemApi::kv_store_get_ = nullptr;
-std::function<absl::Status(std::string_view, std::string_view)>
-    NvshmemApi::kv_store_set_ = nullptr;
-
-NvshmemApi& NvshmemApi::Default() {
-  static NvshmemApi instance;
-  return instance;
+NvshmemCollectives::~NvshmemCollectives() {
+  if (initialized_) Finalize();  // Only when InitializeOnce() set the flag.
+}
+
+NvshmemCollectives* NvshmemCollectives::Default() {
+  // Look up the implementation registered for platform "gpu" as "nvshmem".
+  absl::StatusOr<Collectives*> registered =
+      CollectivesRegistry::Get("gpu", "nvshmem");
+  CHECK_OK(registered) << "Failed to get NVSHMEM collectives";  // Crash OK
+
+  auto* nvshmem = tsl::down_cast<NvshmemCollectives*>(*registered);
+  if (nvshmem == nullptr) {
+    LOG(FATAL) << "Unsupported collectives implementation for NVSHMEM";
+  }
+  return nvshmem;
 }
 
-void NvshmemApi::SetEnvInfo(
+void NvshmemCollectives::SetEnvInfo(
     int process_id, size_t num_processes, size_t device_count_per_process,
     std::function<absl::StatusOr<std::string>(std::string_view)> kv_store_get,
     std::function<absl::Status(std::string_view, std::string_view)>
@@ -85,27 +92,14 @@ void NvshmemApi::SetEnvInfo(
   kv_store_set_ = kv_store_set;
 }
 
-NvshmemApi::NvshmemApi() {
-  // Initialize NVSHMEM here since code path
-  // is already protected by singleton pattern
+absl::Status NvshmemCollectives::Initialize() {
   if (process_id_ == -1) {
-    LOG(FATAL)
-        << "NvshmemApi::SetEnvInfo was not called before using NVSHMEM API";
+    LOG(FATAL) << "NvshmemCollectives::SetEnvInfo was not called before using "
+                  "NVSHMEM API";
   }
   if (device_count_per_process_ != 1) {
     LOG(FATAL) << "NVSHMEM API is only supported with one device per process";
   }
-  CHECK(Initialize().ok());
-}
-
-NvshmemApi::~NvshmemApi() {
-  VLOG(3) << absl::StreamFormat(
-      "Finilizing NVSHMEM on process %d; num_processes=%llu", process_id_,
-      num_processes_);
-  nvshmemx_hostlib_finalize();
-}
-
-absl::Status NvshmemApi::Initialize() {
   nvshmemx_init_attr_t nvshmem_init_attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
   nvshmemx_uniqueid_t nvshmem_id = NVSHMEMX_UNIQUEID_INITIALIZER;
 
@@ -132,7 +126,25 @@ absl::Status NvshmemApi::Initialize() {
   return absl::OkStatus();
 }
 
-absl::StatusOr<void*> NvshmemApi::Allocate(uint64_t bytes) {
+absl::Status NvshmemCollectives::InitializeOnce() {
+  static absl::once_flag once_flag;
+  static auto* init_status = new absl::Status();  // Shared by every caller.
+  absl::call_once(once_flag, [this] {
+    *init_status = Initialize();       // Kept so later calls see failures too.
+    initialized_ = init_status->ok();  // Finalize() only after a real init.
+  });
+  return *init_status;
+}
+
+void NvshmemCollectives::Finalize() {  // Logs, then shuts NVSHMEM down.
+  VLOG(3) << absl::StreamFormat(
+      "Finalizing NVSHMEM on process %d; num_processes=%llu", process_id_,
+      num_processes_);
+  nvshmemx_hostlib_finalize();
+}
+
+absl::StatusOr<void*> NvshmemCollectives::Allocate(uint64_t bytes) {
+  TF_RETURN_IF_ERROR(InitializeOnce());
   VLOG(3) << absl::StreamFormat(
       "Start allocation of %s (%llu bytes) for NVSHMEM",
       tsl::strings::HumanReadableNumBytes(bytes), bytes);
@@ -145,11 +157,15 @@ absl::StatusOr<void*> NvshmemApi::Allocate(uint64_t bytes) {
   return buffer;
 }
 
-absl::Status NvshmemApi::Deallocate(void* buffer) {
+absl::Status NvshmemCollectives::Deallocate(void* buffer) {
+  TF_RETURN_IF_ERROR(InitializeOnce());  // Make sure NVSHMEM is set up first.
   VLOG(3) << absl::StreamFormat("Start de-allocation for NVSHMEM buffer: %p",
                                 buffer);
   nvshmem_free(buffer);
   return absl::OkStatus();
 }
 
 }  // namespace xla::gpu
+
+XLA_COLLECTIVES_REGISTER("gpu", "nvshmem", 2,
+                         std::make_unique<xla::gpu::NvshmemCollectives>());
diff --git a/xla/backends/gpu/collectives/nvshmem_collectives.h b/xla/backends/gpu/collectives/nvshmem_collectives.h
@@ -0,0 +1,82 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_COLLECTIVES_NVSHMEM_COLLECTIVES_H_
+#define XLA_BACKENDS_GPU_COLLECTIVES_NVSHMEM_COLLECTIVES_H_
+
+#include <functional>
+#include <string_view>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/core/collectives/collectives.h"
+
+namespace xla::gpu {
+
+// Collectives implementation backed by the NVIDIA NVSHMEM library.
+class NvshmemCollectives : public Collectives {
+ public:
+  ~NvshmemCollectives() override;  // Finalizes NVSHMEM if `initialized_`.
+
+  static NvshmemCollectives* Default();  // Registered "gpu"/"nvshmem" instance.
+
+  void SetEnvInfo(
+      int process_id, size_t num_processes, size_t device_count_per_process,
+      std::function<absl::StatusOr<std::string>(std::string_view)> kv_store_get,
+      std::function<absl::Status(std::string_view, std::string_view)>
+          kv_store_set);  // Required before Allocate()/Deallocate().
+
+  absl::StatusOr<void*> Allocate(uint64_t bytes);  // Runs InitializeOnce().
+
+  absl::Status Deallocate(void* buffer);  // Frees `buffer` via nvshmem_free.
+
+  absl::StatusOr<CliqueId> CreateUniqueCliqueId() const final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+  CreateCommunicators(int32_t, const CliqueKey&, const std::optional<CliqueId>&,
+                      absl::Span<const DeviceRank>,
+                      const Collectives::Config&) final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>> SplitCommunicators(
+      absl::Span<const Communicator* const>, int32_t, absl::Span<const RankId>,
+      const Collectives::Config&) final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
+ private:
+  absl::Status Initialize();  // LOG(FATAL)s if SetEnvInfo() was not called.
+  absl::Status InitializeOnce();  // Runs Initialize() under absl::call_once.
+
+  void Finalize();  // Calls nvshmemx_hostlib_finalize().
+
+  int process_id_ = -1;  // -1 means SetEnvInfo() has not been called yet.
+  size_t num_processes_ = 0;
+  size_t device_count_per_process_ = 0;  // Initialize() requires exactly 1.
+  std::function<absl::StatusOr<std::string>(std::string_view)> kv_store_get_ =
+      nullptr;
+  std::function<absl::Status(std::string_view, std::string_view)>
+      kv_store_set_ = nullptr;
+  bool initialized_ = false;  // Set by InitializeOnce(); read by destructor.
+
+  static constexpr char kv_store_key_[] = "nvshmem_global_init";
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_COLLECTIVES_NVSHMEM_COLLECTIVES_H_
diff --git a/xla/core/collectives/collectives_registry.cc b/xla/core/collectives/collectives_registry.cc
@@ -99,4 +99,24 @@ absl::StatusOr<Collectives*> CollectivesRegistry::Default(
   return registry.platform_collectives[canonical_platform_name].begin()->second;
 }
 
+absl::StatusOr<Collectives*> CollectivesRegistry::Get(
+    absl::string_view platform_name, absl::string_view implementation_name) {
+  TF_ASSIGN_OR_RETURN(std::string canonical,
+                      PlatformUtil::CanonicalPlatformName(platform_name));
+
+  // Scan the registrations under the registry lock for an exact match.
+  auto& registry = GetCollectivesRegistry();
+  absl::MutexLock lock(&registry.mu);
+  for (const auto& entry : registry.collectives) {
+    if (entry.platform_name != canonical) continue;
+    if (entry.name == implementation_name) return entry.collectives.get();
+  }
+
+  // Nothing matched; report both the requested and the canonical platform.
+  return Internal(
+      "No collectives registered for platform: %s (canonical name: %s) and "
+      "implementation: %s",
+      platform_name, canonical, implementation_name);
+}
+
 }  // namespace xla
diff --git a/xla/core/collectives/collectives_registry.h b/xla/core/collectives/collectives_registry.h
@@ -45,6 +45,11 @@ class CollectivesRegistry {
 
   // Returns the default collectives implementation for the given platform.
   static absl::StatusOr<Collectives*> Default(absl::string_view platform_name);
+
+  // Returns the collectives implementation registered as `implementation_name`
+  // for the given platform, or an error if no such registration exists.
+  static absl::StatusOr<Collectives*> Get(
+      absl::string_view platform_name, absl::string_view implementation_name);
 };
 
 }  // namespace xla
diff --git a/xla/python/BUILD b/xla/python/BUILD
@@ -1253,6 +1253,7 @@ tsl_pybind_extension(
             "-Wl,-rpath,$$ORIGIN/../nvidia/cudnn/lib",
             "-Wl,-rpath,$$ORIGIN/../nvidia/cusolver/lib",
             "-Wl,-rpath,$$ORIGIN/../nvidia/nccl/lib",
+            "-Wl,-rpath,$$ORIGIN/../nvidia/nvshmem/lib",
         ],
         "//conditions:default": [],
     }),
diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD
@@ -187,28 +187,6 @@ xla_test(
     ],
 )
 
-#===-------------------------------------------------------------------------------------------===//
-# NVSHMEM Integration
-#===-------------------------------------------------------------------------------------------===//
-
-cc_library(
-    name = "nvshmem_api",
-    srcs = ["nvshmem_api.cc"],
-    hdrs = ["nvshmem_api.h"],
-    deps = [
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings:str_format",
-        "@tsl//tsl/platform:errors",
-        "@tsl//tsl/platform:numbers",
-        "@tsl//tsl/platform:logging",
-        "@tsl//tsl/platform:statusor",
-    ]+ if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@nvshmem//:nvshmem",
-    ]),
-)
-
 #===-------------------------------------------------------------------------------------------===//
 # XLA Thunks Runtime
 #===-------------------------------------------------------------------------------------------===//
diff --git a/xla/service/gpu/runtime/nvshmem_api.h b/xla/service/gpu/runtime/nvshmem_api.h
diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD
diff --git a/xla/service/gpu/tests/nvshmem_test.cc b/xla/service/gpu/tests/nvshmem_test.cc