Skip to content

Commit 5a9082e

Browse files
trevor-m (Google-ML-Automation)
authored and committed
PR #21683: [XLA:GPU] NVSHMEM allocation
Imported from GitHub PR #21683 Requires #20395 which adds the NVSHMEM library dependency. This PR adds the following: 1. Nvshmem flag to enable nvshmem 2. Set nvshmem initialization issue when GPU PJRT client is created. The first time NVSHMEM is used, it will be initialized. 3. Uses the user buffer memory pool for nvshmem. If nvshmem is enabled, it will be allocated using `nvshmem_malloc`. This same memory can be used by user buffers if nccl user buffers is also enabled. 4. Update the `CollectiveColorer` so that mosaic_gpu custom calls use the nvshmem memory space. Copybara import of the project: -- aee3379 by Trevor Morris <tmorris@nvidia.com>: Add nvshmem flag, memory allocation, and memory space assignment Set Nvshmem env info during client creation Rename flag and use absl::string_view -- f8fca39 by Trevor Morris <tmorris@nvidia.com>: Use explicit types in test -- e41faa3 by Trevor Morris <tmorris@nvidia.com>: Add user buffer allgather and allreduce tests with and without nvshmem alloc Set nvshmem in XLA_FLAGS test fixes formatting -- cf0c368 by Trevor Morris <tmorris@nvidia.com>: Fixes -- 3b4d111 by Trevor Morris <tmorris@nvidia.com>: Remove early dso check -- 359f2b2 by Trevor Morris <tmorris@nvidia.com>: Add flag comment -- fd15a7c by Trevor Morris <tmorris@nvidia.com>: Also assign memory space for mosaic_gpu_v2 Merging this change closes #21683 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21683 from trevor-m:nvshmem-upstream-2 fd15a7c PiperOrigin-RevId: 740701134
1 parent ad59fdf commit 5a9082e

24 files changed

+631
-478
lines changed

xla/backends/gpu/collectives/BUILD

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,23 @@ package_group(
1818
],
1919
)
2020

21+
config_setting(
22+
name = "arm_build",
23+
values = {"cpu": "arm"},
24+
)
25+
2126
# Build target that registers all available GPU collectives implementations with the collectives
2227
# registry at link time.
2328
cc_library(
2429
name = "gpu_collectives_plugin",
2530
deps = [
2631
":gpu_collectives_stub",
2732
":nccl_collectives",
28-
],
33+
] + select({
34+
# TODO(b/409709288): Fix nvshmem ARM issues and remove this condition.
35+
":arm_build": [],
36+
"//conditions:default": [":nvshmem_collectives"],
37+
}),
2938
)
3039

3140
cc_library(
@@ -132,6 +141,7 @@ cc_library(
132141
srcs = ["gpu_collectives.cc"],
133142
hdrs = ["gpu_collectives.h"],
134143
deps = [
144+
"//xla:executable_run_options",
135145
"//xla:shape_util",
136146
"//xla:util",
137147
"//xla:xla_data_proto_cc",
@@ -140,10 +150,12 @@ cc_library(
140150
"//xla/core/collectives:clique_key",
141151
"//xla/core/collectives:collectives_registry",
142152
"//xla/core/collectives:communicator",
153+
"//xla/pjrt/distributed:key_value_store_interface",
154+
"//xla/service:global_device_id",
143155
"//xla/stream_executor:device_memory",
144156
"//xla/stream_executor:stream",
145157
"//xla/stream_executor:stream_executor_h",
146-
"//xla/tsl/platform:logging",
158+
"@com_google_absl//absl/container:flat_hash_map",
147159
"@com_google_absl//absl/log",
148160
"@com_google_absl//absl/log:check",
149161
"@com_google_absl//absl/status",
@@ -194,9 +206,11 @@ cc_library(
194206
]),
195207
visibility = ["//visibility:private"],
196208
deps = [
209+
":gpu_clique_key",
197210
":gpu_collectives",
198211
":nccl_communicator",
199212
":nccl_errors",
213+
"//xla:debug_options_flags",
200214
"//xla:status_macros",
201215
"//xla:util",
202216
"//xla/core/collectives",
@@ -205,17 +219,24 @@ cc_library(
205219
"//xla/core/collectives:collectives_registry",
206220
"//xla/core/collectives:communicator",
207221
"//xla/core/collectives:rank_id",
222+
"//xla/pjrt/distributed:key_value_store_interface",
223+
"//xla/service:global_device_id",
224+
"//xla/service/gpu:gpu_executable_run_options",
208225
"//xla/tsl/platform:errors",
209226
"//xla/tsl/platform:logging",
210227
"//xla/tsl/platform:statusor",
211228
"@com_google_absl//absl/algorithm:container",
229+
"@com_google_absl//absl/base:core_headers",
230+
"@com_google_absl//absl/container:flat_hash_map",
212231
"@com_google_absl//absl/status",
213232
"@com_google_absl//absl/status:statusor",
214233
"@com_google_absl//absl/strings",
215234
"@com_google_absl//absl/strings:str_format",
216235
"@com_google_absl//absl/strings:string_view",
236+
"@com_google_absl//absl/synchronization",
217237
"@com_google_absl//absl/types:span",
218238
"@tsl//tsl/platform:casts",
239+
"@tsl//tsl/platform:numbers",
219240
] + if_cuda_is_configured([
220241
"@local_config_nccl//:nccl",
221242
]) + if_rocm_is_configured([
@@ -265,14 +286,15 @@ cc_library(
265286

266287
cc_library(
267288
name = "nvshmem_collectives",
268-
srcs = ["nvshmem_collectives.cc"],
269-
hdrs = ["nvshmem_collectives.h"],
289+
srcs = if_cuda_is_configured(["nvshmem_collectives.cc"]),
290+
hdrs = if_cuda_is_configured(["nvshmem_collectives.h"]),
270291
tags = [
271292
"cuda-only",
272293
"gpu",
273294
],
274295
visibility = ["//visibility:private"],
275296
deps = [
297+
":gpu_collectives",
276298
"//xla/core/collectives",
277299
"//xla/core/collectives:clique_id",
278300
"//xla/core/collectives:clique_key",
@@ -295,7 +317,7 @@ cc_library(
295317
"@tsl//tsl/platform:casts",
296318
"@tsl//tsl/platform:numbers",
297319
],
298-
alwayslink = True, # registers collectives implementation
320+
alwayslink = True,
299321
)
300322

301323
xla_cc_test(

xla/backends/gpu/collectives/gpu_collectives.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,18 @@ limitations under the License.
1919
#include <cstddef>
2020
#include <cstdint>
2121
#include <functional>
22+
#include <memory>
2223

24+
#include "absl/container/flat_hash_map.h"
2325
#include "absl/status/status.h"
2426
#include "absl/status/statusor.h"
2527
#include "xla/core/collectives/clique_id.h"
2628
#include "xla/core/collectives/clique_key.h"
2729
#include "xla/core/collectives/collectives.h"
2830
#include "xla/core/collectives/communicator.h"
31+
#include "xla/executable_run_options.h"
32+
#include "xla/pjrt/distributed/key_value_store_interface.h"
33+
#include "xla/service/global_device_id.h"
2934
#include "xla/stream_executor/device_memory.h"
3035
#include "xla/stream_executor/stream.h"
3136
#include "xla/stream_executor/stream_executor.h"
@@ -103,6 +108,23 @@ class GpuCollectives : public Collectives {
103108
// Tries to cast a Collectives::Config to a GpuCollectives::Config.
104109
static absl::StatusOr<const Config*> TryCast(
105110
const Collectives::Config* config);
111+
112+
// TODO(patrios): Use smart wrapper instead of void*.
113+
virtual absl::StatusOr<void*> Allocate(uint64_t bytes) = 0;
114+
115+
virtual absl::Status Deallocate(void* buffer) = 0;
116+
117+
struct Topology {
118+
int32_t node_id;
119+
int32_t num_nodes;
120+
size_t device_count_per_process;
121+
std::shared_ptr<KeyValueStoreInterface> kv_store;
122+
absl::flat_hash_map<GlobalDeviceId, int32_t> device_id_to_node_id;
123+
gpu::GpuExecutableRunOptions* gpu_executable_run_options;
124+
};
125+
126+
// Initializes the topology information for the collectives backend.
127+
virtual absl::Status InitializeTopology(Topology topology) = 0;
106128
};
107129

108130
} // namespace xla::gpu

xla/backends/gpu/collectives/gpu_collectives_stub.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ class GpuCollectivesStub : public GpuCollectives {
6464

6565
absl::Status GroupStart() final { return UnimplementedError(); }
6666
absl::Status GroupEnd() final { return UnimplementedError(); }
67+
absl::StatusOr<void*> Allocate(uint64_t bytes) final {
68+
return UnimplementedError();
69+
}
70+
71+
absl::Status Deallocate(void* buffer) final { return UnimplementedError(); }
72+
73+
absl::Status InitializeTopology(Topology topology) final {
74+
return UnimplementedError();
75+
}
6776

6877
protected:
6978
static absl::Status UnimplementedError() {

xla/backends/gpu/collectives/nccl_collectives.cc

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,21 @@ limitations under the License.
2020
#include <memory>
2121
#include <optional>
2222
#include <string>
23+
#include <utility>
2324
#include <vector>
2425

2526
#include "absl/algorithm/container.h"
27+
#include "absl/base/thread_annotations.h"
28+
#include "absl/container/flat_hash_map.h"
2629
#include "absl/status/status.h"
2730
#include "absl/status/statusor.h"
2831
#include "absl/strings/str_cat.h"
2932
#include "absl/strings/str_format.h"
3033
#include "absl/strings/str_join.h"
3134
#include "absl/strings/string_view.h"
35+
#include "absl/synchronization/mutex.h"
3236
#include "absl/types/span.h"
37+
#include "xla/backends/gpu/collectives/gpu_clique_key.h"
3338
#include "xla/backends/gpu/collectives/gpu_collectives.h"
3439
#include "xla/backends/gpu/collectives/nccl_communicator.h"
3540
#include "xla/backends/gpu/collectives/nccl_errors.h"
@@ -39,12 +44,17 @@ limitations under the License.
3944
#include "xla/core/collectives/collectives_registry.h"
4045
#include "xla/core/collectives/communicator.h"
4146
#include "xla/core/collectives/rank_id.h"
47+
#include "xla/debug_options_flags.h"
48+
#include "xla/pjrt/distributed/key_value_store_interface.h"
49+
#include "xla/service/global_device_id.h"
50+
#include "xla/service/gpu/gpu_executable_run_options.h"
4251
#include "xla/status_macros.h"
4352
#include "xla/tsl/platform/errors.h"
4453
#include "xla/tsl/platform/logging.h"
4554
#include "xla/tsl/platform/statusor.h"
4655
#include "xla/util.h"
4756
#include "tsl/platform/casts.h"
57+
#include "tsl/platform/numbers.h"
4858

4959
#if TENSORFLOW_USE_ROCM
5060
#include "rocm/rocm_config.h"
@@ -227,6 +237,128 @@ absl::Status NcclCollectives::GroupEnd() {
227237
return XLA_NCCL_STATUS(ncclGroupEnd());
228238
}
229239

240+
static absl::StatusOr<xla::gpu::GpuCollectives*> GetNvshmemCollectives() {
241+
TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
242+
xla::CollectivesRegistry::Get("gpu", "nvshmem"));
243+
xla::gpu::GpuCollectives* nvshmem_collectives =
244+
tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
245+
if (nvshmem_collectives == nullptr) {
246+
return absl::InternalError("Failed to get NVSHMEM collectives");
247+
}
248+
249+
return nvshmem_collectives;
250+
}
251+
252+
absl::StatusOr<void*> NcclCollectives::Allocate(uint64_t bytes) {
253+
if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
254+
TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
255+
return nvshmem_collectives->Allocate(bytes);
256+
}
257+
258+
void* ptr = nullptr;
259+
ncclResult_t res = ncclMemAlloc(&ptr, bytes);
260+
if (res != ncclSuccess) {
261+
return absl::InternalError(absl::StrFormat(
262+
"failed to allocate %s (%llu bytes) from device collective memory: %s, "
263+
"Last NCCL warning(error) log entry (may be unrelated): %s",
264+
tsl::strings::HumanReadableNumBytes(bytes), bytes,
265+
ncclGetErrorString(res), ncclGetLastError(nullptr)));
266+
}
267+
VLOG(2) << "Allocated collective memory " << ptr << " of " << bytes
268+
<< " bytes";
269+
return ptr;
270+
}
271+
272+
absl::Status NcclCollectives::Deallocate(void* location) {
273+
if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
274+
TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
275+
return nvshmem_collectives->Deallocate(location);
276+
}
277+
278+
ncclResult_t res = ncclMemFree(location);
279+
if (res != ncclSuccess) {
280+
return absl::InternalError(absl::StrFormat(
281+
"failed to free device collective memory at %p; result: %s, Last NCCL "
282+
"warning(error) log entry (may be unrelated): %s",
283+
location, ncclGetErrorString(res), ncclGetLastError(nullptr)));
284+
}
285+
286+
VLOG(2) << "Deallocated collective memory " << location;
287+
return absl::OkStatus();
288+
}
289+
290+
class NcclIdStore {
291+
public:
292+
NcclIdStore(int node_id,
293+
absl::flat_hash_map<GlobalDeviceId, int> device_to_node,
294+
std::shared_ptr<KeyValueStoreInterface> kv_store)
295+
: node_id_(node_id),
296+
device_to_node_(std::move(device_to_node)),
297+
kv_store_(std::move(kv_store)) {}
298+
299+
absl::StatusOr<CliqueId> GetNcclUniqueId(const CliqueKey& key) {
300+
auto* gpu_key = tsl::down_cast<const gpu::GpuCliqueKey*>(&key);
301+
if (gpu_key == nullptr) {
302+
return InvalidArgument("Expected GPU clique key");
303+
}
304+
305+
// The caller must ensure that threads calling this method concurrently have
306+
// unique keys, otherwise the global key-value store may hold the wrong
307+
// value.
308+
{
309+
absl::MutexLock lock(&mu_);
310+
auto it = cache_.find(*gpu_key);
311+
if (it != cache_.end()) {
312+
return it->second;
313+
}
314+
}
315+
CliqueId clique_id;
316+
int primary_node_id = device_to_node_.at(gpu_key->root_device());
317+
if (node_id_ == primary_node_id) {
318+
TF_ASSIGN_OR_RETURN(
319+
clique_id, gpu::GpuCollectives::Default()->CreateUniqueCliqueId());
320+
TF_RETURN_IF_ERROR(
321+
kv_store_->Set(gpu_key->ToString(), clique_id.ToString()));
322+
} else {
323+
TF_ASSIGN_OR_RETURN(
324+
std::string id_str,
325+
kv_store_->Get(gpu_key->ToString(), absl::Minutes(10)));
326+
clique_id = CliqueId(id_str);
327+
}
328+
absl::MutexLock lock(&mu_);
329+
auto result = cache_.emplace(*gpu_key, std::move(clique_id));
330+
TF_RET_CHECK(result.second) << "Unique ID already in cache.";
331+
return result.first->second;
332+
}
333+
334+
private:
335+
const int node_id_;
336+
const absl::flat_hash_map<GlobalDeviceId, int> device_to_node_;
337+
const std::shared_ptr<KeyValueStoreInterface> kv_store_;
338+
339+
absl::Mutex mu_;
340+
absl::flat_hash_map<gpu::GpuCliqueKey, CliqueId> cache_ ABSL_GUARDED_BY(mu_);
341+
};
342+
343+
absl::Status NcclCollectives::InitializeTopology(
344+
NcclCollectives::Topology topology) {
345+
if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
346+
TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
347+
TF_RETURN_IF_ERROR(nvshmem_collectives->InitializeTopology(topology));
348+
}
349+
350+
if (topology.num_nodes > 1) {
351+
auto nccl_id_store = std::make_shared<NcclIdStore>(
352+
topology.node_id, topology.device_id_to_node_id,
353+
std::move(topology.kv_store));
354+
topology.gpu_executable_run_options->set_clique_id_callback(
355+
[nccl_id_store](const CliqueKey& key) {
356+
return nccl_id_store->GetNcclUniqueId(key);
357+
});
358+
}
359+
return absl::OkStatus();
360+
}
361+
230362
} // namespace xla::gpu
231363

232364
XLA_COLLECTIVES_REGISTER("gpu", "nccl", 1,

xla/backends/gpu/collectives/nccl_collectives.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ class NcclCollectives : public GpuCollectives {
5757
absl::StatusOr<std::vector<std::unique_ptr<Communicator>>> SplitCommunicators(
5858
absl::Span<const Communicator* const> comms, int32_t color,
5959
absl::Span<const RankId> keys, const Collectives::Config& config) final;
60+
61+
absl::StatusOr<void*> Allocate(uint64_t bytes) final;
62+
63+
absl::Status Deallocate(void* location) final;
64+
65+
absl::Status InitializeTopology(Topology topology) final;
6066
};
6167

6268
} // namespace xla::gpu

xla/backends/gpu/collectives/nvshmem_collectives.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ NvshmemCollectives* NvshmemCollectives::Default() {
5757
LOG(FATAL) << "Unsupported collectives implementation for NVSHMEM";
5858
}
5959

60+
absl::Status NvshmemCollectives::InitializeTopology(Topology topology) {
61+
SetEnvInfo(topology.node_id, topology.num_nodes,
62+
topology.device_count_per_process, topology.kv_store);
63+
return absl::OkStatus();
64+
}
65+
6066
void NvshmemCollectives::SetEnvInfo(
6167
int process_id, size_t num_processes, size_t device_count_per_process,
6268
std::weak_ptr<KeyValueStoreInterface> kv_store) {

0 commit comments

Comments (0)