
Commit e514c1f

trevor-m authored and tensorflower-gardener committed
PR tensorflow#21683: [XLA:GPU] NVSHMEM allocation
Imported from GitHub PR openxla/xla#21683

Requires openxla/xla#20395, which adds the NVSHMEM library dependency.

This PR adds the following:
1. An nvshmem flag to enable NVSHMEM.
2. NVSHMEM initialization set up when the GPU PJRT client is created; the first time NVSHMEM is used, it is initialized.
3. NVSHMEM backing for the user buffer memory pool. If NVSHMEM is enabled, the pool is allocated with `nvshmem_malloc`. The same memory can be used for user buffers if NCCL user buffers is also enabled.
4. An updated `CollectiveColorer` so that mosaic_gpu custom calls use the NVSHMEM memory space.

Copybara import of the project:

-- aee33791e16ab2149118de728dbb9e62f5e7cc31 by Trevor Morris <tmorris@nvidia.com>:
Add nvshmem flag, memory allocation, and memory space assignment
Set Nvshmem env info during client creation
Rename flag and use absl::string_view

-- f8fca39300b3915eb6320142f58fa9c0ec7a1eaa by Trevor Morris <tmorris@nvidia.com>:
Use explicit types in test

-- e41faa3f72b778fcf8ea8111d3cde59548b8f9f5 by Trevor Morris <tmorris@nvidia.com>:
Add user buffer allgather and allreduce tests with and without nvshmem alloc
Set nvshmem in XLA_FLAGS
Test fixes
Formatting

-- cf0c36865de8b8a010caaf62c3a36b64e36037bd by Trevor Morris <tmorris@nvidia.com>:
Fixes

-- 3b4d11123cdb794d0a60e65b94d22ded04b7b2b4 by Trevor Morris <tmorris@nvidia.com>:
Remove early dso check

-- 359f2b243ec97b1f8003c27f0b07dde82407ff6c by Trevor Morris <tmorris@nvidia.com>:
Add flag comment

-- fd15a7cac745adc1971bec63e148047b9b811729 by Trevor Morris <tmorris@nvidia.com>:
Also assign memory space for mosaic_gpu_v2

Merging this change closes tensorflow#21683

PiperOrigin-RevId: 747816712
1 parent 0cfdb00 commit e514c1f
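
For reference, a minimal sketch (not part of the commit) of how the new option is meant to combine with NCCL user buffers, per item 3 of the commit message. The flag and setter names are taken from the diffs below; the helper function name is hypothetical. The same effect is available at run time by passing --xla_gpu_experimental_enable_nvshmem=true in XLA_FLAGS, which is what the new test target in xla/pjrt/gpu/BUILD does.

#include "xla/debug_options_flags.h"
#include "xla/xla.pb.h"

// Hypothetical helper: build DebugOptions with NVSHMEM-backed collective
// memory. With both options on, the user-buffer pool is allocated with
// nvshmem_malloc and the same memory can serve as NCCL user buffers.
xla::DebugOptions MakeNvshmemDebugOptions() {
  xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
  opts.set_xla_gpu_experimental_enable_nvshmem(true);
  // Optional: reuse the pool for NCCL user buffers. Per the flag help text,
  // collective_memory_size in the allocator config must then be non-zero.
  opts.set_xla_gpu_enable_nccl_user_buffers(true);
  return opts;
}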

11 files changed: 425 additions, 26 deletions


third_party/xla/xla/backends/gpu/collectives/BUILD

Lines changed: 16 additions & 10 deletions
@@ -18,13 +18,22 @@ package_group(
     ],
 )
 
+config_setting(
+    name = "arm_build",
+    values = {"cpu": "arm"},
+)
+
 # Build target that registers all available GPU collectives implementations with the collectives
 # registry at link time.
 cc_library(
     name = "gpu_collectives_plugin",
     deps = [
         ":gpu_collectives_stub",
-    ] + if_nccl([":nccl_collectives"]),
+    ] + if_nccl([":nccl_collectives"]) + select({
+        # TODO(b/409709288): Fix nvshmem ARM issues and remove this condition.
+        ":arm_build": [],
+        "//conditions:default": [":nvshmem_collectives"],
+    }),
 )
 
 cc_library(
@@ -222,6 +231,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
+        "@local_tsl//tsl/platform:numbers",
     ] + if_cuda_is_configured([
         "@local_config_nccl//:nccl",
     ]) + if_rocm_is_configured([
@@ -271,14 +281,11 @@ cc_library(
 
 cc_library(
     name = "nvshmem_collectives",
-    srcs = ["nvshmem_collectives.cc"],
-    hdrs = ["nvshmem_collectives.h"],
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
+    srcs = if_cuda_is_configured(["nvshmem_collectives.cc"]),
+    hdrs = if_cuda_is_configured(["nvshmem_collectives.h"]),
     visibility = ["//visibility:private"],
     deps = [
+        ":gpu_collectives",
        "//xla/core/collectives",
        "//xla/core/collectives:clique_id",
        "//xla/core/collectives:clique_key",
@@ -299,9 +306,8 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:numbers",
-        "@nvshmem//:nvshmem_lib",
-    ],
-    alwayslink = True,  # registers collectives implementation
+    ] + if_cuda_is_configured(["@nvshmem//:nvshmem_lib"]),
+    alwayslink = True,
 )
 
 xla_cc_test(

third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc

Lines changed: 29 additions & 0 deletions
@@ -44,6 +44,7 @@ limitations under the License.
 #include "xla/core/collectives/collectives_registry.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
+#include "xla/debug_options_flags.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
@@ -53,6 +54,7 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
+#include "tsl/platform/numbers.h"
 
 #if TENSORFLOW_USE_ROCM
 #include "rocm/rocm_config.h"
@@ -235,7 +237,24 @@ absl::Status NcclCollectives::GroupEnd() {
   return XLA_NCCL_STATUS(ncclGroupEnd());
 }
 
+static absl::StatusOr<xla::gpu::GpuCollectives*> GetNvshmemCollectives() {
+  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
+                      xla::CollectivesRegistry::Get("gpu", "nvshmem"));
+  xla::gpu::GpuCollectives* nvshmem_collectives =
+      tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
+  if (nvshmem_collectives == nullptr) {
+    return absl::InternalError("Failed to get NVSHMEM collectives");
+  }
+
+  return nvshmem_collectives;
+}
+
 absl::StatusOr<void*> NcclCollectives::Allocate(uint64_t bytes) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    return nvshmem_collectives->Allocate(bytes);
+  }
+
   void* ptr = nullptr;
   ncclResult_t res = ncclMemAlloc(&ptr, bytes);
   if (res != ncclSuccess) {
@@ -251,6 +270,11 @@ absl::StatusOr<void*> NcclCollectives::Allocate(uint64_t bytes) {
 }
 
 absl::Status NcclCollectives::Deallocate(void* location) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    return nvshmem_collectives->Deallocate(location);
+  }
+
   ncclResult_t res = ncclMemFree(location);
   if (res != ncclSuccess) {
     return absl::InternalError(absl::StrFormat(
@@ -318,6 +342,11 @@ class NcclIdStore {
 
 absl::Status NcclCollectives::InitializeTopology(
     NcclCollectives::Topology topology) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    TF_RETURN_IF_ERROR(nvshmem_collectives->InitializeTopology(topology));
+  }
+
   if (topology.num_nodes > 1) {
     auto nccl_id_store = std::make_shared<NcclIdStore>(
         topology.node_id, topology.device_id_to_node_id,
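
To make the new dispatch easier to follow, here is a self-contained sketch of the same registry lookup this file's diff performs. The wrapper function name is hypothetical; the lookup, cast, and Allocate call mirror GetNvshmemCollectives() and NcclCollectives::Allocate() above.

#include <cstdint>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "xla/backends/gpu/collectives/gpu_collectives.h"
#include "xla/core/collectives/collectives.h"
#include "xla/core/collectives/collectives_registry.h"
#include "xla/tsl/platform/statusor.h"
#include "tsl/platform/casts.h"

// Hypothetical helper: fetch the registered NVSHMEM implementation and
// allocate `bytes` from its symmetric heap (nvshmem_malloc under the hood).
absl::StatusOr<void*> AllocateNvshmemBuffer(uint64_t bytes) {
  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
                      xla::CollectivesRegistry::Get("gpu", "nvshmem"));
  auto* nvshmem = tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
  if (nvshmem == nullptr) {
    return absl::InternalError("NVSHMEM collectives not registered");
  }
  return nvshmem->Allocate(bytes);
}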

third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc

Lines changed: 6 additions & 0 deletions
@@ -57,6 +57,12 @@ NvshmemCollectives* NvshmemCollectives::Default() {
   LOG(FATAL) << "Unsupported collectives implementation for NVSHMEM";
 }
 
+absl::Status NvshmemCollectives::InitializeTopology(Topology topology) {
+  SetEnvInfo(topology.node_id, topology.num_nodes,
+             topology.device_count_per_process, topology.kv_store);
+  return absl::OkStatus();
+}
+
 void NvshmemCollectives::SetEnvInfo(
     int process_id, size_t num_processes, size_t device_count_per_process,
     std::weak_ptr<KeyValueStoreInterface> kv_store) {

third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h

Lines changed: 24 additions & 5 deletions
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/core/collectives/clique_id.h"
 #include "xla/core/collectives/clique_key.h"
 #include "xla/core/collectives/collectives.h"
@@ -35,7 +36,7 @@ limitations under the License.
 namespace xla::gpu {
 
 // NVIDIA NVSHMEM library
-class NvshmemCollectives : public Collectives {
+class NvshmemCollectives : public GpuCollectives {
  public:
   ~NvshmemCollectives() override;
 
@@ -45,28 +46,46 @@ class NvshmemCollectives : public Collectives {
                          size_t device_count_per_process,
                          std::weak_ptr<KeyValueStoreInterface> kv_store);
 
-  absl::StatusOr<void*> Allocate(uint64_t bytes);
+  absl::StatusOr<void*> Allocate(uint64_t bytes) final;
 
-  absl::Status Deallocate(void* buffer);
+  absl::Status Deallocate(void* buffer) final;
 
   absl::StatusOr<CliqueId> CreateUniqueCliqueId() const final {
     return absl::UnimplementedError("Not implemented.");
   }
 
+  absl::Status GroupStart() final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+  absl::Status GroupEnd() final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
+  bool IsImplemented() const final { return true; }
+
+  bool IsGlobalConfig() const final { return false; }
+
+  absl::StatusOr<const CliqueIdCallback*> GetCliqueIdCallback(
+      const CliqueIdCallback* clique_id_callback, bool is_local) final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
   absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
   CreateCommunicators(const CliqueKey& clique_key,
                       const std::optional<CliqueIds>& clique_ids,
                       absl::Span<const DeviceRank> ranks,
-                      const Config& config) final {
+                      const Collectives::Config& config) {
     return absl::UnimplementedError("Not implemented.");
   }
 
   absl::StatusOr<std::vector<std::unique_ptr<Communicator>>> SplitCommunicators(
       absl::Span<const Communicator* const> comms, int32_t color,
-      absl::Span<const RankId> keys, const Config& config) final {
+      absl::Span<const RankId> keys, const Collectives::Config& config) final {
     return absl::UnimplementedError("Not implemented.");
  }
 
+  absl::Status InitializeTopology(Topology topology) final;
+
  private:
   absl::Status InitializeOnce();
 

third_party/xla/xla/debug_options_flags.cc

Lines changed: 6 additions & 0 deletions
@@ -167,6 +167,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_nccl_termination_timeout_seconds(-1);
   opts.set_xla_gpu_enable_shared_constants(true);
   opts.set_xla_gpu_enable_nccl_user_buffers(false);
+  opts.set_xla_gpu_experimental_enable_nvshmem(false);
   opts.set_xla_gpu_enable_nccl_comm_splitting(true);
   opts.set_xla_gpu_nccl_init_max_rank_per_root_ratio(0);
 
@@ -1581,6 +1582,11 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "Enables NCCL User Buffer Registration. collective_memory_size in the "
       "allocator config must also be set to a non-zero value that is large "
       "enough to meet peak collective memory usage."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_experimental_enable_nvshmem",
+      bool_setter_for(&DebugOptions::set_xla_gpu_experimental_enable_nvshmem),
+      debug_options->xla_gpu_experimental_enable_nvshmem(),
+      "Enables NVSHMEM."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_temp_buffer_use_separate_color",
       bool_setter_for(

third_party/xla/xla/pjrt/gpu/BUILD

Lines changed: 50 additions & 0 deletions
@@ -237,6 +237,56 @@ xla_test(
     ],
 )
 
+# TODO(b/409713313): Move this test to collectives directory.
+xla_test(
+    name = "se_gpu_pjrt_client_nvshmem_test",
+    srcs = ["se_gpu_pjrt_client_nvshmem_test.cc"],
+    backend_tags = {"gpu": [
+        "multi_gpu_h100",
+        "no_oss",
+        "noasan",
+        "notap",  # TODO(b/399931591): Re-enable once flakiness is resolved.
+        "nomsan",
+    ]},
+    backends = ["gpu"],
+    env = {
+        "XLA_FLAGS": "--xla_gpu_experimental_enable_nvshmem=true",
+    },
+    deps = [
+        ":gpu_topology_proto_cc",
+        ":se_gpu_pjrt_client",
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/backends/gpu/collectives:gpu_collectives",
+        "//xla/ffi",
+        "//xla/ffi:ffi_api",
+        "//xla/hlo/builder:xla_computation",
+        "//xla/hlo/parser:hlo_parser",
+        "//xla/hlo/testlib:test",
+        "//xla/hlo/utils:hlo_query",
+        "//xla/pjrt:pjrt_client",
+        "//xla/pjrt:pjrt_compiler",
+        "//xla/pjrt:pjrt_executable",
+        "//xla/pjrt:raw_buffer",
+        "//xla/pjrt/distributed",
+        "//xla/pjrt/distributed:client",
+        "//xla/pjrt/distributed:in_memory_key_value_store",
+        "//xla/pjrt/distributed:service",
+        "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
+        "//xla/service:platform_util",
+        "//xla/tests:literal_test_util",
+        "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 xla_test(
     name = "pjrt_client_test_se_gpu",
     srcs = ["pjrt_client_test_se_gpu.cc"],
