ROCm
diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
Lines changed: 10 additions & 16 deletions
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
Lines changed: 0 additions & 29 deletions
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
Lines changed: 0 additions & 6 deletions
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h
Lines changed: 5 additions & 24 deletions
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
Lines changed: 0 additions & 6 deletions
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
Lines changed: 0 additions & 50 deletions
@@ -18,22 +18,13 @@ package_group(
     ],
 )
 
-config_setting(
-    name = "arm_build",
-    values = {"cpu": "arm"},
-)
-
 # Build target that registers all available GPU collectives implementations with the collectives
 # registry at link time.
 cc_library(
     name = "gpu_collectives_plugin",
     deps = [
         ":gpu_collectives_stub",
-    ] + if_nccl([":nccl_collectives"]) + select({
-        # TODO(b/409709288): Fix nvshmem ARM issues and remove this condition.
-        ":arm_build": [],
-        "//conditions:default": [":nvshmem_collectives"],
-    }),
+    ] + if_nccl([":nccl_collectives"]),
 )
 
 cc_library(
@@ -231,7 +222,6 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
-        "@local_tsl//tsl/platform:numbers",
     ] + if_cuda_is_configured([
         "@local_config_nccl//:nccl",
     ]) + if_rocm_is_configured([
@@ -281,11 +271,14 @@ cc_library(
 
 cc_library(
     name = "nvshmem_collectives",
-    srcs = if_cuda_is_configured(["nvshmem_collectives.cc"]),
-    hdrs = if_cuda_is_configured(["nvshmem_collectives.h"]),
+    srcs = ["nvshmem_collectives.cc"],
+    hdrs = ["nvshmem_collectives.h"],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
     visibility = ["//visibility:private"],
     deps = [
-        ":gpu_collectives",
         "//xla/core/collectives",
         "//xla/core/collectives:clique_id",
         "//xla/core/collectives:clique_key",
@@ -306,8 +299,9 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:numbers",
-    ] + if_cuda_is_configured(["@nvshmem//:nvshmem_lib"]),
-    alwayslink = True,
+        "@nvshmem//:nvshmem_lib",
+    ],
+    alwayslink = True,  # registers collectives implementation
 )
 
 xla_cc_test(
 
@@ -44,7 +44,6 @@ limitations under the License.
 #include "xla/core/collectives/collectives_registry.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
-#include "xla/debug_options_flags.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
@@ -54,7 +53,6 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
-#include "tsl/platform/numbers.h"
 
 #if TENSORFLOW_USE_ROCM
 #include "rocm/rocm_config.h"
@@ -237,24 +235,7 @@ absl::Status NcclCollectives::GroupEnd() {
   return XLA_NCCL_STATUS(ncclGroupEnd());
 }
 
-static absl::StatusOr<xla::gpu::GpuCollectives*> GetNvshmemCollectives() {
-  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
-                      xla::CollectivesRegistry::Get("gpu", "nvshmem"));
-  xla::gpu::GpuCollectives* nvshmem_collectives =
-      tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
-  if (nvshmem_collectives == nullptr) {
-    return absl::InternalError("Failed to get NVSHMEM collectives");
-  }
-
-  return nvshmem_collectives;
-}
-
 absl::StatusOr<void*> NcclCollectives::Allocate(uint64_t bytes) {
-  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
-    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
-    return nvshmem_collectives->Allocate(bytes);
-  }
-
   void* ptr = nullptr;
   ncclResult_t res = ncclMemAlloc(&ptr, bytes);
   if (res != ncclSuccess) {
@@ -270,11 +251,6 @@ absl::StatusOr<void*> NcclCollectives::Allocate(uint64_t bytes) {
 }
 
 absl::Status NcclCollectives::Deallocate(void* location) {
-  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
-    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
-    return nvshmem_collectives->Deallocate(location);
-  }
-
   ncclResult_t res = ncclMemFree(location);
   if (res != ncclSuccess) {
     return absl::InternalError(absl::StrFormat(
@@ -342,11 +318,6 @@ class NcclIdStore {
 
 absl::Status NcclCollectives::InitializeTopology(
     NcclCollectives::Topology topology) {
-  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
-    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
-    TF_RETURN_IF_ERROR(nvshmem_collectives->InitializeTopology(topology));
-  }
-
   if (topology.num_nodes > 1) {
     auto nccl_id_store = std::make_shared<NcclIdStore>(
         topology.node_id, topology.device_id_to_node_id,
 
@@ -57,12 +57,6 @@ NvshmemCollectives* NvshmemCollectives::Default() {
   LOG(FATAL) << "Unsupported collectives implementation for NVSHMEM";
 }
 
-absl::Status NvshmemCollectives::InitializeTopology(Topology topology) {
-  SetEnvInfo(topology.node_id, topology.num_nodes,
-             topology.device_count_per_process, topology.kv_store);
-  return absl::OkStatus();
-}
-
 void NvshmemCollectives::SetEnvInfo(
     int process_id, size_t num_processes, size_t device_count_per_process,
     std::weak_ptr<KeyValueStoreInterface> kv_store) {
 
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
-#include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/core/collectives/clique_id.h"
 #include "xla/core/collectives/clique_key.h"
 #include "xla/core/collectives/collectives.h"
@@ -36,7 +35,7 @@ limitations under the License.
 namespace xla::gpu {
 
 // NVIDIA NVSHMEM library
-class NvshmemCollectives : public GpuCollectives {
+class NvshmemCollectives : public Collectives {
  public:
   ~NvshmemCollectives() override;
 
@@ -46,46 +45,28 @@ class NvshmemCollectives : public GpuCollectives {
                   size_t device_count_per_process,
                   std::weak_ptr<KeyValueStoreInterface> kv_store);
 
-  absl::StatusOr<void*> Allocate(uint64_t bytes) final;
+  absl::StatusOr<void*> Allocate(uint64_t bytes);
 
-  absl::Status Deallocate(void* buffer) final;
+  absl::Status Deallocate(void* buffer);
 
   absl::StatusOr<CliqueId> CreateUniqueCliqueId() const final {
     return absl::UnimplementedError("Not implemented.");
   }
 
-  absl::Status GroupStart() final {
-    return absl::UnimplementedError("Not implemented.");
-  }
-  absl::Status GroupEnd() final {
-    return absl::UnimplementedError("Not implemented.");
-  }
-
-  bool IsImplemented() const final { return true; }
-
-  bool IsGlobalConfig() const final { return false; }
-
-  absl::StatusOr<const CliqueIdCallback*> GetCliqueIdCallback(
-      const CliqueIdCallback* clique_id_callback, bool is_local) final {
-    return absl::UnimplementedError("Not implemented.");
-  }
-
   absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
   CreateCommunicators(const CliqueKey& clique_key,
                       const std::optional<CliqueIds>& clique_ids,
                       absl::Span<const DeviceRank> ranks,
-                      const Collectives::Config& config) {
+                      const Config& config) final {
     return absl::UnimplementedError("Not implemented.");
   }
 
   absl::StatusOr<std::vector<std::unique_ptr<Communicator>>> SplitCommunicators(
       absl::Span<const Communicator* const> comms, int32_t color,
-      absl::Span<const RankId> keys, const Collectives::Config& config) final {
+      absl::Span<const RankId> keys, const Config& config) final {
     return absl::UnimplementedError("Not implemented.");
   }
 
-  absl::Status InitializeTopology(Topology topology) final;
-
  private:
   absl::Status InitializeOnce();
 
 
@@ -167,7 +167,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_nccl_termination_timeout_seconds(-1);
   opts.set_xla_gpu_enable_shared_constants(true);
   opts.set_xla_gpu_enable_nccl_user_buffers(false);
-  opts.set_xla_gpu_experimental_enable_nvshmem(false);
   opts.set_xla_gpu_enable_nccl_comm_splitting(true);
   opts.set_xla_gpu_nccl_init_max_rank_per_root_ratio(0);
 
@@ -1582,11 +1581,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "Enables NCCL User Buffer Registration. collective_memory_size in the "
       "allocator config must also be set to a non-zero value that is large "
       "enough to meet peak collective memory usage."));
-  flag_list->push_back(tsl::Flag(
-      "xla_gpu_experimental_enable_nvshmem",
-      bool_setter_for(&DebugOptions::set_xla_gpu_experimental_enable_nvshmem),
-      debug_options->xla_gpu_experimental_enable_nvshmem(),
-      "Enables NVSHMEM."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_temp_buffer_use_separate_color",
       bool_setter_for(
 
@@ -237,56 +237,6 @@ xla_test(
     ],
 )
 
-# TODO(b/409713313): Move this test to collectives directory.
-xla_test(
-    name = "se_gpu_pjrt_client_nvshmem_test",
-    srcs = ["se_gpu_pjrt_client_nvshmem_test.cc"],
-    backend_tags = {"gpu": [
-        "multi_gpu_h100",
-        "no_oss",
-        "noasan",
-        "notap",  # TODO(b/399931591): Re-enable once flakiness is resolved.
-        "nomsan",
-    ]},
-    backends = ["gpu"],
-    env = {
-        "XLA_FLAGS": "--xla_gpu_experimental_enable_nvshmem=true",
-    },
-    deps = [
-        ":gpu_topology_proto_cc",
-        ":se_gpu_pjrt_client",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
-        "//xla/backends/gpu/collectives:gpu_collectives",
-        "//xla/ffi",
-        "//xla/ffi:ffi_api",
-        "//xla/hlo/builder:xla_computation",
-        "//xla/hlo/parser:hlo_parser",
-        "//xla/hlo/testlib:test",
-        "//xla/hlo/utils:hlo_query",
-        "//xla/pjrt:pjrt_client",
-        "//xla/pjrt:pjrt_compiler",
-        "//xla/pjrt:pjrt_executable",
-        "//xla/pjrt:raw_buffer",
-        "//xla/pjrt/distributed",
-        "//xla/pjrt/distributed:client",
-        "//xla/pjrt/distributed:in_memory_key_value_store",
-        "//xla/pjrt/distributed:service",
-        "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
-        "//xla/service:platform_util",
-        "//xla/tests:literal_test_util",
-        "//xla/tsl/lib/core:status_test_util",
-        "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 xla_test(
     name = "pjrt_client_test_se_gpu",
     srcs = ["pjrt_client_test_se_gpu.cc"],