openxla
diff --git a/‎third_party/tsl/third_party/nvshmem/BUILD‎ b/‎third_party/tsl/third_party/nvshmem/BUILD‎
diff --git a/‎third_party/tsl/third_party/nvshmem/nvshmem.BUILD‎
Lines changed: 102 additions & 0 deletions b/‎third_party/tsl/third_party/nvshmem/nvshmem.BUILD‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎third_party/tsl/third_party/nvshmem/workspace.bzl‎
Lines changed: 12 additions & 0 deletions b/‎third_party/tsl/third_party/nvshmem/workspace.bzl‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎third_party/tsl/workspace2.bzl‎
Lines changed: 2 additions & 0 deletions b/‎third_party/tsl/workspace2.bzl‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xla/service/gpu/runtime/BUILD‎
Lines changed: 22 additions & 0 deletions b/‎xla/service/gpu/runtime/BUILD‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎xla/service/gpu/runtime/nvshmem_api.cc‎
Lines changed: 155 additions & 0 deletions b/‎xla/service/gpu/runtime/nvshmem_api.cc‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎xla/service/gpu/runtime/nvshmem_api.h‎
Lines changed: 69 additions & 0 deletions b/‎xla/service/gpu/runtime/nvshmem_api.h‎
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,102 @@
+# NVSHMEM
+
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@bazel_skylib//rules:write_file.bzl", "write_file")
+
+# Substitutions applied to nvshmem_build_options.h.in: each CMake
+# `#cmakedefine` placeholder is replaced by either a concrete `#define` (option
+# enabled) or a commented-out `#undef` (option disabled), standing in for the
+# configure step CMake would otherwise run.
+# NOTE(review): NVSHMEM_IBGDA_SUPPORT is a prefix of
+# NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY and the longer key is listed first;
+# presumably entry order matters if substitutions are applied sequentially -
+# confirm before reordering.
+options_substitions = {
+    "#cmakedefine NVSHMEM_COMPLEX_SUPPORT": "/* #undef NVSHMEM_COMPLEX_SUPPORT */",
+    "#cmakedefine NVSHMEM_DEBUG": "/* #undef NVSHMEM_DEBUG */",
+    "#cmakedefine NVSHMEM_DEVEL": "/* #undef NVSHMEM_DEVEL */",
+    "#cmakedefine NVSHMEM_TRACE": "/* #undef NVSHMEM_TRACE */",
+    "#cmakedefine NVSHMEM_DEFAULT_PMI2": "/* #undef NVSHMEM_DEFAULT_PMI2 */",
+    "#cmakedefine NVSHMEM_DEFAULT_PMIX": "/* #undef NVSHMEM_DEFAULT_PMIX */",
+    "#cmakedefine NVSHMEM_DEFAULT_UCX": "/* #undef NVSHMEM_DEFAULT_UCX */",
+    "#cmakedefine NVSHMEM_DISABLE_COLL_POLL": "#define NVSHMEM_DISABLE_COLL_POLL",
+    "#cmakedefine NVSHMEM_GPU_COLL_USE_LDST": "/* #undef NVSHMEM_GPU_COLL_USE_LDST */",
+    "#cmakedefine NVSHMEM_IBDEVX_SUPPORT": "/* #undef NVSHMEM_IBDEVX_SUPPORT */",
+    "#cmakedefine NVSHMEM_IBRC_SUPPORT": "#define NVSHMEM_IBRC_SUPPORT",
+    "#cmakedefine NVSHMEM_LIBFABRIC_SUPPORT": "/* #undef NVSHMEM_LIBFABRIC_SUPPORT */",
+    "#cmakedefine NVSHMEM_MPI_SUPPORT": "/* #undef NVSHMEM_MPI_SUPPORT */",
+    "#cmakedefine NVSHMEM_NVTX": "#define NVSHMEM_NVTX",
+    "#cmakedefine NVSHMEM_PMIX_SUPPORT": "/* #undef NVSHMEM_PMIX_SUPPORT */",
+    "#cmakedefine NVSHMEM_SHMEM_SUPPORT": "/* #undef NVSHMEM_SHMEM_SUPPORT */",
+    "#cmakedefine NVSHMEM_TEST_STATIC_LIB": "/* #undef NVSHMEM_TEST_STATIC_LIB */",
+    "#cmakedefine NVSHMEM_TIMEOUT_DEVICE_POLLING": "/* #undef NVSHMEM_TIMEOUT_DEVICE_POLLING */",
+    "#cmakedefine NVSHMEM_UCX_SUPPORT": "/* #undef NVSHMEM_UCX_SUPPORT */",
+    "#cmakedefine NVSHMEM_USE_DLMALLOC": "/* #undef NVSHMEM_USE_DLMALLOC */",
+    "#cmakedefine NVSHMEM_USE_NCCL": "/* #undef NVSHMEM_USE_NCCL */",
+    "#cmakedefine NVSHMEM_USE_GDRCOPY": "/* #undef NVSHMEM_USE_GDRCOPY */",
+    "#cmakedefine NVSHMEM_VERBOSE": "/* #undef NVSHMEM_VERBOSE */",
+    "#cmakedefine NVSHMEM_BUILD_TESTS": "#define NVSHMEM_BUILD_TESTS",
+    "#cmakedefine NVSHMEM_BUILD_EXAMPLES": "#define NVSHMEM_BUILD_EXAMPLES",
+    "#cmakedefine NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY": "/* #undef NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY */",
+    "#cmakedefine NVSHMEM_IBGDA_SUPPORT": "/* #undef NVSHMEM_IBGDA_SUPPORT */",
+    "#cmakedefine NVSHMEM_ENABLE_ALL_DEVICE_INLINING": "/* #undef NVSHMEM_ENABLE_ALL_DEVICE_INLINING */",
+}
+
+# Generates nvshmem_build_options.h from its `.h.in` template by applying the
+# `#cmakedefine` substitutions in `options_substitions`.
+expand_template(
+    name = "nvshmem_build_options_h",
+    out = "src/include/non_abi/nvshmem_build_options.h",
+    substitutions = options_substitions,
+    template = "src/include/non_abi/nvshmem_build_options.h.in",
+)
+
+# Major version of the vendored NVSHMEM sources. It is also written into
+# nvshmem_config.h as TF_NVSHMEM_VERSION further down in this file.
+NVSHMEM_MAJOR = 3
+
+# Values for the `@...@` placeholders in nvshmem_version.h.in. The project
+# version spelled out here is 3.0.6 with tweak 4; the transport, bootstrap and
+# interlib versions are all 3.0.0.
+# NOTE(review): these presumably have to be kept in sync by hand with the
+# archive pinned in workspace.bzl (nvshmem_src_3.0.6-4) - confirm on upgrade.
+version_substitions = {
+    "@PROJECT_VERSION_MAJOR@": str(NVSHMEM_MAJOR),
+    "@PROJECT_VERSION_MINOR@": "0",
+    "@PROJECT_VERSION_PATCH@": "6",
+    "@PROJECT_VERSION_TWEAK@": "4",
+    "@TRANSPORT_VERSION_MAJOR@": "3",
+    "@TRANSPORT_VERSION_MINOR@": "0",
+    "@TRANSPORT_VERSION_PATCH@": "0",
+    "@BOOTSTRAP_VERSION_MAJOR@": "3",
+    "@BOOTSTRAP_VERSION_MINOR@": "0",
+    "@BOOTSTRAP_VERSION_PATCH@": "0",
+    "@INTERLIB_VERSION_MAJOR@": "3",
+    "@INTERLIB_VERSION_MINOR@": "0",
+    "@INTERLIB_VERSION_PATCH@": "0",
+    "@INFO_BUILD_VARS@": "",
+}
+
+# Generates nvshmem_version.h from its `.h.in` template by filling in the
+# `@...@` version placeholders from `version_substitions`.
+expand_template(
+    name = "nvshmem_version_h",
+    out = "src/include/non_abi/nvshmem_version.h",
+    substitutions = version_substitions,
+    template = "src/include/non_abi/nvshmem_version.h.in",
+)
+
+# Header-only view of NVSHMEM: everything under src/include plus the two
+# generated headers above. `strip_include_prefix` together with
+# `include_prefix` exposes the headers as "third_party/nvshmem/<header>".
+# No sources are compiled here; the only dependency is the nvshmem_stub target.
+cc_library(
+    name = "nvshmem",
+    hdrs = glob([
+        "src/include/**",
+    ]) + [
+        ":nvshmem_build_options_h",
+        ":nvshmem_version_h",
+    ],
+    includes = ["src/include"],
+    include_prefix = "third_party/nvshmem",
+    strip_include_prefix = "src/include",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@xla//xla/tsl/cuda:nvshmem_stub",
+    ],
+)
+
+# This additional header allows us to determine the configured NVSHMEM version
+# without including the rest of NVSHMEM. It defines TF_NVSHMEM_VERSION as a
+# string literal that holds only the major version (NVSHMEM_MAJOR).
+write_file(
+    name = "nvshmem_config_header",
+    out = "nvshmem_config.h",
+    content = [
+        "#define TF_NVSHMEM_VERSION \"{}\"".format(NVSHMEM_MAJOR),
+    ],
+)
+
+# Exposes the generated nvshmem_config.h as
+# "third_party/nvshmem/nvshmem_config.h" without pulling in any NVSHMEM
+# headers.
+cc_library(
+    name = "nvshmem_config",
+    hdrs = ["nvshmem_config.h"],
+    include_prefix = "third_party/nvshmem",
+    visibility = ["//visibility:public"],
+)
@@ -0,0 +1,12 @@
+"""NVSHMEM - NVIDIA Shared Memory"""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+# Declares the external `@nvshmem` repository: fetches the NVSHMEM 3.0.6-4
+# source archive (verified against the pinned sha256) and builds it with
+# //third_party/nvshmem:nvshmem.BUILD.
+def repo():
+    tf_http_archive(
+        name = "nvshmem",
+        strip_prefix = "nvshmem_src_3.0.6-4",
+        sha256 = "4f435fdee320a365dd19d24b9f74df69b69886d3902ec99b16b553d485b18871",
+        urls = tf_mirror_urls("https://developer.download.nvidia.com/compute/redist/nvshmem/3.0.6/source/nvshmem_src_3.0.6-4.txz"),
+        build_file = "//third_party/nvshmem:nvshmem.BUILD",
+    )
@@ -23,6 +23,7 @@ load("//third_party/hwloc:workspace.bzl", hwloc = "repo")
 load("//third_party/implib_so:workspace.bzl", implib_so = "repo")
 load("//third_party/llvm:setup.bzl", "llvm_setup")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
+load("//third_party/nvshmem:workspace.bzl", nvshmem = "repo")
 load("//third_party/py:python_configure.bzl", "python_configure")
 load("//third_party/py/ml_dtypes:workspace.bzl", ml_dtypes = "repo")
 load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo")
@@ -50,6 +51,7 @@ def _initialize_third_party():
     implib_so()
     ml_dtypes()
     nasm()
+    nvshmem()
     pybind11_abseil()
     pybind11_bazel()
     tensorrt()
 
@@ -188,6 +188,28 @@ xla_test(
     ],
 )
 
+#===-------------------------------------------------------------------------------------------===//
+# NVSHMEM Integration
+#===-------------------------------------------------------------------------------------------===//
+
+# Thin C++ wrapper around the NVSHMEM host API (initialization and
+# allocation). The CUDA headers and NVSHMEM itself are only added as
+# dependencies when CUDA is configured.
+# NOTE(review): nvshmem_api.cc includes the NVSHMEM headers unconditionally, so
+# this target presumably only compiles in CUDA builds - confirm whether it
+# needs a CUDA-only tag.
+cc_library(
+    name = "nvshmem_api",
+    srcs = ["nvshmem_api.cc"],
+    hdrs = ["nvshmem_api.h"],
+    deps = [
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+        "@tsl//tsl/platform:errors",
+        "@tsl//tsl/platform:logging",
+        "@tsl//tsl/platform:numbers",
+        "@tsl//tsl/platform:statusor",
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@nvshmem//:nvshmem",
+    ]),
+)
+
 #===-------------------------------------------------------------------------------------------===//
 # XLA Thunks Runtime
 #===-------------------------------------------------------------------------------------------===//
 
@@ -0,0 +1,155 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/runtime/nvshmem_api.h"
+
+#include "absl/strings/str_format.h"
+#include "tsl/platform/logging.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/numbers.h"
+#include "tsl/platform/statusor.h"
+#include "third_party/nvshmem/nvshmem.h"
+#include "third_party/nvshmem/nvshmemx.h"
+
+namespace xla::gpu {
+
+//==-----------------------------------------------------------------------===//
+// Macros to return or warn on NVSHMEM errors.
+//==-----------------------------------------------------------------------===//
+
+// Maps an NVSHMEM return code to an absl::Status. A code of 0 is treated as
+// success; any other value becomes an InternalError that records the call
+// site (`file`:`line`) and the text of the failing expression (`expr`).
+static absl::Status NvshmemToStatus(int s, const char* file, int64_t line,
+                                    const char* expr) {
+  const bool failed = (s != 0);
+  if (failed) {
+    return absl::InternalError(absl::StrFormat(
+        "%s:%d: NVSHMEM operation %s failed."
+        " For extra logging, rerun with 'NVSHMEM_DEBUG=INFO'.",
+        file, line, expr));
+  }
+  return absl::OkStatus();
+}
+
+// Evaluates the NVSHMEM call `expr` and converts its return code into an
+// absl::Status annotated with the call site and the stringified expression.
+#define XLA_NVSHMEM_STATUS(expr) \
+  xla::gpu::NvshmemToStatus(expr, __FILE__, __LINE__, #expr)
+
+// Returns the error status from the enclosing function when `expr` fails.
+// The macro-local variable has a deliberately unusual name so that `expr`
+// cannot accidentally refer to it (e.g. a caller variable named `s`).
+#define XLA_NVSHMEM_RETURN_IF_ERROR(expr)                        \
+  do {                                                           \
+    absl::Status xla_nvshmem_status_ = XLA_NVSHMEM_STATUS(expr); \
+    if (!xla_nvshmem_status_.ok()) {                             \
+      return xla_nvshmem_status_;                                \
+    }                                                            \
+  } while (0)
+
+// Logs the error status at ERROR severity when `expr` fails and continues.
+#define XLA_NVSHMEM_LOG_IF_ERROR(expr)                           \
+  do {                                                           \
+    absl::Status xla_nvshmem_status_ = XLA_NVSHMEM_STATUS(expr); \
+    if (!xla_nvshmem_status_.ok()) {                             \
+      LOG(ERROR) << xla_nvshmem_status_.ToString();              \
+    }                                                            \
+  } while (0)
+
+// CHECK-fails when `expr` does not succeed.
+#define XLA_NVSHMEM_CHECK(expr) CHECK(XLA_NVSHMEM_STATUS(expr).ok())
+
+// Out-of-class definitions of the static environment state. A process id of
+// -1 marks "SetEnvInfo has not been called yet"; the constructor checks for
+// it. Both key-value store callbacks start out empty.
+int NvshmemApi::process_id_{-1};
+size_t NvshmemApi::num_processes_{0};
+size_t NvshmemApi::device_count_per_process_{0};
+std::function<absl::StatusOr<std::string>(std::string_view)>
+    NvshmemApi::kv_store_get_{nullptr};
+std::function<absl::Status(std::string_view, std::string_view)>
+    NvshmemApi::kv_store_set_{nullptr};
+
+// Returns the process-wide NvshmemApi. The object is a function-local static,
+// so it is constructed (and NVSHMEM initialized by its constructor) on the
+// first call.
+NvshmemApi& NvshmemApi::Default() {
+  static NvshmemApi singleton;
+  return singleton;
+}
+
+// Records the process topology and the key-value store callbacks that the
+// constructor and Initialize() read later. Only stores the values; nothing is
+// validated or initialized here.
+void NvshmemApi::SetEnvInfo(
+    int process_id, size_t num_processes, size_t device_count_per_process,
+    std::function<absl::StatusOr<std::string>(std::string_view)> kv_store_get,
+    std::function<absl::Status(std::string_view, std::string_view)>
+        kv_store_set) {
+  // Key-value store accessors used to exchange the NVSHMEM unique id.
+  kv_store_set_ = kv_store_set;
+  kv_store_get_ = kv_store_get;
+
+  // Process topology.
+  device_count_per_process_ = device_count_per_process;
+  num_processes_ = num_processes;
+  process_id_ = process_id;
+}
+
+// Validates the environment recorded by SetEnvInfo() and initializes NVSHMEM.
+// Any failure is fatal: the process aborts with a log message.
+NvshmemApi::NvshmemApi() {
+  // Initialize NVSHMEM here: the only construction site is the function-local
+  // static in Default(), so this runs once per process.
+  if (process_id_ == -1) {
+    LOG(FATAL)
+        << "NvshmemApi::SetEnvInfo was not called before using NVSHMEM API";
+  }
+  if (device_count_per_process_ != 1) {
+    LOG(FATAL) << "NVSHMEM API is only supported with one device per process";
+  }
+  // Keep the status around so the abort message says why initialization
+  // failed instead of only reporting that the check was false.
+  const absl::Status init_status = Initialize();
+  CHECK(init_status.ok()) << init_status.ToString();
+}
+
+// Shuts down the NVSHMEM host library when the singleton is destroyed.
+NvshmemApi::~NvshmemApi() {
+  VLOG(3) << absl::StreamFormat(
+      "Finalizing NVSHMEM on process %d; num_processes=%llu", process_id_,
+      num_processes_);
+  nvshmemx_hostlib_finalize();
+}
+
+// Bootstraps NVSHMEM across all processes using a unique id.
+//
+// Process 0 creates the unique id and publishes it under `kv_store_key_`;
+// every other process reads it back from the key-value store. All processes
+// then initialize the NVSHMEM host library with that id.
+//
+// Returns an error if a key-value store callback is missing or fails, if the
+// stored id has an unexpected size, or if any NVSHMEM call reports failure.
+absl::Status NvshmemApi::Initialize() {
+  // Start from NVSHMEM's initializer macros so that no field is left
+  // indeterminate before the library fills the structures in.
+  nvshmemx_init_attr_t nvshmem_init_attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
+  nvshmemx_uniqueid_t nvshmem_id = NVSHMEMX_UNIQUEID_INITIALIZER;
+
+  if (process_id_ == 0) {
+    if (!kv_store_set_) {
+      return absl::FailedPreconditionError(
+          "NVSHMEM key-value store setter was not provided to SetEnvInfo");
+    }
+    XLA_NVSHMEM_RETURN_IF_ERROR(nvshmemx_get_uniqueid(&nvshmem_id));
+    // The id is exchanged as its raw bytes.
+    std::string_view nvshmem_id_str(reinterpret_cast<char*>(&nvshmem_id),
+                                    sizeof(nvshmemx_uniqueid_t));
+    TF_RETURN_IF_ERROR(kv_store_set_(kv_store_key_, nvshmem_id_str));
+  } else {
+    if (!kv_store_get_) {
+      return absl::FailedPreconditionError(
+          "NVSHMEM key-value store getter was not provided to SetEnvInfo");
+    }
+    TF_ASSIGN_OR_RETURN(std::string id_str, kv_store_get_(kv_store_key_));
+    // Never copy more bytes than the store actually returned: a truncated or
+    // foreign value would otherwise be read past its end.
+    if (id_str.size() != sizeof(nvshmemx_uniqueid_t)) {
+      return absl::InternalError(absl::StrFormat(
+          "NVSHMEM unique id read from the key-value store has %d bytes; "
+          "expected %d bytes",
+          id_str.size(), sizeof(nvshmemx_uniqueid_t)));
+    }
+    std::copy(id_str.data(), id_str.data() + sizeof(nvshmemx_uniqueid_t),
+              reinterpret_cast<char*>(&nvshmem_id));
+  }
+
+  XLA_NVSHMEM_RETURN_IF_ERROR(nvshmemx_set_attr_uniqueid_args(
+      process_id_, num_processes_, &nvshmem_id, &nvshmem_init_attr));
+  XLA_NVSHMEM_RETURN_IF_ERROR(nvshmemx_hostlib_init_attr(
+      NVSHMEMX_INIT_WITH_UNIQUEID, &nvshmem_init_attr));
+
+  VLOG(3) << absl::StreamFormat(
+      "Initialized NVSHMEM on process %d; num_processes=%llu", process_id_,
+      num_processes_);
+  return absl::OkStatus();
+}
+
+// Requests `bytes` bytes from nvshmem_malloc. Returns the pointer on success
+// and an InternalError when nvshmem_malloc returns null.
+absl::StatusOr<void*> NvshmemApi::Allocate(uint64_t bytes) {
+  VLOG(3) << absl::StreamFormat(
+      "Start allocation of %s (%llu bytes) for NVSHMEM",
+      tsl::strings::HumanReadableNumBytes(bytes), bytes);
+
+  if (void* allocation = nvshmem_malloc(bytes); allocation != nullptr) {
+    return allocation;
+  }
+
+  return absl::InternalError(absl::StrFormat(
+      "Failed to allocate %s (%llu bytes) from NVSHMEM memory",
+      tsl::strings::HumanReadableNumBytes(bytes), bytes));
+}
+
+// Hands `buffer` back to NVSHMEM via nvshmem_free. Always returns OK, since
+// nvshmem_free gives no result to inspect.
+absl::Status NvshmemApi::Deallocate(void* buffer) {
+  VLOG(3) << absl::StreamFormat(
+      "Start de-allocation for NVSHMEM buffer: %p", buffer);
+
+  nvshmem_free(buffer);
+
+  return absl::OkStatus();
+}
+
+}  // namespace xla::gpu
@@ -0,0 +1,69 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_RUNTIME_NVSHMEM_API_H_
+#define XLA_SERVICE_GPU_RUNTIME_NVSHMEM_API_H_
+
+#include <functional>
+#include <string_view>
+
+#include <cuda.h>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+
+namespace xla::gpu {
+
+//===----------------------------------------------------------------------===//
+// NvshmemApi
+//===----------------------------------------------------------------------===//
+
+// Process-wide access point to the NVSHMEM host library.
+//
+// SetEnvInfo() has to be called before the first Default() call: the
+// constructor aborts if no process id was recorded or if the device count per
+// process is not exactly one, and then initializes NVSHMEM.
+class NvshmemApi {
+ public:
+  // The singleton can be neither copied nor copy-assigned.
+  NvshmemApi(NvshmemApi const&) = delete;
+  void operator=(NvshmemApi const&) = delete;
+
+  // Returns the NvshmemApi singleton of the current process.
+  static NvshmemApi& Default();
+
+  // Stores the process topology and the key-value store callbacks used to
+  // exchange the NVSHMEM unique id between processes during initialization.
+  static void SetEnvInfo(
+      int process_id, size_t num_processes, size_t device_count_per_process,
+      std::function<absl::StatusOr<std::string>(std::string_view)> kv_store_get,
+      std::function<absl::Status(std::string_view, std::string_view)>
+          kv_store_set);
+
+  // Allocates `bytes` bytes through NVSHMEM; returns an error status if the
+  // allocation yields a null pointer.
+  absl::StatusOr<void*> Allocate(uint64_t bytes);
+
+  // Releases a buffer through NVSHMEM.
+  absl::Status Deallocate(void* buffer);
+
+ private:
+  // Construction and destruction are private; instances are only created
+  // through Default().
+  NvshmemApi();
+  ~NvshmemApi();
+
+  // Performs the NVSHMEM unique-id bootstrap; called from the constructor.
+  absl::Status Initialize();
+
+  // Key under which the NVSHMEM unique id is stored in the key-value store.
+  static constexpr char kv_store_key_[] = "nvshmem_global_init";
+
+  // Environment recorded by SetEnvInfo().
+  static int process_id_;
+  static size_t num_processes_;
+  static size_t device_count_per_process_;
+  static std::function<absl::StatusOr<std::string>(std::string_view)>
+      kv_store_get_;
+  static std::function<absl::Status(std::string_view, std::string_view)>
+      kv_store_set_;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_RUNTIME_NVSHMEM_API_H_