
Commit 4483ada

Merge pull request #2681 from ROCm/r1.15-rocm61-bufcomparator-fix
buffer comparator fix
2 parents: fb3cab0 + 88d22d2

5 files changed: +72, -14 lines

tensorflow/compiler/xla/service/gpu/BUILD

Lines changed: 15 additions & 4 deletions

@@ -23,10 +23,7 @@ load(
     "//tensorflow/core/platform:default/cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
-)
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured", "rocm_copts")
 
 package(
     default_visibility = [":friends"],
@@ -1416,6 +1413,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "stream_executor_util_kernel",
+    srcs = ["stream_executor_util_kernel.cu.cc"],
+    tags = ["no_rocm"],
+    copts = rocm_copts(),
+    deps = if_rocm_is_configured([
+        "@local_config_rocm//rocm:rocm_headers",
+    ]) +
+    if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+    ])
+)
+
 cc_library(
     name = "stream_executor_util",
     srcs = ["stream_executor_util.cc"],
@@ -1439,6 +1449,7 @@ cc_library(
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/stream_executor:kernel_spec",
         "//tensorflow/stream_executor:gpu_asm_opts",
+        ":stream_executor_util_kernel",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",

tensorflow/compiler/xla/service/gpu/stream_executor_util.cc

Lines changed: 9 additions & 9 deletions

@@ -414,28 +414,28 @@ static void InitializeTypedBuffer(se::Stream* stream,
     // Nothing more to do
     return;
   }
-#ifdef GOOGLE_CUDA
   // Repeat the host_buffer_size elements at the start of `buf` to the end
   CHECK_EQ(elements_to_fill, buffer.size() / sizeof(T) - host_buffer_size);
   se::StreamExecutor* executor = stream->parent();
-  auto kernel =
-      se::TypedKernelFactory<se::DeviceMemoryBase, int64, int64>::Create(
-          executor, "RepeatBufferKernel", repeat_buffer_kernel::kernel());
+
+  auto kernel =
+      executor->CreateTypedKernel<se::DeviceMemoryBase, int64_t, int64_t>(
+          "RepeatBufferKernel", repeat_buffer_kernel::kernel());
   if (!kernel.ok()) {
     LOG(FATAL) << "Could not create RepeatBufferKernel: " << kernel.status();
   }
   // Launch the kernel with at least host_buffer_bytes threads. Each thread
   // will read one byte of `host_buffer` from the start of `buffer`, where the
   // Memcpy call(s) above put it, and scatter it through the rest of `buffer`.
-  constexpr int64 host_buffer_bytes = host_buffer_size * sizeof(T);
+  constexpr int64_t host_buffer_bytes = host_buffer_size * sizeof(T);
   constexpr int threads_per_block = 256;
   constexpr int blocks_per_grid =
       (host_buffer_bytes + threads_per_block - 1) / threads_per_block;
-  TF_CHECK_OK(stream->ThenLaunch(se::ThreadDim(threads_per_block, 1, 1),
-                                 se::BlockDim(blocks_per_grid, 1, 1), *kernel,
+  stream->ThenLaunch(se::ThreadDim(threads_per_block, 1, 1),
+                     se::BlockDim(blocks_per_grid, 1, 1),
+                     *kernel.ValueOrDie(),
                      buffer, host_buffer_bytes,
-                     static_cast<int64>(buffer.size())));
-#endif
+                     static_cast<int64_t>(buffer.size()));
 }
 
 void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type,
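
Note on the launch dimensions: the ceil division above sizes the grid so that blocks_per_grid * threads_per_block >= host_buffer_bytes, guaranteeing one thread per byte of the already-copied prefix; surplus threads simply fail the kernel's bounds check. A minimal sketch of that arithmetic (the helper name BlocksForWork and the example sizes are hypothetical, not part of the change):

    #include <cstdint>

    // Round work_items up to whole blocks; extra threads in the last block
    // exit early inside the kernel via its bounds check.
    constexpr int64_t BlocksForWork(int64_t work_items, int threads_per_block) {
      return (work_items + threads_per_block - 1) / threads_per_block;
    }

    static_assert(BlocksForWork(1000, 256) == 4, "1000 bytes -> 4 blocks of 256");
    static_assert(BlocksForWork(1024, 256) == 4, "exact multiple -> no extra block");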

tensorflow/compiler/xla/service/gpu/stream_executor_util.h

Lines changed: 4 additions & 0 deletions

@@ -33,6 +33,10 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+namespace repeat_buffer_kernel {
+void* kernel();
+}  // namespace repeat_buffer_kernel
+
 // Returns true if the given StreamExecutor is for a Volta or newer nvidia GPU.
 bool IsVoltaOrLater(const se::StreamExecutor& stream_exec);
 
tensorflow/compiler/xla/service/gpu/stream_executor_util_kernel.cu.cc (new file)

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+
+namespace xla {
+namespace gpu {
+namespace repeat_buffer_kernel {
+
+namespace {
+// Populate the last `buffer_size - repeat_size` bytes of `buffer` by repeating
+// the first `repeat_size` bytes. This should be launched with at least
+// `repeat_size` threads in total.
+__global__ void RepeatBufferKernel(char* buffer, int64_t repeat_size,
+                                   int64_t buffer_size) {
+  int64_t global_index = blockDim.x * blockIdx.x + threadIdx.x;
+  if (global_index >= repeat_size) {
+    return;
+  }
+  const char src_value = buffer[global_index];
+  for (int64_t dst_index = global_index + repeat_size; dst_index < buffer_size;
+       dst_index += repeat_size) {
+    buffer[dst_index] = src_value;
+  }
+}
+}  // namespace
+void* kernel() { return reinterpret_cast<void*>(RepeatBufferKernel); }
+}  // namespace repeat_buffer_kernel
+}  // namespace gpu
+}  // namespace xla
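
To see the effect of this kernel in isolation, here is a small standalone CUDA harness (a sketch only; the demo kernel, names, and sizes are hypothetical and not part of this change, though the scatter pattern mirrors RepeatBufferKernel):

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Same repeat/scatter pattern, reproduced for a standalone demo:
    // thread i copies byte i into every position i + k * repeat_size.
    __global__ void RepeatDemo(char* buffer, int64_t repeat_size,
                               int64_t buffer_size) {
      int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
      if (i >= repeat_size) return;
      const char src = buffer[i];
      for (int64_t dst = i + repeat_size; dst < buffer_size; dst += repeat_size) {
        buffer[dst] = src;
      }
    }

    int main() {
      constexpr int64_t kRepeat = 4, kTotal = 16;
      char host[kTotal] = {'a', 'b', 'c', 'd'};  // only the prefix matters
      char* dev = nullptr;
      cudaMalloc(&dev, kTotal);
      cudaMemcpy(dev, host, kRepeat, cudaMemcpyHostToDevice);  // copy the prefix only
      RepeatDemo<<<1, 256>>>(dev, kRepeat, kTotal);            // one block suffices here
      cudaMemcpy(host, dev, kTotal, cudaMemcpyDeviceToHost);
      cudaFree(dev);
      printf("%.*s\n", static_cast<int>(kTotal), host);        // prints: abcdabcdabcdabcd
      return 0;
    }

After the launch, byte i of the buffer equals byte i % repeat_size of the prefix, which is the postcondition InitializeTypedBuffer relies on when it seeds a large device buffer from a small host-generated chunk.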

third_party/gpus/rocm_configure.bzl

Lines changed: 2 additions & 1 deletion

@@ -191,7 +191,8 @@ def _rocm_include_path(repository_ctx, rocm_config):
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/13.0.0/include")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/14.0.0/include")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/15.0.0/include")
-    inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/15.0.0/include")
+    inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/16.0.0/include")
+    inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/17/include")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/lib/llvm/lib/clang/17/include")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/lib/llvm/lib/clang/18/include")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/lib/llvm/lib/clang/19/include")
