Resolved conflicts for develop-upstream-sync-231030
root authored and root committed Nov 2, 2023
1 parent abf7b02 commit 7fa9e76
Showing 12 changed files with 5 additions and 188 deletions.
19 changes: 0 additions & 19 deletions third_party/xla/xla/service/bitcast_dtypes_expander_test.cc
@@ -88,24 +88,6 @@ ENTRY main {
EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
// CHECK: HloModule bitcast_to_smaller, entry_computation_layout={(s64[10]{0})->s32[10,2]{1,0}}
// CHECK: %xla.bitcast_convert_s64_10__2_s32_10_2_.17 (a.1: s64[10]) -> s32[10,2] {
<<<<<<< HEAD
// CHECK: %a.1 = s64[10]{0} parameter(0)
// CHECK: %reshape.2 = s64[10,1]{1,0} reshape(s64[10]{0} %a.1)
// CHECK: %broadcast.3 = s64[10,1]{1,0} broadcast(s64[10,1]{1,0} %reshape.2), dimensions={0,1}
// CHECK: %reshape.4 = s64[10]{0} reshape(s64[10,1]{1,0} %broadcast.3)
// CHECK: %broadcast.5 = s64[10,2]{1,0} broadcast(s64[10]{0} %reshape.4), dimensions={0}
// CHECK: %bitcast-convert.6 = u64[10,2]{1,0} bitcast-convert(s64[10,2]{1,0} %broadcast.5)
// CHECK: %constant.8 = u64[] constant(32)
// CHECK: %broadcast.9 = u64[10,2]{1,0} broadcast(u64[] %constant.8), dimensions={}
// CHECK: %iota.7 = u64[10,2]{1,0} iota(), iota_dimension=1
// CHECK: %multiply.10 = u64[10,2]{1,0} multiply(u64[10,2]{1,0} %broadcast.9, u64[10,2]{1,0} %iota.7)
// CHECK: %shift-right-logical{{\.?[0-9]*}} = u64[10,2]{1,0} shift-right-logical(u64[10,2]{1,0} %bitcast-convert.6, u64[10,2]{1,0} %multiply.10)
// CHECK: %constant{{\.?[0-9]*}} = u64[] constant(4294967295)
// CHECK: %broadcast.13 = u64[10,2]{1,0} broadcast(u64[] %constant{{\.?[0-9]*}}), dimensions={}
// CHECK: %and.14 = u64[10,2]{1,0} and(u64[10,2]{1,0} %shift-right-logical{{\.?[0-9]*}}, u64[10,2]{1,0} %broadcast.13)
// CHECK: %convert.15 = u32[10,2]{1,0} convert(u64[10,2]{1,0} %and.14)
// CHECK: ROOT %bitcast-convert.16 = s32[10,2]{1,0} bitcast-convert(u32[10,2]{1,0} %convert.15)
=======
// CHECK: %[[VAL_0:.*]] = s64[10]{0} parameter(0)
// CHECK: %[[VAL_1:.*]] = s64[10,1]{1,0} reshape(s64[10]{0} %[[VAL_0]])
// CHECK: %[[VAL_2:.*]] = s64[10,1]{1,0} broadcast(s64[10,1]{1,0} %[[VAL_1]]), dimensions={0,1}
@@ -122,7 +104,6 @@ ENTRY main {
// CHECK: %[[VAL_13:.*]] = u64[10,2]{1,0} and(u64[10,2]{1,0} %[[VAL_10]], u64[10,2]{1,0} %[[VAL_12]])
// CHECK: %[[VAL_14:.*]] = u32[10,2]{1,0} convert(u64[10,2]{1,0} %[[VAL_13]])
// CHECK: ROOT %[[VAL_15:.*]] = s32[10,2]{1,0} bitcast-convert(u32[10,2]{1,0} %[[VAL_14]])
>>>>>>> upstream/master
// CHECK: }
// CHECK: ENTRY %main (p: s64[10]) -> s32[10,2] {
// CHECK: %[[VAL_16:.*]] = s64[10]{0} parameter(0)
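Note on the resolution above: the upstream side, which is kept, binds instruction names to FileCheck pattern variables (`%[[VAL_0:.*]]`) instead of hard-coding numbered names such as `%reshape.2`, so the expectations survive instruction renumbering (the deleted HEAD side was already patching around that with `{{\.?[0-9]*}}` regexes). A minimal sketch of that style, assuming `RunFileCheck` behaves as in the test's own call; the header paths and wrapper function are illustrative, not part of the commit:

```cpp
#include <string>

#include "absl/status/statusor.h"
#include "xla/hlo/ir/hlo_module.h"  // assumed header locations
#include "xla/tests/filecheck.h"

// Illustrative wrapper: returns true when the expanded module matches the
// pattern-capture checks. [[VAL_0:.*]] captures whatever name the compiler
// assigned; later uses of %[[VAL_0]] must resolve to the same string.
absl::StatusOr<bool> MatchesBitcastExpansion(const xla::HloModule& module) {
  constexpr char kPattern[] = R"(
// CHECK: %[[VAL_0:.*]] = s64[10]{0} parameter(0)
// CHECK: %[[VAL_1:.*]] = s64[10,1]{1,0} reshape(s64[10]{0} %[[VAL_0]])
)";
  return xla::RunFileCheck(module.ToString(), kPattern);
}
```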
56 changes: 1 addition & 55 deletions third_party/xla/xla/service/gpu/BUILD
@@ -261,17 +261,11 @@ cc_library(
name = "ir_emitter_unnested",
srcs = ["ir_emitter_unnested.cc"],
hdrs = ["ir_emitter_unnested.h"],
<<<<<<< HEAD
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]) +
if_rocm_hipblaslt(["TF_HIPBLASLT=1"]),
=======
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]) + if_rocm_hipblaslt([
"TF_HIPBLASLT=1",
]),
>>>>>>> upstream/master
visibility = ["//visibility:public"],
deps = [
":backend_configs_cc",
@@ -1158,17 +1152,10 @@ cc_library(
"TENSORFLOW_USE_ROCM=1",
]),
visibility = ["//visibility:public"],
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
deps = if_gpu_is_configured([
":buffer_allocations",
":thunk",
"@com_google_absl//absl/log:check",
<<<<<<< HEAD
":gpu_prim_hdrs",
=======
>>>>>>> upstream/master
"//xla/service:buffer_assignment",
"//xla:shape_util",
"//xla/stream_executor:device_memory",
@@ -1180,20 +1167,6 @@ cc_library(
] + [":cub_sort_kernel_" + suffix for suffix in get_cub_sort_kernel_types()]),
)

cc_library(
name = "gpu_prim_hdrs",
hdrs = ["gpu_prim.h"],
deps = [
"@eigen_archive//:eigen3",
"@local_tsl//tsl/platform:bfloat16",
] +
if_cuda_is_configured([
"@local_config_cuda//cuda:cub_headers",
]) + if_rocm_is_configured([
"@local_config_rocm//rocm:rocprim",
]),
)

build_cub_sort_kernels(
name = "cub_sort_kernel",
srcs = if_gpu_is_configured(["cub_sort_kernel.cu.cc"]),
@@ -1202,37 +1175,10 @@ build_cub_sort_kernels(
"TENSORFLOW_USE_ROCM=1",
]),
types = get_cub_sort_kernel_types(),
deps = if_cuda_is_configured([
deps = if_gpu_is_configured([
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
<<<<<<< HEAD
":gpu_prim_hdrs",
]),
)

cc_library(
name = "fft_thunk",
srcs = ["fft_thunk.cc"],
hdrs = ["fft_thunk.h"],
visibility = ["//visibility:public"],
deps = [
":buffer_allocations",
":thunk",
"//xla:types",
"//xla:util",
"//xla:xla_data_proto_cc",
"//xla/hlo/ir:hlo",
"//xla/service:buffer_assignment",
"//xla/stream_executor",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:status",
],
=======
]) + if_cuda_is_configured([":gpu_prim_cuda"]) + if_rocm_is_configured([":gpu_prim_rocm"]),
>>>>>>> upstream/master
)

cc_library(
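Aside on the `local_defines` conflicts in this file: both sides set the same `GOOGLE_CUDA=1` / `TENSORFLOW_USE_ROCM=1` / `TF_HIPBLASLT=1` macros and differ only in line wrapping, so the resolution is cosmetic. For readers unfamiliar with the mechanism, a short illustrative sketch of how such defines are typically consumed in the target's C++ sources; the function below is hypothetical, not taken from `ir_emitter_unnested.cc`:

```cpp
#include <string>

// GOOGLE_CUDA, TENSORFLOW_USE_ROCM and TF_HIPBLASLT are set (or left unset)
// by the BUILD rule's local_defines; only one branch survives preprocessing.
std::string GpuBackendDescription() {
#if GOOGLE_CUDA
  return "CUDA backend";
#elif TENSORFLOW_USE_ROCM
#if TF_HIPBLASLT
  return "ROCm backend with hipBLASLt";
#else
  return "ROCm backend";
#endif
#else
  return "no GPU backend configured";
#endif
}
```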
11 changes: 0 additions & 11 deletions third_party/xla/xla/service/gpu/build_defs.bzl
@@ -2,13 +2,7 @@
"""

load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
<<<<<<< HEAD
load("@local_config_rocm//rocm:build_defs.bzl",
"rocm_copts",
)
=======
load("@local_config_rocm//rocm:build_defs.bzl", "rocm_copts") # copybara:comment
>>>>>>> upstream/master

def get_cub_sort_kernel_types(name = ""):
""" List of supported types for CUB sort kernels.
@@ -42,12 +36,7 @@ def build_cub_sort_kernels(name, types, local_defines = [], **kwargs):
for suffix in types:
cuda_library(
name = name + "_" + suffix,
<<<<<<< HEAD
local_defines = ["CUB_TYPE_" + suffix.upper()],
copts = rocm_copts(),
=======
local_defines = local_defines + ["CUB_TYPE_" + suffix.upper()],
copts = rocm_copts(), # copybara:comment
>>>>>>> upstream/master
**kwargs
)
26 changes: 0 additions & 26 deletions third_party/xla/xla/service/gpu/cub_sort_kernel.cu.cc
@@ -14,40 +14,23 @@ limitations under the License.
==============================================================================*/

#include "xla/service/gpu/cub_sort_kernel.h"
#include "xla/service/gpu/gpu_prim.h"

#include <cstddef>
#include <cstdint>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
<<<<<<< HEAD
=======
#if GOOGLE_CUDA
#include "xla/service/gpu/gpu_prim_cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "xla/service/gpu/gpu_prim_rocm.h"
#endif // TENSORFLOW_USE_ROCM
>>>>>>> upstream/master

namespace xla {
namespace gpu {
namespace {

#if GOOGLE_CUDA
<<<<<<< HEAD
#define CHK_GPU_ERR(err) if(err != cudaSuccess) { \
return absl::InvalidArgumentError( \
absl::StrCat("CUB error: ", cudaGetErrorString(err))); \
}
#elif TENSORFLOW_USE_ROCM
#define CHK_GPU_ERR(err) if(err != hipSuccess) { \
return absl::InvalidArgumentError( \
absl::StrCat("HIPCUB error: ", hipGetErrorString(err))); \
}
#endif

=======
#define CHK_GPU_ERR(err) \
if (err != cudaSuccess) { \
return absl::InvalidArgumentError( \
@@ -60,18 +43,13 @@ namespace {
absl::StrCat("HIPCUB error: ", hipGetErrorString(err))); \
}
#endif
>>>>>>> upstream/master

template <typename KeyT>
absl::Status CubSortKeys(void* d_temp_storage, size_t& temp_bytes,
const void* d_keys_in, void* d_keys_out,
size_t num_items, bool descending) {
<<<<<<< HEAD
auto err = descending
=======
auto err =
descending
>>>>>>> upstream/master
? gpuprim::DeviceRadixSort::SortKeysDescending<KeyT>(
d_temp_storage, temp_bytes, static_cast<const KeyT*>(d_keys_in),
static_cast<KeyT*>(d_keys_out), num_items)
@@ -87,12 +65,8 @@ absl::Status CubSortPairs(void* d_temp_storage, size_t& temp_bytes,
const void* d_keys_in, void* d_keys_out,
const void* d_values_in, void* d_values_out,
size_t num_items, bool descending) {
<<<<<<< HEAD
auto err = descending
=======
auto err =
descending
>>>>>>> upstream/master
? gpuprim::DeviceRadixSort::SortPairsDescending<KeyT, ValT>(
d_temp_storage, temp_bytes, static_cast<const KeyT*>(d_keys_in),
static_cast<KeyT*>(d_keys_out),
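The resolved source keeps upstream's arrangement: the gpu_prim header is chosen per backend (`gpu_prim_cuda.h` under `GOOGLE_CUDA`, `gpu_prim_rocm.h` under `TENSORFLOW_USE_ROCM`) and the clang-formatted `CHK_GPU_ERR` macro converts the runtime's error code into an `absl::Status`. A condensed sketch of that pattern, assuming `gpuprim::` aliases CUB on CUDA and hipCUB on ROCm as these includes suggest; the sorting function is illustrative:

```cpp
#include <cstddef>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

#if GOOGLE_CUDA
#include <cuda_runtime.h>                   // cudaError_t, cudaGetErrorString
#include "xla/service/gpu/gpu_prim_cuda.h"  // assumed to alias gpuprim:: to cub::
#define CHK_GPU_ERR(err)                                       \
  if (err != cudaSuccess) {                                    \
    return absl::InvalidArgumentError(                         \
        absl::StrCat("CUB error: ", cudaGetErrorString(err))); \
  }
#elif TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>                // hipError_t, hipGetErrorString
#include "xla/service/gpu/gpu_prim_rocm.h"  // assumed to alias gpuprim:: to hipcub::
#define CHK_GPU_ERR(err)                                         \
  if (err != hipSuccess) {                                        \
    return absl::InvalidArgumentError(                            \
        absl::StrCat("HIPCUB error: ", hipGetErrorString(err)));  \
  }
#endif

// Ascending radix sort of num_items keys; the same source builds for both
// runtimes because only the gpuprim alias and the error type differ.
template <typename KeyT>
absl::Status SortKeysAscending(void* d_temp_storage, size_t& temp_bytes,
                               const KeyT* d_keys_in, KeyT* d_keys_out,
                               size_t num_items) {
  auto err = gpuprim::DeviceRadixSort::SortKeys<KeyT>(
      d_temp_storage, temp_bytes, d_keys_in, d_keys_out, num_items);
  CHK_GPU_ERR(err);
  return absl::OkStatus();
}
```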
5 changes: 0 additions & 5 deletions third_party/xla/xla/service/gpu/runtime/BUILD
@@ -191,14 +191,9 @@ cc_library(
name = "cub_sort",
srcs = ["cub_sort.cc"],
hdrs = ["cub_sort.h"],
<<<<<<< HEAD
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
=======
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
>>>>>>> upstream/master
visibility = ["//visibility:public"],
deps = [
":support",
2 changes: 0 additions & 2 deletions third_party/xla/xla/service/gpu/tests/BUILD
@@ -702,8 +702,6 @@ xla_cc_test(
"TENSORFLOW_USE_ROCM=1",
]),
tags = tf_cuda_tests_tags(),
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
deps = [
":gpu_codegen_test",
"//xla:error_spec",
4 changes: 4 additions & 0 deletions third_party/xla/xla/stream_executor/BUILD
@@ -450,6 +450,7 @@ cc_library(
"@local_tsl//tsl/platform:status",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -483,6 +484,7 @@ cc_library(
"@com_google_absl//absl/synchronization",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -520,6 +522,7 @@ cc_library(
"@local_tsl//tsl/platform:status",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -626,6 +629,7 @@ cc_library(
"@local_tsl//tsl/protobuf:dnn_proto_cc",
"@local_tsl//tsl/util:env_var",
],
alwayslink = True,
)

# We have a separate `stream_executor_impl` target because in open source we are building multiple
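The `alwayslink = True` lines above account for four of the commit's five added lines. A common motivation for the attribute, offered here as an assumption rather than something stated in the commit, is that these stream_executor libraries register themselves through static initializers; without `alwayslink`, the linker can drop their unreferenced object files and the registration never runs. A minimal sketch of that registration pattern, with hypothetical names:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical global registry keyed by plugin name.
std::unordered_map<std::string, std::function<void()>>& PluginRegistry() {
  static auto* registry =
      new std::unordered_map<std::string, std::function<void()>>();
  return *registry;
}

namespace {
// The constructor runs at static-initialization time and is the only thing
// that makes this library visible to the rest of the program. alwayslink
// keeps the translation unit in the final binary even though no symbol in it
// is referenced explicitly.
struct RegisterExamplePlugin {
  RegisterExamplePlugin() {
    PluginRegistry()["example"] = [] { std::cout << "example plugin\n"; };
  }
} register_example_plugin;
}  // namespace
```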
30 changes: 0 additions & 30 deletions third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
@@ -165,20 +165,12 @@ tsl::Status BlasLt::Init() {
VLOG(2) << "BlasLt::MatmulDesc::Create compute_type" << int(compute_type)
<< " scale_type " << int(scale_type) << " epilogue " << int(epilogue)
<< " pointer_mode " << int(pointer_mode);
<<<<<<< HEAD
auto hip_scale_type_ = AsHipblasDataType(scale_type);
SE_HIPBLAS_RETURN_IF_ERROR(wrap::hipblasLtMatmulDescCreate(
&hip_desc, AsHipblasComputeType(compute_type), hip_scale_type_));
// Wrap hipblas handle immediately, so it is cleaned up if an error occurs.
BlasLt::MatmulDesc desc(hip_desc, hip_scale_type_);
=======
auto hip_scale_type = AsHipblasDataType(scale_type);
auto hip_compute_type = AsHipblasComputeType(compute_type);
SE_HIPBLAS_RETURN_IF_ERROR(wrap::hipblasLtMatmulDescCreate(
&hip_desc, hip_compute_type, hip_scale_type));
// Wrap hipblas handle immediately, so it is cleaned up if an error occurs.
BlasLt::MatmulDesc desc(hip_desc, hip_compute_type, hip_scale_type);
>>>>>>> upstream/master
if (pointer_mode != PointerMode::kHost) {
return tsl::errors::Internal("hipblaslt does not support device pointers");
}
Expand Down Expand Up @@ -473,27 +465,6 @@ tsl::Status BlasLt::MatmulPlan::ExecuteOnStream(
std::tuple operand_types{a_desc_.type(), b_desc_.type(), c_desc_.type(),
d_desc_.type()};

<<<<<<< HEAD
#define TYPED_MATMUL(SCALENTYPE, ATYPE, BTYPE, CTYPE, DTYPE) \
if (operand_types == std::make_tuple(ATYPE, BTYPE, CTYPE, DTYPE)) { \
return gpu::BlasLt::MatmulPlan::DoMatmul< \
SCALENTYPE, HipToNativeT<ATYPE>::type, HipToNativeT<BTYPE>::type, \
HipToNativeT<CTYPE>::type, HipToNativeT<DTYPE>::type>( \
stream, alpha_, a, b, beta_, c, d, bias, aux, a_scale, b_scale, \
c_scale, d_scale, d_amax, algorithm, scratch_allocator, \
profile_result); \
}

// Other data types:
TYPED_MATMUL(float, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_16B)
TYPED_MATMUL(float, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_16F)
TYPED_MATMUL(float, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(float, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(float, HIPBLASLT_R_32F, HIPBLASLT_R_32F, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(double, HIPBLASLT_R_64F, HIPBLASLT_R_64F, HIPBLASLT_R_64F, HIPBLASLT_R_64F)
TYPED_MATMUL(complex64, HIPBLASLT_C_32F, HIPBLASLT_C_32F, HIPBLASLT_C_32F, HIPBLASLT_C_32F)
TYPED_MATMUL(complex128, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F)
=======
#define TYPED_MATMUL(SCALENTYPE, ATYPE, BTYPE, CTYPE, DTYPE) \
if (operand_types == std::make_tuple(ATYPE, BTYPE, CTYPE, DTYPE)) { \
return gpu::BlasLt::MatmulPlan::DoMatmul< \
@@ -521,7 +492,6 @@ tsl::Status BlasLt::MatmulPlan::ExecuteOnStream(
HIPBLASLT_C_32F)
TYPED_MATMUL(complex128, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F,
HIPBLASLT_C_64F)
>>>>>>> upstream/master

#undef TYPED_MATMUL

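In this final conflict both sides define a `TYPED_MATMUL` macro that dispatches on the operand types; the resolution keeps upstream's formatting. The technique, runtime type tags packed into a `std::tuple` and matched against each supported combination before instantiating a templated matmul, can be illustrated with a stripped-down sketch; the enum and function names below are placeholders, not the real XLA or hipBLASLt identifiers:

```cpp
#include <cstdint>
#include <tuple>

enum class DType { kF32, kF64 };

// Stand-in for gpu::BlasLt::MatmulPlan::DoMatmul<...>.
template <typename ScaleT, typename AT, typename BT, typename CT, typename DT>
int64_t DoMatmulImpl() {
  return sizeof(ScaleT);  // placeholder for launching the typed matmul
}

#define TYPED_MATMUL(SCALET, ATAG, BTAG, CTAG, DTAG, AT, BT, CT, DT) \
  if (operand_types == std::make_tuple(ATAG, BTAG, CTAG, DTAG)) {   \
    return DoMatmulImpl<SCALET, AT, BT, CT, DT>();                  \
  }

int64_t Dispatch(DType a, DType b, DType c, DType d) {
  std::tuple operand_types{a, b, c, d};
  TYPED_MATMUL(float, DType::kF32, DType::kF32, DType::kF32, DType::kF32,
               float, float, float, float)
  TYPED_MATMUL(double, DType::kF64, DType::kF64, DType::kF64, DType::kF64,
               double, double, double, double)
  return -1;  // unsupported operand-type combination
}

#undef TYPED_MATMUL
```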