Resolved conflicts for develop-upstream-sync-231030
root authored and root committed Nov 2, 2023
1 parent abf7b02 commit 7fa9e76
Showing 12 changed files with 5 additions and 188 deletions.
19 changes: 0 additions & 19 deletions third_party/xla/xla/service/bitcast_dtypes_expander_test.cc
@@ -88,24 +88,6 @@ ENTRY main {
EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
// CHECK: HloModule bitcast_to_smaller, entry_computation_layout={(s64[10]{0})->s32[10,2]{1,0}}
// CHECK: %xla.bitcast_convert_s64_10__2_s32_10_2_.17 (a.1: s64[10]) -> s32[10,2] {
<<<<<<< HEAD
// CHECK: %a.1 = s64[10]{0} parameter(0)
// CHECK: %reshape.2 = s64[10,1]{1,0} reshape(s64[10]{0} %a.1)
// CHECK: %broadcast.3 = s64[10,1]{1,0} broadcast(s64[10,1]{1,0} %reshape.2), dimensions={0,1}
// CHECK: %reshape.4 = s64[10]{0} reshape(s64[10,1]{1,0} %broadcast.3)
// CHECK: %broadcast.5 = s64[10,2]{1,0} broadcast(s64[10]{0} %reshape.4), dimensions={0}
// CHECK: %bitcast-convert.6 = u64[10,2]{1,0} bitcast-convert(s64[10,2]{1,0} %broadcast.5)
// CHECK: %constant.8 = u64[] constant(32)
// CHECK: %broadcast.9 = u64[10,2]{1,0} broadcast(u64[] %constant.8), dimensions={}
// CHECK: %iota.7 = u64[10,2]{1,0} iota(), iota_dimension=1
// CHECK: %multiply.10 = u64[10,2]{1,0} multiply(u64[10,2]{1,0} %broadcast.9, u64[10,2]{1,0} %iota.7)
// CHECK: %shift-right-logical{{\.?[0-9]*}} = u64[10,2]{1,0} shift-right-logical(u64[10,2]{1,0} %bitcast-convert.6, u64[10,2]{1,0} %multiply.10)
// CHECK: %constant{{\.?[0-9]*}} = u64[] constant(4294967295)
// CHECK: %broadcast.13 = u64[10,2]{1,0} broadcast(u64[] %constant{{\.?[0-9]*}}), dimensions={}
// CHECK: %and.14 = u64[10,2]{1,0} and(u64[10,2]{1,0} %shift-right-logical{{\.?[0-9]*}}, u64[10,2]{1,0} %broadcast.13)
// CHECK: %convert.15 = u32[10,2]{1,0} convert(u64[10,2]{1,0} %and.14)
// CHECK: ROOT %bitcast-convert.16 = s32[10,2]{1,0} bitcast-convert(u32[10,2]{1,0} %convert.15)
=======
// CHECK: %[[VAL_0:.*]] = s64[10]{0} parameter(0)
// CHECK: %[[VAL_1:.*]] = s64[10,1]{1,0} reshape(s64[10]{0} %[[VAL_0]])
// CHECK: %[[VAL_2:.*]] = s64[10,1]{1,0} broadcast(s64[10,1]{1,0} %[[VAL_1]]), dimensions={0,1}
@@ -122,7 +104,6 @@ ENTRY main {
// CHECK: %[[VAL_13:.*]] = u64[10,2]{1,0} and(u64[10,2]{1,0} %[[VAL_10]], u64[10,2]{1,0} %[[VAL_12]])
// CHECK: %[[VAL_14:.*]] = u32[10,2]{1,0} convert(u64[10,2]{1,0} %[[VAL_13]])
// CHECK: ROOT %[[VAL_15:.*]] = s32[10,2]{1,0} bitcast-convert(u32[10,2]{1,0} %[[VAL_14]])
>>>>>>> upstream/master
// CHECK: }
// CHECK: ENTRY %main (p: s64[10]) -> s32[10,2] {
// CHECK: %[[VAL_16:.*]] = s64[10]{0} parameter(0)
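Note on the resolution above: the upstream side, which is kept, binds instruction names to FileCheck pattern variables (`%[[VAL_0:.*]]`) instead of hard-coding numbered names such as `%reshape.2`, so the expectations survive instruction renumbering (the deleted HEAD side was already patching around that with `{{\.?[0-9]*}}` regexes). A minimal sketch of that style, assuming `RunFileCheck` behaves as in the test's own call; the header paths and wrapper function are illustrative, not part of the commit:

```cpp
#include <string>

#include "absl/status/statusor.h"
#include "xla/hlo/ir/hlo_module.h"  // assumed header locations
#include "xla/tests/filecheck.h"

// Illustrative wrapper: returns true when the expanded module matches the
// pattern-capture checks. [[VAL_0:.*]] captures whatever name the compiler
// assigned; later uses of %[[VAL_0]] must resolve to the same string.
absl::StatusOr<bool> MatchesBitcastExpansion(const xla::HloModule& module) {
  constexpr char kPattern[] = R"(
// CHECK: %[[VAL_0:.*]] = s64[10]{0} parameter(0)
// CHECK: %[[VAL_1:.*]] = s64[10,1]{1,0} reshape(s64[10]{0} %[[VAL_0]])
)";
  return xla::RunFileCheck(module.ToString(), kPattern);
}
```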
56 changes: 1 addition & 55 deletions third_party/xla/xla/service/gpu/BUILD
@@ -261,17 +261,11 @@ cc_library(
name = "ir_emitter_unnested",
srcs = ["ir_emitter_unnested.cc"],
hdrs = ["ir_emitter_unnested.h"],
<<<<<<< HEAD
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]) +
if_rocm_hipblaslt(["TF_HIPBLASLT=1"]),
=======
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]) + if_rocm_hipblaslt([
"TF_HIPBLASLT=1",
]),
>>>>>>> upstream/master
visibility = ["//visibility:public"],
deps = [
":backend_configs_cc",
@@ -1158,17 +1152,10 @@ cc_library(
"TENSORFLOW_USE_ROCM=1",
]),
visibility = ["//visibility:public"],
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
deps = if_gpu_is_configured([
":buffer_allocations",
":thunk",
"@com_google_absl//absl/log:check",
<<<<<<< HEAD
":gpu_prim_hdrs",
=======
>>>>>>> upstream/master
"//xla/service:buffer_assignment",
"//xla:shape_util",
"//xla/stream_executor:device_memory",
@@ -1180,20 +1167,6 @@ cc_library(
] + [":cub_sort_kernel_" + suffix for suffix in get_cub_sort_kernel_types()]),
)

cc_library(
name = "gpu_prim_hdrs",
hdrs = ["gpu_prim.h"],
deps = [
"@eigen_archive//:eigen3",
"@local_tsl//tsl/platform:bfloat16",
] +
if_cuda_is_configured([
"@local_config_cuda//cuda:cub_headers",
]) + if_rocm_is_configured([
"@local_config_rocm//rocm:rocprim",
]),
)

build_cub_sort_kernels(
name = "cub_sort_kernel",
srcs = if_gpu_is_configured(["cub_sort_kernel.cu.cc"]),
@@ -1202,37 +1175,10 @@ build_cub_sort_kernels(
"TENSORFLOW_USE_ROCM=1",
]),
types = get_cub_sort_kernel_types(),
deps = if_cuda_is_configured([
deps = if_gpu_is_configured([
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
<<<<<<< HEAD
":gpu_prim_hdrs",
]),
)

cc_library(
name = "fft_thunk",
srcs = ["fft_thunk.cc"],
hdrs = ["fft_thunk.h"],
visibility = ["//visibility:public"],
deps = [
":buffer_allocations",
":thunk",
"//xla:types",
"//xla:util",
"//xla:xla_data_proto_cc",
"//xla/hlo/ir:hlo",
"//xla/service:buffer_assignment",
"//xla/stream_executor",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:status",
],
=======
]) + if_cuda_is_configured([":gpu_prim_cuda"]) + if_rocm_is_configured([":gpu_prim_rocm"]),
>>>>>>> upstream/master
)

cc_library(
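Aside on the `local_defines` conflicts in this file: both sides set the same `GOOGLE_CUDA=1` / `TENSORFLOW_USE_ROCM=1` / `TF_HIPBLASLT=1` macros and differ only in line wrapping, so the resolution is cosmetic. For readers unfamiliar with the mechanism, a short illustrative sketch of how such defines are typically consumed in the target's C++ sources; the function below is hypothetical, not taken from `ir_emitter_unnested.cc`:

```cpp
#include <string>

// GOOGLE_CUDA, TENSORFLOW_USE_ROCM and TF_HIPBLASLT are set (or left unset)
// by the BUILD rule's local_defines; only one branch survives preprocessing.
std::string GpuBackendDescription() {
#if GOOGLE_CUDA
  return "CUDA backend";
#elif TENSORFLOW_USE_ROCM
#if TF_HIPBLASLT
  return "ROCm backend with hipBLASLt";
#else
  return "ROCm backend";
#endif
#else
  return "no GPU backend configured";
#endif
}
```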
11 changes: 0 additions & 11 deletions third_party/xla/xla/service/gpu/build_defs.bzl
@@ -2,13 +2,7 @@
"""

load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
<<<<<<< HEAD
load("@local_config_rocm//rocm:build_defs.bzl",
"rocm_copts",
)
=======
load("@local_config_rocm//rocm:build_defs.bzl", "rocm_copts") # copybara:comment
>>>>>>> upstream/master

def get_cub_sort_kernel_types(name = ""):
""" List of supported types for CUB sort kernels.
@@ -42,12 +36,7 @@ def build_cub_sort_kernels(name, types, local_defines = [], **kwargs):
for suffix in types:
cuda_library(
name = name + "_" + suffix,
<<<<<<< HEAD
local_defines = ["CUB_TYPE_" + suffix.upper()],
copts = rocm_copts(),
=======
local_defines = local_defines + ["CUB_TYPE_" + suffix.upper()],
copts = rocm_copts(), # copybara:comment
>>>>>>> upstream/master
**kwargs
)
26 changes: 0 additions & 26 deletions third_party/xla/xla/service/gpu/cub_sort_kernel.cu.cc
@@ -14,40 +14,23 @@ limitations under the License.
==============================================================================*/

#include "xla/service/gpu/cub_sort_kernel.h"
#include "xla/service/gpu/gpu_prim.h"

#include <cstddef>
#include <cstdint>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
<<<<<<< HEAD
=======
#if GOOGLE_CUDA
#include "xla/service/gpu/gpu_prim_cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "xla/service/gpu/gpu_prim_rocm.h"
#endif // TENSORFLOW_USE_ROCM
>>>>>>> upstream/master

namespace xla {
namespace gpu {
namespace {

#if GOOGLE_CUDA
<<<<<<< HEAD
#define CHK_GPU_ERR(err) if(err != cudaSuccess) { \
return absl::InvalidArgumentError( \
absl::StrCat("CUB error: ", cudaGetErrorString(err))); \
}
#elif TENSORFLOW_USE_ROCM
#define CHK_GPU_ERR(err) if(err != hipSuccess) { \
return absl::InvalidArgumentError( \
absl::StrCat("HIPCUB error: ", hipGetErrorString(err))); \
}
#endif

=======
#define CHK_GPU_ERR(err) \
if (err != cudaSuccess) { \
return absl::InvalidArgumentError( \
@@ -60,18 +43,13 @@ namespace {
absl::StrCat("HIPCUB error: ", hipGetErrorString(err))); \
}
#endif
>>>>>>> upstream/master

template <typename KeyT>
absl::Status CubSortKeys(void* d_temp_storage, size_t& temp_bytes,
const void* d_keys_in, void* d_keys_out,
size_t num_items, bool descending) {
<<<<<<< HEAD
auto err = descending
=======
auto err =
descending
>>>>>>> upstream/master
? gpuprim::DeviceRadixSort::SortKeysDescending<KeyT>(
d_temp_storage, temp_bytes, static_cast<const KeyT*>(d_keys_in),
static_cast<KeyT*>(d_keys_out), num_items)
@@ -87,12 +65,8 @@ absl::Status CubSortPairs(void* d_temp_storage, size_t& temp_bytes,
const void* d_keys_in, void* d_keys_out,
const void* d_values_in, void* d_values_out,
size_t num_items, bool descending) {
<<<<<<< HEAD
auto err = descending
=======
auto err =
descending
>>>>>>> upstream/master
? gpuprim::DeviceRadixSort::SortPairsDescending<KeyT, ValT>(
d_temp_storage, temp_bytes, static_cast<const KeyT*>(d_keys_in),
static_cast<KeyT*>(d_keys_out),
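The resolved source keeps upstream's arrangement: the gpu_prim header is chosen per backend (`gpu_prim_cuda.h` under `GOOGLE_CUDA`, `gpu_prim_rocm.h` under `TENSORFLOW_USE_ROCM`) and the clang-formatted `CHK_GPU_ERR` macro converts the runtime's error code into an `absl::Status`. A condensed sketch of that pattern, assuming `gpuprim::` aliases CUB on CUDA and hipCUB on ROCm as these includes suggest; the sorting function is illustrative:

```cpp
#include <cstddef>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

#if GOOGLE_CUDA
#include <cuda_runtime.h>                   // cudaError_t, cudaGetErrorString
#include "xla/service/gpu/gpu_prim_cuda.h"  // assumed to alias gpuprim:: to cub::
#define CHK_GPU_ERR(err)                                       \
  if (err != cudaSuccess) {                                    \
    return absl::InvalidArgumentError(                         \
        absl::StrCat("CUB error: ", cudaGetErrorString(err))); \
  }
#elif TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>                // hipError_t, hipGetErrorString
#include "xla/service/gpu/gpu_prim_rocm.h"  // assumed to alias gpuprim:: to hipcub::
#define CHK_GPU_ERR(err)                                         \
  if (err != hipSuccess) {                                        \
    return absl::InvalidArgumentError(                            \
        absl::StrCat("HIPCUB error: ", hipGetErrorString(err)));  \
  }
#endif

// Ascending radix sort of num_items keys; the same source builds for both
// runtimes because only the gpuprim alias and the error type differ.
template <typename KeyT>
absl::Status SortKeysAscending(void* d_temp_storage, size_t& temp_bytes,
                               const KeyT* d_keys_in, KeyT* d_keys_out,
                               size_t num_items) {
  auto err = gpuprim::DeviceRadixSort::SortKeys<KeyT>(
      d_temp_storage, temp_bytes, d_keys_in, d_keys_out, num_items);
  CHK_GPU_ERR(err);
  return absl::OkStatus();
}
```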
5 changes: 0 additions & 5 deletions third_party/xla/xla/service/gpu/runtime/BUILD
@@ -191,14 +191,9 @@ cc_library(
name = "cub_sort",
srcs = ["cub_sort.cc"],
hdrs = ["cub_sort.h"],
<<<<<<< HEAD
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
=======
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
>>>>>>> upstream/master
visibility = ["//visibility:public"],
deps = [
":support",
2 changes: 0 additions & 2 deletions third_party/xla/xla/service/gpu/tests/BUILD
@@ -702,8 +702,6 @@ xla_cc_test(
"TENSORFLOW_USE_ROCM=1",
]),
tags = tf_cuda_tests_tags(),
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) +
if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
deps = [
":gpu_codegen_test",
"//xla:error_spec",
4 changes: 4 additions & 0 deletions third_party/xla/xla/stream_executor/BUILD
@@ -450,6 +450,7 @@ cc_library(
"@local_tsl//tsl/platform:status",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -483,6 +484,7 @@ cc_library(
"@com_google_absl//absl/synchronization",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -520,6 +522,7 @@ cc_library(
"@local_tsl//tsl/platform:status",
"@local_tsl//tsl/platform:statusor",
],
alwayslink = True,
)

cc_library(
@@ -626,6 +629,7 @@ cc_library(
"@local_tsl//tsl/protobuf:dnn_proto_cc",
"@local_tsl//tsl/util:env_var",
],
alwayslink = True,
)

# We have a separate `stream_executor_impl` target because in open source we are building multiple
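The `alwayslink = True` lines above account for four of the commit's five added lines. A common motivation for the attribute, offered here as an assumption rather than something stated in the commit, is that these stream_executor libraries register themselves through static initializers; without `alwayslink`, the linker can drop their unreferenced object files and the registration never runs. A minimal sketch of that registration pattern, with hypothetical names:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical global registry keyed by plugin name.
std::unordered_map<std::string, std::function<void()>>& PluginRegistry() {
  static auto* registry =
      new std::unordered_map<std::string, std::function<void()>>();
  return *registry;
}

namespace {
// The constructor runs at static-initialization time and is the only thing
// that makes this library visible to the rest of the program. alwayslink
// keeps the translation unit in the final binary even though no symbol in it
// is referenced explicitly.
struct RegisterExamplePlugin {
  RegisterExamplePlugin() {
    PluginRegistry()["example"] = [] { std::cout << "example plugin\n"; };
  }
} register_example_plugin;
}  // namespace
```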
30 changes: 0 additions & 30 deletions third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
@@ -165,20 +165,12 @@ tsl::Status BlasLt::Init() {
VLOG(2) << "BlasLt::MatmulDesc::Create compute_type" << int(compute_type)
<< " scale_type " << int(scale_type) << " epilogue " << int(epilogue)
<< " pointer_mode " << int(pointer_mode);
<<<<<<< HEAD
auto hip_scale_type_ = AsHipblasDataType(scale_type);
SE_HIPBLAS_RETURN_IF_ERROR(wrap::hipblasLtMatmulDescCreate(
&hip_desc, AsHipblasComputeType(compute_type), hip_scale_type_));
// Wrap hipblas handle immediately, so it is cleaned up if an error occurs.
BlasLt::MatmulDesc desc(hip_desc, hip_scale_type_);
=======
auto hip_scale_type = AsHipblasDataType(scale_type);
auto hip_compute_type = AsHipblasComputeType(compute_type);
SE_HIPBLAS_RETURN_IF_ERROR(wrap::hipblasLtMatmulDescCreate(
&hip_desc, hip_compute_type, hip_scale_type));
// Wrap hipblas handle immediately, so it is cleaned up if an error occurs.
BlasLt::MatmulDesc desc(hip_desc, hip_compute_type, hip_scale_type);
>>>>>>> upstream/master
if (pointer_mode != PointerMode::kHost) {
return tsl::errors::Internal("hipblaslt does not support device pointers");
}
Expand Down Expand Up @@ -473,27 +465,6 @@ tsl::Status BlasLt::MatmulPlan::ExecuteOnStream(
std::tuple operand_types{a_desc_.type(), b_desc_.type(), c_desc_.type(),
d_desc_.type()};

<<<<<<< HEAD
#define TYPED_MATMUL(SCALENTYPE, ATYPE, BTYPE, CTYPE, DTYPE) \
if (operand_types == std::make_tuple(ATYPE, BTYPE, CTYPE, DTYPE)) { \
return gpu::BlasLt::MatmulPlan::DoMatmul< \
SCALENTYPE, HipToNativeT<ATYPE>::type, HipToNativeT<BTYPE>::type, \
HipToNativeT<CTYPE>::type, HipToNativeT<DTYPE>::type>( \
stream, alpha_, a, b, beta_, c, d, bias, aux, a_scale, b_scale, \
c_scale, d_scale, d_amax, algorithm, scratch_allocator, \
profile_result); \
}

// Other data types:
TYPED_MATMUL(float, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_16B)
TYPED_MATMUL(float, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_16F)
TYPED_MATMUL(float, HIPBLASLT_R_16B, HIPBLASLT_R_16B, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(float, HIPBLASLT_R_16F, HIPBLASLT_R_16F, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(float, HIPBLASLT_R_32F, HIPBLASLT_R_32F, HIPBLASLT_R_32F, HIPBLASLT_R_32F)
TYPED_MATMUL(double, HIPBLASLT_R_64F, HIPBLASLT_R_64F, HIPBLASLT_R_64F, HIPBLASLT_R_64F)
TYPED_MATMUL(complex64, HIPBLASLT_C_32F, HIPBLASLT_C_32F, HIPBLASLT_C_32F, HIPBLASLT_C_32F)
TYPED_MATMUL(complex128, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F)
=======
#define TYPED_MATMUL(SCALENTYPE, ATYPE, BTYPE, CTYPE, DTYPE) \
if (operand_types == std::make_tuple(ATYPE, BTYPE, CTYPE, DTYPE)) { \
return gpu::BlasLt::MatmulPlan::DoMatmul< \
@@ -521,7 +492,6 @@ tsl::Status BlasLt::MatmulPlan::ExecuteOnStream(
HIPBLASLT_C_32F)
TYPED_MATMUL(complex128, HIPBLASLT_C_64F, HIPBLASLT_C_64F, HIPBLASLT_C_64F,
HIPBLASLT_C_64F)
>>>>>>> upstream/master

#undef TYPED_MATMUL

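In this final conflict both sides define a `TYPED_MATMUL` macro that dispatches on the operand types; the resolution keeps upstream's formatting. The technique, runtime type tags packed into a `std::tuple` and matched against each supported combination before instantiating a templated matmul, can be illustrated with a stripped-down sketch; the enum and function names below are placeholders, not the real XLA or hipBLASLt identifiers:

```cpp
#include <cstdint>
#include <tuple>

enum class DType { kF32, kF64 };

// Stand-in for gpu::BlasLt::MatmulPlan::DoMatmul<...>.
template <typename ScaleT, typename AT, typename BT, typename CT, typename DT>
int64_t DoMatmulImpl() {
  return sizeof(ScaleT);  // placeholder for launching the typed matmul
}

#define TYPED_MATMUL(SCALET, ATAG, BTAG, CTAG, DTAG, AT, BT, CT, DT) \
  if (operand_types == std::make_tuple(ATAG, BTAG, CTAG, DTAG)) {   \
    return DoMatmulImpl<SCALET, AT, BT, CT, DT>();                  \
  }

int64_t Dispatch(DType a, DType b, DType c, DType d) {
  std::tuple operand_types{a, b, c, d};
  TYPED_MATMUL(float, DType::kF32, DType::kF32, DType::kF32, DType::kF32,
               float, float, float, float)
  TYPED_MATMUL(double, DType::kF64, DType::kF64, DType::kF64, DType::kF64,
               double, double, double, double)
  return -1;  // unsupported operand-type combination
}

#undef TYPED_MATMUL
```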