Skip to content

Commit 43382aa

Browse files
committed
Fix xla unit tests
1 parent d5a10eb commit 43382aa

File tree: 17 files changed (+50 additions, −124 deletions)

tensorflow/tools/ci_build/linux/rocm/run_xla.sh

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,23 @@ EXCLUDED_TESTS=(
6666

6767
# @local_xla//xla/backends/gpu/codegen/triton:fusion_emitter_device_test_amdgpu_any
6868
TritonEmitterTest.FusionWithOutputContainingMoreThanInt32MaxElementsExecutesCorrectly
69+
TritonEmitterTest.ConvertF16ToF8E5M2Exhaustive
70+
TritonEmitterTest.RocmWarpSizeIsSetCorrectly
6971
BasicDotAlgorithmEmitterTestSuite/BasicDotAlgorithmEmitterTest.BasicAlgorithmIsEmittedCorrectly/ALG_DOT_F16_F16_F16
7072

7173
# @local_xla//xla/backends/gpu/codegen/triton:fusion_emitter_int4_device_test_amdgpu_any
7274
TritonTest.FuseSubchannelDequantizationWithTranspose
7375

7476
# @local_xla//xla/backends/gpu/codegen/triton:fusion_emitter_parametrized_test_amdgpu_any
7577
TritonNormalizationTest.CanFuseAndEmitDiamondWithBF16Converts
78+
ElementwiseTestSuiteF16/UnaryElementwiseTest.ElementwiseUnaryOpExecutesCorrectly/f16_cosine
79+
ElementwiseTestSuiteF16/BinaryElementwiseTest.ElementwiseBinaryOpExecutesCorrectly/f16_atan2
80+
ElementwiseTestSuiteF16/BinaryElementwiseTest.ElementwiseFusionExecutesCorrectly/f16_atan2
81+
82+
# @local_xla//xla/service/gpu/tests:command_buffer_test_amdgpu_any
83+
CommandBufferTests/CommandBufferTest.WhileLoop/*
84+
CommandBufferTests/CommandBufferTest.IndexConditional/*
85+
CommandBufferTests/CommandBufferTest.TrueFalseConditional/*
7686

7787
# @local_xla//xla/backends/gpu/runtime:command_buffer_conversion_pass_test_amdgpu_any
7888
CommandBufferConversionPassTest.ConvertWhileThunk
@@ -88,14 +98,33 @@ EXCLUDED_TESTS=(
8898
DotTf32Tf32F32Tests/DotAlgorithmSupportTest.AlgorithmIsSupportedFromCudaCapability/dot_tf32_tf32_f32_*
8999
DotTf32Tf32F32X3Tests/DotAlgorithmSupportTest.AlgorithmIsSupportedFromCudaCapability/dot_tf32_tf32_f32_*
90100

101+
# @local_xla//xla/service/gpu/transforms:triton_fusion_numerics_verifier_test_amdgpu_any_notfrt
102+
# @local_xla//xla/service/gpu/transforms:triton_fusion_numerics_verifier_test_amdgpu_any
103+
TritonFusionNumericsVerifierTest.CompilationSucceedsEvenIfKernelWillSpillRegisters
104+
TritonFusionNumericsVerifierTest.VerifyThatDisablingTritonIsFast
105+
91106
# @local_xla//xla/service/gpu/tests:gpu_cub_sort_test_amdgpu_any
92107
CubSortKeysTest.CompareToReferenceNumpyOrderGt
93108
CubSortKeysTest.CompareToReferenceTotalOrderLt
94109
CubSort/CubSortKeysTest.*
95110
CubSort/CubSortPairsTest.*
96111

112+
# @local_xla//xla/backends/gpu/runtime:cub_sort_thunk_test
113+
CubSortThunkTest.ProtoRoundTrip
114+
97115
# @local_xla//xla/service/gpu/transforms:cublas_gemm_rewriter_test_amdgpu_any
98116
CublasLtGemmRewriteTest.MatrixBiasSwishActivation
117+
CublasLtGemmRewriteTest.VectorBiasReluActivationF16Padded
118+
CublasLtGemmRewriteTest.VectorBiasF16Padded
119+
CublasLtGemmRewriteTest.ReluActivationF16Padded
120+
CublasLtGemmRewriteTest.VectorBiasReluActivationBF16Padded
121+
CublasLtGemmRewriteTest.BF16VectorBiasPadded
122+
CublasLtGemmRewriteTest.ApproxGeluActivationBF16
123+
CublasLtGemmRewriteTest.ReluActivationBF16Padded
124+
CublasLtGemmRewriteTest.VectorBiasBF16Padded
125+
126+
# @local_xla//xla/service/gpu:determinism_test_amdgpu_any
127+
DeterminismTest.Conv
99128

100129
# @local_xla//xla/tests:sample_file_test_amdgpu_any
101130
# @local_xla//xla/tests:sample_file_test_amdgpu_any_notfrt
@@ -107,8 +136,8 @@ EXCLUDED_TESTS=(
107136
# @local_xla//xla/tests:scatter_test_amdgpu_any_notfrt
108137
ScatterTest.TensorFlowScatterV1_UpdateTwice
109138

110-
# @local_xla//xla/service/gpu/llvm_gpu_backend:amdgpu_bitcode_link_test
111-
BitcodeLinkTest.TestLinkEmbeded
139+
# @local_xla//xla/tests:multioutput_fusion_test_amdgpu_any
140+
MultiOutputFusionTest.MultiOutputReduceFusionMajorWithExtraOutput
112141
)
113142

114143
bazel --bazelrc=tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rocm.bazelrc test \
@@ -125,5 +154,9 @@ bazel --bazelrc=tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rocm.b
125154
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
126155
--test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
127156
-- @local_xla//xla/... \
128-
-@local_xla//xla/service/gpu/tests:sorting.hlo.test_mi200
157+
-@local_xla//xla/service/gpu/tests:sorting_test_amdgpu_any \
158+
-@local_xla//xla/service/gpu/tests:sorting.hlo.test_mi200 \
159+
-@local_xla//xla/backends/gpu/codegen/emitters/tests:reduce_row/mof_scalar_variadic.hlo.test \
160+
-@local_xla//xla/backends/gpu/codegen/emitters/tests:reduce_row/side_output_broadcast.hlo.test \
161+
-@local_xla//xla/tools/hlo_opt:tests/gpu_hlo_llvm.hlo.test
129162
# ^^^ TODO (rocm) weekly-sync-20251021 excluded test files

third_party/xla/xla/backends/gpu/codegen/emitters/tests/BUILD

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ lit_test_suite(
2424
"@llvm-project//llvm:FileCheck",
2525
],
2626
tags_override = {
27-
"reduce_row/mof_scalar_variadic.hlo": ["cuda-only"], #TODO(rocm): weekly sync 25-07-14
28-
"reduce_row/side_output_broadcast.hlo": ["cuda-only"], #TODO(rocm): weekly sync 25-07-14
27+
"reduce_row/mof_scalar_variadic.hlo": [],
28+
"reduce_row/side_output_broadcast.hlo": [],
2929
},
3030
)

third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,6 @@ class BlasAlgorithmTest : public AlgorithmTest {
141141
using TritonAlgorithmTest = AlgorithmTest;
142142

143143
TEST_F(AlgorithmTest, Algorithm3xBF16) {
144-
if (std::holds_alternative<se::RocmComputeCapability>(GpuComputeComp())) {
145-
GTEST_SKIP() << "ALG_DOT_BF16_BF16_F32_X3 not supported on ROCM.";
146-
}
147144
constexpr absl::string_view kHloText = R"(
148145
HloModule Algorithm3xBF16
149146
@@ -160,9 +157,6 @@ TEST_F(AlgorithmTest, Algorithm3xBF16) {
160157
}
161158

162159
TEST_F(AlgorithmTest, Algorithm6xBF16) {
163-
if (std::holds_alternative<se::RocmComputeCapability>(GpuComputeComp())) {
164-
GTEST_SKIP() << "ALG_DOT_BF16_BF16_F32_X6 not supported on ROCM.";
165-
}
166160
constexpr absl::string_view kHloText = R"(
167161
HloModule Algorithm6xBF16
168162
@@ -870,9 +864,6 @@ TEST_F(TritonAlgorithmTest, Algorithm_TF32_TF32_F32_X3) {
870864
}
871865

872866
TEST_F(TritonAlgorithmTest, Algorithm_BF16_BF16_F32) {
873-
if (std::holds_alternative<se::RocmComputeCapability>(GpuComputeComp())) {
874-
GTEST_SKIP() << "Triton currently disabled on ROCM.";
875-
}
876867
if (!SupportsBF16(GpuComputeComp())) {
877868
GTEST_SKIP() << "BF16 not supported.";
878869
}
@@ -899,7 +890,7 @@ TEST_F(TritonAlgorithmTest, Algorithm_BF16_BF16_F32) {
899890
}
900891

901892
TEST_F(TritonAlgorithmTest, Dot_BF16_X6_WithConst) {
902-
constexpr std::string_view kHloText = R"(
893+
constexpr absl::string_view kHloText = R"(
903894
HloModule Dot_BF16_X6_WithConst
904895
905896
lhs {
@@ -1576,7 +1567,6 @@ TEST_P(TritonAndBlasSupportForDifferentTensorSizes, Regular2DDot) {
15761567

15771568
TEST_P(TritonAndBlasSupportForDifferentTensorSizes,
15781569
IsDotAlgorithmSupportedByTriton) {
1579-
15801570
// Here we test which dot algorithm is supported by triton.
15811571
// In case of a change you need to update the expected results.
15821572
constexpr absl::string_view kHloText = R"(

third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3205,7 +3205,6 @@ ENTRY entry_computation {
32053205
// Reproducer from b/384110192.
32063206
TEST_F(TritonEmitterTest,
32073207
FusionWithOutputContainingMoreThanInt32MaxElementsExecutesCorrectly) {
3208-
// "issue with triton.";
32093208
// The point here is to check the output of the Triton fusion. The `slice` op
32103209
// at the end is inserted to allow the comparison of output to run in a
32113210
// reasonable amount of time, and has been proven to still correctly capture
@@ -3270,12 +3269,6 @@ TEST_F(TritonEmitterTest, ConvertF16ToF8E5M2Exhaustive) {
32703269
"always correct";
32713270
}
32723271

3273-
if (std::holds_alternative<se::RocmComputeCapability>(
3274-
GpuComputeCapability())) {
3275-
GTEST_SKIP() << "Skipping tests on Rocm, Triton's conversion isn't "
3276-
"always correct";
3277-
}
3278-
32793272
constexpr absl::string_view kHloTextTemplate = R"(
32803273
computation {
32813274
p0 = f16[65536]{0} parameter(0)
@@ -4448,22 +4441,9 @@ TEST_F(TritonEmitterTest, RocmWarpSizeIsSetCorrectly) {
44484441
GTEST_SKIP() << "Warp size is always 32 on CUDA";
44494442
}
44504443

4451-
// TODO (rocm) weekly-sync-20251021 Use legacy emitter otherwise test segfaults
4452-
constexpr absl::string_view kHloText = R"(
4453-
%gemm_fusion___computation.clone {
4454-
%parameter_0 = f16[30,30]{1,0} parameter(0)
4455-
%parameter_1 = s8[30,30]{1,0} parameter(1)
4456-
%cp1.1 = f16[30,30]{1,0} convert(%parameter_1)
4457-
ROOT %_.1 = f16[30,30]{1,0} dot(%parameter_0, %cp1.1), lhs_contracting_dims={0}, rhs_contracting_dims={1}
4458-
}
4459-
ENTRY %entry_computation {
4460-
%p1 = s8[30,30]{1,0} parameter(1)
4461-
%p0 = f16[30,30]{1,0} parameter(0)
4462-
ROOT %gemm_fusion__ = f16[30,30]{1,0} fusion(%p0, %p1), kind=kCustom, calls=%gemm_fusion___computation.clone, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"16","block_n":"16","block_k":"256","split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"}},"force_earliest_schedule":false,"reification_cost":[]}
4463-
})";
4464-
44654444
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> verified_module,
4466-
ParseAndReturnVerifiedModule(kHloText));
4445+
ParseAndReturnVerifiedModule(GetDotAlgorithmHlo(
4446+
F16, F16, PrecisionConfig::ALG_UNSET)));
44674447

44684448
std::string output_directory;
44694449
if (!tsl::io::GetTestUndeclaredOutputsDir(&output_directory)) {
@@ -4500,8 +4480,7 @@ TEST_F(TritonEmitterTest, RocmWarpSizeIsSetCorrectly) {
45004480
// CHECK: "ttg.threads-per-warp" = 64
45014481
)";
45024482
EXPECT_THAT(RunFileCheck(triton_passes_log, kPattern), true);
4503-
// TODO (rocm) weekly-sync-20251021 Enable this whence test pass
4504-
#if 0
4483+
45054484
// For RX7900 warp_size should be 32
45064485
const se::DeviceDescription dev_info_n =
45074486
TestGpuDeviceInfo::AMDRX7900DeviceInfo();
@@ -4518,7 +4497,6 @@ TEST_F(TritonEmitterTest, RocmWarpSizeIsSetCorrectly) {
45184497
// CHECK: "ttg.threads-per-warp" = 32
45194498
)";
45204499
EXPECT_THAT(RunFileCheck(triton_passes_log, kPattern_n), true);
4521-
#endif
45224500
}
45234501

45244502
TEST_F(TritonEmitterTest, EmitsCorrectlyForReshapeOfPad) {

third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,6 @@ TEST(CommandBufferCmdTest, MemcpyCmd) {
302302
}
303303

304304
TEST(CommandBufferCmdTest, LaunchCmd) {
305-
// TODO(rocm): weekly sync 24-12-10
306-
GTEST_SKIP() << "CUDA graph conditionals are not supported";
307305
se::StreamExecutor* stream_executor = GpuExecutor();
308306

309307
auto stream = stream_executor->CreateStream().value();

third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,6 @@ TEST(CommandBufferThunkTest, Memset32CmdOnDifferentStreams) {
471471

472472
TEST(CommandBufferThunkTest, LaunchCmd) {
473473
se::StreamExecutor* stream_executor = GpuExecutor();
474-
if (!IsAtLeastCuda12300(stream_executor)) {
475-
// TODO(rocm): weekly sync 24-12-10
476-
GTEST_SKIP() << "CUDA graph conditionals are not supported";
477-
}
478474

479475
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_executor->CreateStream());
480476

@@ -569,10 +565,6 @@ TEST(CommandBufferThunkTest, LaunchCmd) {
569565

570566
TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) {
571567
se::StreamExecutor* stream_executor = GpuExecutor();
572-
if (!IsAtLeastCuda12300(stream_executor)) {
573-
// TODO(rocm): weekly sync 24-12-10
574-
GTEST_SKIP() << "CUDA graph conditionals are not supported";
575-
}
576568

577569
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_executor->CreateStream());
578570

@@ -1237,10 +1229,6 @@ TEST(CommandBufferThunkTest, CublasLtCmd) {
12371229

12381230
TEST(CommandBufferThunkTest, MultipleLaunchCmd) {
12391231
se::StreamExecutor* stream_executor = GpuExecutor();
1240-
if (!IsAtLeastCuda12300(stream_executor)) {
1241-
// TODO(rocm): weekly sync 24-12-10
1242-
GTEST_SKIP() << "CUDA graph conditionals are not supported";
1243-
}
12441232

12451233
TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_executor->CreateStream());
12461234

third_party/xla/xla/service/gpu/autotuning/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,6 @@ xla_cc_test(
676676
],
677677
tags = [
678678
"gpu",
679-
"cuda-only", #TODO(rocm): weekly sync 24-10-01
680679
],
681680
deps = [
682681
":autotune_cache_key",

third_party/xla/xla/service/gpu/determinism_test.cc

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,9 +262,6 @@ ENTRY e {
262262
}
263263

264264
TEST_F(DeterminismTest, Conv) {
265-
if (IsRocm()) {
266-
GTEST_SKIP() << "Test temporarily disabled for ROCm!"; //TODO(rocm): weekly sync 25-08-25
267-
}
268265
constexpr absl::string_view kHloText = R"(
269266
ENTRY e {
270267
input = f32[16,3,64,64] parameter(0)

third_party/xla/xla/service/gpu/tests/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,6 @@ xla_test(
238238
"gpu_too_many_blocks_test.cc",
239239
],
240240
backends = ["gpu"],
241-
#tags = ["cuda-only",], #(TODO)(rocm): weekly sync 24-11-05
242241
deps = [
243242
":gpu_codegen_test",
244243
"//xla/hlo/ir:hlo",

third_party/xla/xla/service/gpu/tests/command_buffer_test.cc

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,6 @@ TEST_P(CommandBufferTest, Fusions) {
233233
}
234234

235235
TEST_P(CommandBufferTest, TrueFalseConditional) {
236-
if (IsRocm()) {
237-
GTEST_SKIP() << "Test currently failing on ROCm"; //TODO(rocm): weekly sync 25-07-14
238-
}
239236
constexpr absl::string_view hlo_text = R"(
240237
HloModule m, is_scheduled=true
241238
@@ -295,9 +292,6 @@ TEST_P(CommandBufferTest, TrueFalseConditional) {
295292
}
296293

297294
TEST_P(CommandBufferTest, IndexConditional) {
298-
if (IsRocm()) {
299-
GTEST_SKIP() << "Test currently failing on ROCm"; //TODO(rocm): weekly sync 25-07-14
300-
}
301295
constexpr absl::string_view hlo_text = R"(
302296
HloModule m, is_scheduled=true
303297
@@ -365,9 +359,6 @@ TEST_P(CommandBufferTest, IndexConditional) {
365359
}
366360

367361
TEST_P(CommandBufferTest, WhileLoop) {
368-
if (IsRocm()) {
369-
GTEST_SKIP() << "Test currently failing on ROCm"; //TODO(rocm): weekly sync 25-07-14
370-
}
371362
constexpr absl::string_view hlo_text = R"(
372363
HloModule m, is_scheduled=true
373364

Comments (0)