From ea29fad5c970265542f7420c22820019ef14dfa2 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Tue, 3 Dec 2024 14:24:08 -0600
Subject: [PATCH] review-comments

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
---
 compiler/plugins/target/ROCM/test/BUILD.bazel | 26 ++++++++++
 .../plugins/target/ROCM/test/CMakeLists.txt   | 31 ++++++++++--
 .../ROCM}/test/gpu_lower_to_ukernels.mlir     | 50 +++++++++----------
 .../test/ukernel_pipeline_transform.mlir      | 20 +++++---
 .../Codegen/Common/GPU/GPULowerToUKernels.cpp |  6 +--
 .../Codegen/Common/GPU/test/BUILD.bazel       | 32 ------------
 .../Codegen/Common/GPU/test/CMakeLists.txt    | 14 ------
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 33 +-----------
 .../Codegen/LLVMGPU/test/CMakeLists.txt       | 14 ------
 9 files changed, 94 insertions(+), 132 deletions(-)
 create mode 100644 compiler/plugins/target/ROCM/test/BUILD.bazel
 rename compiler/{src/iree/compiler/Codegen/Common/GPU => plugins/target/ROCM}/test/gpu_lower_to_ukernels.mlir (72%)
 rename compiler/{src/iree/compiler/Codegen/LLVMGPU => plugins/target/ROCM}/test/ukernel_pipeline_transform.mlir (78%)

diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
new file mode 100644
index 000000000000..bf9a18d582bd
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -0,0 +1,26 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+
+package(
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_lit_test_suite(
+    name = "lit",
+    srcs = [
+        "gpu_lower_to_ukernels.mlir",
+        "ukernel_pipeline_transform.mlir",
+    ],
+    cfg = "//compiler:lit.cfg.py",
+    tools = [
+        "//tools:iree-opt",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
index df185a05e72b..6d2199d8c4bb 100644
--- a/compiler/plugins/target/ROCM/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -1,10 +1,33 @@
-# NOTE: Bazel testing of this backend is impossible because there is no way
-# for Bazel to bundle the AMD bitcode files that the backend depends on. Users
-# of the compiler can pass explicit flags, but we prefer that default tests
-# exercise default flags, which cannot be supported properly on Bazel builds.
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# compiler/plugins/target/ROCM/test/BUILD.bazel                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
 
 iree_add_all_subdirs()
 
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "gpu_lower_to_ukernels.mlir"
+    "ukernel_pipeline_transform.mlir"
+  TOOLS
+    FileCheck
+    iree-opt
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+# NOTE: The following tests are CMake-only because they depend on AMD device
+# bitcode libraries that are provided by custom CMake code in target/ROCM.
+# By contrast, the above tests that only require ukernel bitcode are part of the
+# Bazel build because ukernel bitcode is something that we generate ourselves.
+
 iree_lit_test_suite(
   NAME
     lit
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
similarity index 72%
rename from compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir
rename to compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
index 507d0e6530aa..177bd0b36f7c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir
+++ b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
@@ -1,9 +1,8 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -35,9 +34,8 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor<1x1x1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -63,9 +61,8 @@ func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor<4xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -91,9 +88,8 @@ func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
@@ -121,9 +117,8 @@ func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi6
 
 // -----
 
-#gfx942_target_ukernels_none = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "none"}>
 func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_none
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "none"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -149,9 +144,8 @@ func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64>
 
 // -----
 
-#gfx942_target_ukernels_argmax = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "argmax"}>
 func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_argmax
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -177,9 +171,8 @@ func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo,argmax,bar"}>
 func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo,argmax,bar"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -207,9 +200,8 @@ func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -
 
 // -----
 
-#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo"}>
 func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -236,9 +228,8 @@ func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1x
 // -----
 
 // Currently we do only handle -Inf case as initial values.
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0.0 : f32
@@ -268,9 +259,8 @@ func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<
 //       Currently just picking out popular chips to support,
 //       to minimize compile time and space.
 
-#gfx908_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx908", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx908_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -298,11 +288,15 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x
 
 // Test user-provided bitcode in the source IR.
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target,
-  // Dummy bitcode with an unusual length of 12.
-  hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<"0x4243C0DE0123456789ABCDEF"> : tensor<12xi8>}>]
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>,
+  // Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature.
+  hal.executable.objects = [
+    #hal.executable.object<{
+      path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+      data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+    }>
+  ]
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -327,7 +321,13 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
 //  CHECK-DAG:   %[[C1_index:.+]] = arith.constant 1 : index
 //  CHECK-DAG:   %[[C0_i64:.+]] = arith.constant 0
 //  CHECK-DAG:   %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]]
-//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>}>]} "iree_uk_amdgpu_argmax_f32i64"
+//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {
+// CHECK-SAME:     hal.executable.objects = [
+// CHECK-SAME:       #hal.executable.object<{
+// CHECK-SAME:         path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+// CHECK-SAME:         data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+// CHECK-SAME:       }>
+// CHECK-SAME:     ]} "iree_uk_amdgpu_argmax_f32i64"
 // CHECK-SAME:       ins(%[[ARG0]] :
 // CHECK-SAME:       outs(%[[FILL]] :
 //      CHECK:   return %[[MICRO_KERNEL]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
similarity index 78%
rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
rename to compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
index 40ed8ab25e90..26ce4c8959f4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
+++ b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
@@ -4,10 +4,11 @@
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
-func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_2d_f32i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFF800000 : f32
   %c0_i64 = arith.constant 0 : i64
@@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @no_ukernel_argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @not_neg_inf_init_argmax_1d() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0.000000e+00 : f16
   %c0_i64 = arith.constant 0 : i64
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
index 676eafb5fed5..a72b4ff8e180 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
@@ -146,11 +146,11 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
   FnNameAndDefAttrs fn =
       getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr);
   if (!fn) {
-    return rewriter.notifyMatchFailure(op, "No ukernels on this backend.");
+    return rewriter.notifyMatchFailure(op, "no ukernels on this backend");
   }
 
   if (!hasUkernel(targetAttr, ukernelName)) {
-    return rewriter.notifyMatchFailure(op, "Ukernel not enabled.");
+    return rewriter.notifyMatchFailure(op, "ukernel not enabled");
   }
 
   // Currently only support argmax where parallel dims are 1.
@@ -176,7 +176,7 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
   IREE::HAL::ExecutableObjectAttr bitcodeObject =
       getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name);
   if (!bitcodeObject) {
-    return rewriter.notifyMatchFailure(op, "No ukernel bitcode for this op.");
+    return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op");
   }
   Location loc = op.getLoc();
   // Currently only support 1D reduction, where reduc is on fastest dim.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index d429d18afdc2..7fe33161bca1 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -6,7 +6,6 @@
 
 # Tests for common transforms.
 
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
 load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
@@ -15,12 +14,6 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
-tests_requiring_rocm = [
-    # This test checks that ops are lowered to ukernels, which only happens if
-    # the ukernels, which are part of the ROCM plugin, are found.
-    "gpu_lower_to_ukernels.mlir",
-]
-
 iree_lit_test_suite(
     name = "lit",
     srcs = enforce_glob(
@@ -64,7 +57,6 @@ iree_lit_test_suite(
             "vector_reduction_to_gpu.mlir",
         ],
         include = ["*.mlir"],
-        exclude = tests_requiring_rocm,
     ),
     cfg = "//compiler:lit.cfg.py",
     tools = [
@@ -72,27 +64,3 @@ iree_lit_test_suite(
         "@llvm-project//llvm:FileCheck",
     ],
 )
-
-iree_cmake_extra_content(
-    content = """
-if(IREE_TARGET_BACKEND_ROCM)
-""",
-    inline = True,
-)
-
-iree_lit_test_suite(
-    name = "lit_requiring_rocm",
-    srcs = tests_requiring_rocm,
-    cfg = "//compiler:lit.cfg.py",
-    tools = [
-        "//tools:iree-opt",
-        "@llvm-project//llvm:FileCheck",
-    ],
-)
-
-iree_cmake_extra_content(
-    content = """
-endif()  # IREE_TARGET_BACKEND_ROCM
-""",
-    inline = True,
-)
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index fdc57244a28a..4b9853df8213 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -56,18 +56,4 @@ iree_lit_test_suite(
     iree-opt
 )
 
-if(IREE_TARGET_BACKEND_ROCM)
-
-iree_lit_test_suite(
-  NAME
-    lit_requiring_rocm
-  SRCS
-    "gpu_lower_to_ukernels.mlir"
-  TOOLS
-    FileCheck
-    iree-opt
-)
-
-endif()  # IREE_TARGET_BACKEND_ROCM
-
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index f3f9fd641982..5d4042975f29 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -6,7 +6,6 @@
 
 # Tests for common transforms.
 
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
 load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
@@ -15,12 +14,6 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
-tests_requiring_rocm = [
-    # This test checks that ops are lowered to ukernels, which only happens if
-    # the ukernels, which are part of the ROCM plugin, are found.
-    "ukernel_pipeline_transform.mlir",
-]
-
 iree_lit_test_suite(
     name = "lit",
     srcs = enforce_glob(
@@ -85,7 +78,7 @@ iree_lit_test_suite(
             "transform_dialect_codegen_foreach_to_gpu_spec.mlir",
             "transform_dialect_codegen_vector_distribution_spec.mlir",
             "transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir",
-        ] + tests_requiring_rocm,
+        ],
     ),
     cfg = "//compiler:lit.cfg.py",
     data = [
@@ -99,27 +92,3 @@ iree_lit_test_suite(
         "@llvm-project//llvm:FileCheck",
     ],
 )
-
-iree_cmake_extra_content(
-    content = """
-if(IREE_TARGET_BACKEND_ROCM)
-""",
-    inline = True,
-)
-
-iree_lit_test_suite(
-    name = "lit_requiring_rocm",
-    srcs = tests_requiring_rocm,
-    cfg = "//compiler:lit.cfg.py",
-    tools = [
-        "//tools:iree-opt",
-        "@llvm-project//llvm:FileCheck",
-    ],
-)
-
-iree_cmake_extra_content(
-    content = """
-endif()  # IREE_TARGET_BACKEND_ROCM
-""",
-    inline = True,
-)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index e51014cd19aa..fb9e495d9535 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -75,18 +75,4 @@ iree_lit_test_suite(
     transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
 )
 
-if(IREE_TARGET_BACKEND_ROCM)
-
-iree_lit_test_suite(
-  NAME
-    lit_requiring_rocm
-  SRCS
-    "ukernel_pipeline_transform.mlir"
-  TOOLS
-    FileCheck
-    iree-opt
-)
-
-endif()  # IREE_TARGET_BACKEND_ROCM
-
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###