review-comments

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
iree-org · Dec 3, 2024 · ea29fad · ea29fad
1 parent e796021
commit ea29fad
Show file tree

Hide file tree

Showing 9 changed files with 94 additions and 132 deletions.
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -0,0 +1,26 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+
+package(
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_lit_test_suite(
+    name = "lit",
+    srcs = [
+        "gpu_lower_to_ukernels.mlir",
+        "ukernel_pipeline_transform.mlir",
+    ],
+    cfg = "//compiler:lit.cfg.py",
+    tools = [
+        "//tools:iree-opt",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -1,10 +1,33 @@
-# NOTE: Bazel testing of this backend is impossible because there is no way
-# for Bazel to bundle the AMD bitcode files that the backend depends on. Users
-# of the compiler can pass explicit flags, but we prefer that default tests
-# exercise default flags, which cannot be supported properly on Bazel builds.
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# compiler/plugins/target/ROCM/test/BUILD.bazel                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
 
 iree_add_all_subdirs()
 
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "gpu_lower_to_ukernels.mlir"
+    "ukernel_pipeline_transform.mlir"
+  TOOLS
+    FileCheck
+    iree-opt
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+# NOTE: The following tests are CMake-only because they depend on AMD device
+# bitcode libraries that are provided by custom CMake code in target/ROCM.
+# By contrast, the above tests that only require ukernel bitcode are part of the
+# Bazel build because ukernel bitcode is something that we generate ourselves.
+
 iree_lit_test_suite(
   NAME
     lit

diff --git a/...ommon/GPU/test/gpu_lower_to_ukernels.mlir → ...rget/ROCM/test/gpu_lower_to_ukernels.mlir b/...ommon/GPU/test/gpu_lower_to_ukernels.mlir → ...rget/ROCM/test/gpu_lower_to_ukernels.mlir
@@ -1,9 +1,8 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -35,9 +34,8 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor<1x1x1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -63,9 +61,8 @@ func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor<4xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -91,9 +88,8 @@ func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
@@ -121,9 +117,8 @@ func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi6
 
 // -----
 
-#gfx942_target_ukernels_none = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "none"}>
 func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_none
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "none"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -149,9 +144,8 @@ func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64>
 
 // -----
 
-#gfx942_target_ukernels_argmax = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "argmax"}>
 func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_argmax
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -177,9 +171,8 @@ func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor
 
 // -----
 
-#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo,argmax,bar"}>
 func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo,argmax,bar"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -207,9 +200,8 @@ func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -
 
 // -----
 
-#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo"}>
 func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -236,9 +228,8 @@ func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1x
 // -----
 
 // Currently we do only handle -Inf case as initial values.
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0.0 : f32
@@ -268,9 +259,8 @@ func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<
 //       Currently just picking out popular chips to support,
 //       to minimize compile time and space.
 
-#gfx908_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx908", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx908_target
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -298,11 +288,15 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x
 
 // Test user-provided bitcode in the source IR.
 
-#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
-  hal.executable.target = #gfx942_target,
-  // Dummy bitcode with an unusual length of 12.
-  hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<"0x4243C0DE0123456789ABCDEF"> : tensor<12xi8>}>]
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>,
+  // Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature.
+  hal.executable.objects = [
+    #hal.executable.object<{
+      path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+      data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+    }>
+  ]
 } {
   %c0_i64 = arith.constant 0 : i64
   %cst = arith.constant 0xFF800000 : f32
@@ -327,7 +321,13 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
 //  CHECK-DAG:   %[[C1_index:.+]] = arith.constant 1 : index
 //  CHECK-DAG:   %[[C0_i64:.+]] = arith.constant 0
 //  CHECK-DAG:   %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]]
-//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>}>]} "iree_uk_amdgpu_argmax_f32i64"
+//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {
+// CHECK-SAME:     hal.executable.objects = [
+// CHECK-SAME:       #hal.executable.object<{
+// CHECK-SAME:         path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+// CHECK-SAME:         data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+// CHECK-SAME:       }>
+// CHECK-SAME:     ]} "iree_uk_amdgpu_argmax_f32i64"
 // CHECK-SAME:       ins(%[[ARG0]] :
 // CHECK-SAME:       outs(%[[FILL]] :
 //      CHECK:   return %[[MICRO_KERNEL]]
diff --git a/...MGPU/test/ukernel_pipeline_transform.mlir → ...ROCM/test/ukernel_pipeline_transform.mlir b/...MGPU/test/ukernel_pipeline_transform.mlir → ...ROCM/test/ukernel_pipeline_transform.mlir
@@ -4,10 +4,11 @@
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
-func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_2d_f32i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFF800000 : f32
   %c0_i64 = arith.constant 0 : i64
@@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @no_ukernel_argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @not_neg_inf_init_argmax_1d() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0.000000e+00 : f16
   %c0_i64 = arith.constant 0 : i64

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
@@ -146,11 +146,11 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
   FnNameAndDefAttrs fn =
       getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr);
   if (!fn) {
-    return rewriter.notifyMatchFailure(op, "No ukernels on this backend.");
+    return rewriter.notifyMatchFailure(op, "no ukernels on this backend");
   }
 
   if (!hasUkernel(targetAttr, ukernelName)) {
-    return rewriter.notifyMatchFailure(op, "Ukernel not enabled.");
+    return rewriter.notifyMatchFailure(op, "ukernel not enabled");
   }
 
   // Currently only support argmax where parallel dims are 1.
@@ -176,7 +176,7 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
   IREE::HAL::ExecutableObjectAttr bitcodeObject =
       getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name);
   if (!bitcodeObject) {
-    return rewriter.notifyMatchFailure(op, "No ukernel bitcode for this op.");
+    return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op");
   }
   Location loc = op.getLoc();
   // Currently only support 1D reduction, where reduc is on fastest dim.