Skip to content

Commit

Permalink
review-comments
Browse files Browse the repository at this point in the history
Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
  • Loading branch information
bjacob committed Dec 3, 2024
1 parent e796021 commit ea29fad
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 132 deletions.
26 changes: 26 additions & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")

package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

iree_lit_test_suite(
name = "lit",
srcs = [
"gpu_lower_to_ukernels.mlir",
"ukernel_pipeline_transform.mlir",
],
cfg = "//compiler:lit.cfg.py",
tools = [
"//tools:iree-opt",
"@llvm-project//llvm:FileCheck",
],
)
31 changes: 27 additions & 4 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
# NOTE: Bazel testing of this backend is impossible because there is no way
# for Bazel to bundle the AMD bitcode files that the backend depends on. Users
# of the compiler can pass explicit flags, but we prefer that default tests
# exercise default flags, which cannot be supported properly on Bazel builds.
################################################################################
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
# compiler/plugins/target/ROCM/test/BUILD.bazel #
# #
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
# CMake-only content. #
# #
# To disable autogeneration for this file entirely, delete this header. #
################################################################################

iree_add_all_subdirs()

iree_lit_test_suite(
NAME
lit
SRCS
"gpu_lower_to_ukernels.mlir"
"ukernel_pipeline_transform.mlir"
TOOLS
FileCheck
iree-opt
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###

# NOTE: The following tests are CMake-only because they depend on AMD device
# bitcode libraries that are provided by custom CMake code in target/ROCM.
# By contrast, the above tests that only require ukernel bitcode are part of the
# Bazel build because ukernel bitcode is something that we generate ourselves.

iree_lit_test_suite(
NAME
lit
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1

#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand Down Expand Up @@ -35,9 +34,8 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes

// -----

#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor<1x1x1xi64> attributes {
hal.executable.target = #gfx942_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -63,9 +61,8 @@ func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor

// -----

#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor<4xi64> attributes {
hal.executable.target = #gfx942_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -91,9 +88,8 @@ func.func @argmax_2d_non_unit_parallel_f32i64(%arg0 : tensor<4x?xf32>) -> tensor

// -----

#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi64> attributes {
hal.executable.target = #gfx942_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0 = arith.constant 0 : index
%c0_i64 = arith.constant 0 : i64
Expand Down Expand Up @@ -121,9 +117,8 @@ func.func @argmax_2d_dyn_parallel_f32i64(%arg0 : tensor<?x?xf32>) -> tensor<?xi6

// -----

#gfx942_target_ukernels_none = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "none"}>
func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target_ukernels_none
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "none"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -149,9 +144,8 @@ func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64>

// -----

#gfx942_target_ukernels_argmax = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "argmax"}>
func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target_ukernels_argmax
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -177,9 +171,8 @@ func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor

// -----

#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo,argmax,bar"}>
func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo,argmax,bar"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand Down Expand Up @@ -207,9 +200,8 @@ func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -

// -----

#gfx942_target_ukernels_foo_argmax_bar = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "foo"}>
func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target_ukernels_foo_argmax_bar
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "foo"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -236,9 +228,8 @@ func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1x
// -----

// Currently we do only handle -Inf case as initial values.
#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0.0 : f32
Expand Down Expand Up @@ -268,9 +259,8 @@ func.func @argmax_2d_f32i64_not_neg_inf_init(%arg0 : tensor<1x?xf32>) -> tensor<
// Currently just picking out popular chips to support,
// to minimize compile time and space.

#gfx908_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx908", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx908_target
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand Down Expand Up @@ -298,11 +288,15 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x

// Test user-provided bitcode in the source IR.

#gfx942_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [], max_workgroup_sizes = [], max_thread_count_per_workgroup = 0, max_workgroup_memory_bytes = 0, max_workgroup_counts = []>>, ukernels = "all"}>
func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #gfx942_target,
// Dummy bitcode with an unusual length of 12.
hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<"0x4243C0DE0123456789ABCDEF"> : tensor<12xi8>}>]
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>,
// Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature.
hal.executable.objects = [
#hal.executable.object<{
path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
}>
]
} {
%c0_i64 = arith.constant 0 : i64
%cst = arith.constant 0xFF800000 : f32
Expand All @@ -327,7 +321,13 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
// CHECK-DAG: %[[C1_index:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C0_i64:.+]] = arith.constant 0
// CHECK-DAG: %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]]
// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [#hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc", data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>}>]} "iree_uk_amdgpu_argmax_f32i64"
// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {
// CHECK-SAME: hal.executable.objects = [
// CHECK-SAME: #hal.executable.object<{
// CHECK-SAME: path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
// CHECK-SAME: data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
// CHECK-SAME: }>
// CHECK-SAME: ]} "iree_uk_amdgpu_argmax_f32i64"
// CHECK-SAME: ins(%[[ARG0]] :
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: return %[[MICRO_KERNEL]]
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> ()>
func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
func.func @argmax_1d_f16i64() attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
} {
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0xFC00 : f16
%c0_i64 = arith.constant 0 : i64
Expand Down Expand Up @@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
func.func @argmax_2d_f32i64() attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
} {
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0xFF800000 : f32
%c0_i64 = arith.constant 0 : i64
Expand Down Expand Up @@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> ()>
func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
func.func @no_ukernel_argmax_1d_f16i64() attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb">
} {
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0xFC00 : f16
%c0_i64 = arith.constant 0 : i64
Expand Down Expand Up @@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>, <WMMA_I32_16x16x16_I8>], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 8192>>, ukernels = "argmax"}>
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> ()>
func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
func.func @not_neg_inf_init_argmax_1d() attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
} {
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f16
%c0_i64 = arith.constant 0 : i64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,11 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
FnNameAndDefAttrs fn =
getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr);
if (!fn) {
return rewriter.notifyMatchFailure(op, "No ukernels on this backend.");
return rewriter.notifyMatchFailure(op, "no ukernels on this backend");
}

if (!hasUkernel(targetAttr, ukernelName)) {
return rewriter.notifyMatchFailure(op, "Ukernel not enabled.");
return rewriter.notifyMatchFailure(op, "ukernel not enabled");
}

// Currently only support argmax where parallel dims are 1.
Expand All @@ -176,7 +176,7 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
IREE::HAL::ExecutableObjectAttr bitcodeObject =
getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name);
if (!bitcodeObject) {
return rewriter.notifyMatchFailure(op, "No ukernel bitcode for this op.");
return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op");
}
Location loc = op.getLoc();
// Currently only support 1D reduction, where reduc is on fastest dim.
Expand Down
Loading

0 comments on commit ea29fad

Please sign in to comment.