Skip to content

Commit

Permalink
[Hexagon] - Fix 8-bit unsigned saturating downcasts for HVX (Fixes ha…
Browse files Browse the repository at this point in the history
…lide#7806) (halide#7825)

* Dump the IR more frequently in HexagonOptimize.cpp

* Fix 8bit unsigned saturating downcasts for HVX

We do not have a way of reliably lowering the following expression
to LLVM bitcode for HVX.

u8_sat(uint16x)

where uint16x is a vector (preferably an HVX double vector) with
element type uint16.
Since there is no native HVX instruction to do this, this patch
introduces two helper functions in hvx_128.ll to perform this
operation. One function interleaves its input (trunc_satub.vuh) and the
other does not (pack_satub.vuh)

This patch also removes the declarations of some intrinsics that are no
longer used in hvx_128.ll

* Make IR dump messages in HexagonOptimize.cpp consistent with those in CodeGen_Hexagon.cpp

* fix clang-format complaints

---------

Co-authored-by: Steven Johnson <srj@google.com>
  • Loading branch information
2 people authored and ardier committed Mar 3, 2024
1 parent 8be6e2f commit d73cd03
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 2 deletions.
16 changes: 16 additions & 0 deletions src/HexagonOptimize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -911,9 +911,11 @@ class OptimizePatterns : public IRMutator {

// Saturating narrowing casts. These may interleave later with trunc_sat.
{"halide.hexagon.pack_satub.vh", u8_sat(wild_i16x)},
{"halide.hexagon.pack_satub.vuh", u8_sat(wild_u16x)},
{"halide.hexagon.pack_satuh.vw", u16_sat(wild_i32x)},
{"halide.hexagon.pack_satb.vh", i8_sat(wild_i16x)},
{"halide.hexagon.pack_sath.vw", i16_sat(wild_i32x)},
{"halide.hexagon.pack_satuh.vuw", u16_sat(wild_u32x)},

// We don't have a vpack equivalent to this one, so we match it directly.
{"halide.hexagon.trunc_satuh.vuw", u16_sat(wild_u32x), Pattern::DeinterleaveOp0},
Expand Down Expand Up @@ -1702,8 +1704,10 @@ class EliminateInterleaves : public IRMutator {
{"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"},
{"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"},
{"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"},
{"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"},
{"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"},
{"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"},
{"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"},
};

// The reverse mapping of the above.
Expand Down Expand Up @@ -2202,26 +2206,38 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) {
// We need to redo intrinsic matching due to simplification that has
// happened after the end of target independent lowering.
s = find_intrinsics(s);
debug(4) << "Hexagon: Lowering after find_intrinsics\n"
<< s << "\n";

// Hexagon prefers widening shifts to be expressed as multiplies to
// hopefully hit compound widening multiplies.
s = distribute_shifts(s, /* multiply_adds */ false);
debug(4) << "Hexagon: Lowering after DistributeShiftsAsMuls\n"
<< s << "\n";

// Pattern match VectorReduce IR node. Handle vector reduce instructions
// before OptimizePatterns to prevent being mutated by patterns like
// (v0 + v1 * c) -> add_mpy
s = VectorReducePatterns().mutate(s);
debug(4) << "Hexagon: Lowering after VectorReducePatterns\n"
<< s << "\n";

// Peephole optimize for Hexagon instructions. These can generate
// interleaves and deinterleaves alongside the HVX intrinsics.
s = OptimizePatterns(t).mutate(s);
debug(4) << "Hexagon: Lowering after OptimizePatterns\n"
<< s << "\n";

// Try to eliminate any redundant interleave/deinterleave pairs.
s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s);
debug(4) << "Hexagon: Lowering after EliminateInterleaves\n"
<< s << "\n";

// There may be interleaves left over that we can fuse with other
// operations.
s = FuseInterleaves().mutate(s);
debug(4) << "Hexagon: Lowering after FuseInterleaves\n"
<< s << "\n";
return s;
}

Expand Down
50 changes: 48 additions & 2 deletions src/runtime/hvx_128.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

declare void @llvm.trap() noreturn nounwind

declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>)
declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>)
declare <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32>, <32 x i32>, i32)
declare <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32>, <32 x i32>, i32)
declare <32 x i32> @llvm.hexagon.V6.vasrwhsat.128B(<32 x i32>, <32 x i32>, i32)
declare <32 x i32> @llvm.hexagon.V6.vsathub.128B(<32 x i32>, <32 x i32>)
declare <32 x i32> @llvm.hexagon.V6.vsatuwuh.128B(<32 x i32>, <32 x i32>)

define weak_odr <64 x i32> @halide.hexagon.interleave.vw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
%e = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
Expand Down Expand Up @@ -354,6 +354,52 @@ define weak_odr <64 x i16> @halide.hexagon.trunc_satuh.vw(<64 x i32> %arg) nounw
ret <64 x i16> %r
}

declare <32 x i32> @llvm.hexagon.V6.vpackeb.128B(<32 x i32>, <32 x i32>)
declare <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32>, <32 x i32>)
declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32)

; There is no saturating downcast for unsigned 16-bit element types, so the
; operation is expanded in bitcode here: both halves of the input vector pair
; are clamped to 255 with the unsigned halfword min (vminuh) and the clamped
; halves are then narrowed with vpackeb.
; Note: pack_satub.vuh does not interleave its input.
define weak_odr <128 x i8> @halide.hexagon.pack_satub.vuh(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  ; Split the vector pair into its low and high single vectors.
  %arg_lo = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  %arg_hi = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  ; Upper bound of the u8 range, splatted with lvsplath.
  %limit = call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 255)
  ; Clamp each half against the bound before narrowing.
  %clamped_lo = call <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32> %arg_lo, <32 x i32> %limit)
  %clamped_hi = call <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32> %arg_hi, <32 x i32> %limit)
  ; Narrow the two clamped halves into a single vector (high half first).
  %packed = call <32 x i32> @llvm.hexagon.V6.vpackeb.128B(<32 x i32> %clamped_hi, <32 x i32> %clamped_lo)
  %result = bitcast <32 x i32> %packed to <128 x i8>
  ret <128 x i8> %result
}

; halide.hexagon.pack_satuh.vuw cannot use the same strategy as
; halide.hexagon.pack_satub.vuh, because HVX has no native min intrinsic for
; unsigned words like it does for unsigned half-words. A signed min of an
; unsigned word with 65535 would treat unsigned words > INT32_MAX as negative
; numbers, thus yielding the wrong result of 0 on subsequent saturation
; instead of 65535.
; Instead, the input double vector is deinterleaved first and then narrowed
; with trunc_satuh.vuw, which is natively supported by the vsat instruction
; (vsatuwuh intrinsic). This is also the reason halide.hexagon.trunc_satuh.vuw
; does not have to be provided in the way that halide.hexagon.trunc_satub.vuh
; is provided below.
define weak_odr <64 x i16> @halide.hexagon.pack_satuh.vuw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  ; Split the vector pair into its low and high single vectors.
  %arg_hi = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  %arg_lo = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  ; Deinterleave the pair (vdealvdd with control value -4).
  %dealt = call <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32> %arg_hi, <32 x i32> %arg_lo, i32 -4)
  %odd = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %dealt)
  %even = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %dealt)
  ; Saturating unsigned word -> unsigned half-word narrow of the two halves.
  %sat = call <32 x i32> @llvm.hexagon.V6.vsatuwuh.128B(<32 x i32> %odd, <32 x i32> %even)
  %result = bitcast <32 x i32> %sat to <64 x i16>
  ret <64 x i16> %result
}

declare <32 x i32> @llvm.hexagon.V6.vasruhubsat.128B(<32 x i32>, <32 x i32>, i32)
; Interleaving counterpart of halide.hexagon.pack_satub.vuh: it performs the
; same saturating downcast, except that it interleaves its input.
define weak_odr <128 x i8> @halide.hexagon.trunc_satub.vuh(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  ; Split the vector pair into its low and high single vectors.
  %odd = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  %even = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  ; vasruhubsat is called with a shift amount of 0 -- presumably so that only
  ; its saturating narrow takes effect (confirm against the HVX reference).
  %sat = call <32 x i32> @llvm.hexagon.V6.vasruhubsat.128B(<32 x i32> %odd, <32 x i32> %even, i32 0)
  %result = bitcast <32 x i32> %sat to <128 x i8>
  ret <128 x i8> %result
}

declare void @llvm.hexagon.V6.vgathermh.128B(i8*, i32, i32, <32 x i32>)
declare void @llvm.hexagon.V6.vgathermw.128B(i8*, i32, i32, <32 x i32>)

Expand Down
6 changes: 6 additions & 0 deletions test/correctness/simd_op_check_hvx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,14 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
check("vlut16(v*.b,v*.h,r*)", hvx_width / 2, in_u32(clamp(u16_1, 0, 15)));

check("v*.ub = vpack(v*.h,v*.h):sat", hvx_width / 1, u8_sat(i16_1));
check("v*.b = vpacke(v*.h,v*.h)", hvx_width / 1, u8_sat(u16_1));
check("v*.b = vpack(v*.h,v*.h):sat", hvx_width / 1, i8_sat(i16_1));
check("v*.uh = vpack(v*.w,v*.w):sat", hvx_width / 2, u16_sat(i32_1));
// Due to the unavailability of an unsigned word "min" operation in HVX,
// we deinterleave a vector pair and then do a saturating downcast that interleaves
// (intrinsic:vsatuwuh). See halide.hexagon.pack_satuh.vuw in hvx_128.ll
// for a more detailed explanation.
check("v*.uh = vsat(v*.uw,v*.uw)", hvx_width / 2, u16_sat(u32_1));
check("v*.h = vpack(v*.w,v*.w):sat", hvx_width / 2, i16_sat(i32_1));

// vpack doesn't interleave its inputs, which means it doesn't
Expand Down

0 comments on commit d73cd03

Please sign in to comment.