[Hexagon] - Fix 8-bit unsigned saturating downcasts for HVX (Fixes ha…

…lide#7806) (halide#7825) * Dump the IR more frequently in HexagonOptimize.cpp * Fix 8bit unsigned saturating downcasts for HVX We do not have a way of reliably lowering the following expression to LLVM bitcode for HVX. u8_sat(uint16x) where uint16x is a vector (preferably a HVX double vector) with element type uint16. Since there is no native HVX instruction to do this, this patch introduces two helper functions in hvx_128.ll to perform this operation. One function interleaves its input (trunc_satub.vuh) and the other does not (pack_satub.vuh). This patch also removes declarations of some intrinsics that are no longer used in hvx_128.ll * Make IR dump messages in HexagonOptimize.cpp consistent with those in CodeGen_Hexagon.cpp * fix clang-format complaints --------- Co-authored-by: Steven Johnson <srj@google.com>
ardier · Mar 3, 2024 · d73cd03 · d73cd03
1 parent 8be6e2f
commit d73cd03
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 2 deletions.
diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp
@@ -911,9 +911,11 @@ class OptimizePatterns : public IRMutator {
 
  // Saturating narrowing casts. These may interleave later with trunc_sat.
  {"halide.hexagon.pack_satub.vh", u8_sat(wild_i16x)},
+ {"halide.hexagon.pack_satub.vuh", u8_sat(wild_u16x)},
  {"halide.hexagon.pack_satuh.vw", u16_sat(wild_i32x)},
  {"halide.hexagon.pack_satb.vh", i8_sat(wild_i16x)},
  {"halide.hexagon.pack_sath.vw", i16_sat(wild_i32x)},
+ {"halide.hexagon.pack_satuh.vuw", u16_sat(wild_u32x)},
 
  // We don't have a vpack equivalent to this one, so we match it directly.
  {"halide.hexagon.trunc_satuh.vuw", u16_sat(wild_u32x), Pattern::DeinterleaveOp0},
@@ -1702,8 +1704,10 @@ class EliminateInterleaves : public IRMutator {
  {"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"},
  {"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"},
  {"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"},
+ {"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"},
  {"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"},
  {"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"},
+ {"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"},
  };
 
  // The reverse mapping of the above.
@@ -2202,26 +2206,38 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) {
  // We need to redo intrinsic matching due to simplification that has
  // happened after the end of target independent lowering.
  s = find_intrinsics(s);
+ debug(4) << "Hexagon: Lowering after find_intrinsics\n"
+ << s << "\n";
 
  // Hexagon prefers widening shifts to be expressed as multiplies to
  // hopefully hit compound widening multiplies.
  s = distribute_shifts(s, /* multiply_adds */ false);
+ debug(4) << "Hexagon: Lowering after DistributeShiftsAsMuls\n"
+ << s << "\n";
 
  // Pattern match VectorReduce IR node. Handle vector reduce instructions
  // before OptimizePatterns to prevent being mutated by patterns like
  // (v0 + v1 * c) -> add_mpy
  s = VectorReducePatterns().mutate(s);
+ debug(4) << "Hexagon: Lowering after VectorReducePatterns\n"
+ << s << "\n";
 
  // Peephole optimize for Hexagon instructions. These can generate
  // interleaves and deinterleaves alongside the HVX intrinsics.
  s = OptimizePatterns(t).mutate(s);
+ debug(4) << "Hexagon: Lowering after OptimizePatterns\n"
+ << s << "\n";
 
  // Try to eliminate any redundant interleave/deinterleave pairs.
  s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s);
+ debug(4) << "Hexagon: Lowering after EliminateInterleaves\n"
+ << s << "\n";
 
  // There may be interleaves left over that we can fuse with other
  // operations.
  s = FuseInterleaves().mutate(s);
+ debug(4) << "Hexagon: Lowering after FuseInterleaves\n"
+ << s << "\n";
  return s;
 }
 

diff --git a/src/runtime/hvx_128.ll b/src/runtime/hvx_128.ll
@@ -1,11 +1,11 @@
+
 declare void @llvm.trap() noreturn nounwind
 
 declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>)
 declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>)
 declare <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32>, <32 x i32>, i32)
 declare <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32>, <32 x i32>, i32)
-declare <32 x i32> @llvm.hexagon.V6.vasrwhsat.128B(<32 x i32>, <32 x i32>, i32)
-declare <32 x i32> @llvm.hexagon.V6.vsathub.128B(<32 x i32>, <32 x i32>)
+declare <32 x i32> @llvm.hexagon.V6.vsatuwuh.128B(<32 x i32>, <32 x i32>)
 
 define weak_odr <64 x i32> @halide.hexagon.interleave.vw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  %e = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
@@ -354,6 +354,52 @@ define weak_odr <64 x i16> @halide.hexagon.trunc_satuh.vw(<64 x i32> %arg) nounw
  ret <64 x i16> %r
 }
 
+declare <32 x i32> @llvm.hexagon.V6.vpackeb.128B(<32 x i32>, <32 x i32>)
+declare <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32>, <32 x i32>)
+declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32)
+
+; We do not have saturating downcasts of unsigned 16-bit types. So, we expand these
+; in bitcode here.
+; Note: pack_satub.vuh doesn't interleave its input.
+define weak_odr <128 x i8> @halide.hexagon.pack_satub.vuh(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
+ %max = call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 255)
+ %lo = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
+ %hi = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
+ %lo_sat = call <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32> %lo, <32 x i32> %max)
+ %hi_sat = call <32 x i32> @llvm.hexagon.V6.vminuh.128B(<32 x i32> %hi, <32 x i32> %max)
+ %r_32 = call <32 x i32> @llvm.hexagon.V6.vpackeb.128B(<32 x i32> %hi_sat, <32 x i32> %lo_sat)
+ %r = bitcast <32 x i32> %r_32 to <128 x i8>
+ ret <128 x i8> %r
+}
+
+; We cannot use the same strategy for halide.hexagon.pack_satuh.vuw as we did for halide.hexagon.pack_satub.vuh
+; because HVX doesn't have a native min intrinsic for unsigned words like it does for unsigned half-words.
+; Doing a signed min of an unsigned word with 65535 will make unsigned words > INT32_MAX become negative
+; numbers, thus yielding the wrong result of 0 on subsequent saturation instead of 65535.
+; Instead, we deinterleave the input double vector first and then use trunc_satuh.vuw. The latter is natively
+; supported by the vsat instruction (vsatuwuh intrinsic). This is also the reason we don't have to
+; provide halide.hexagon.trunc_satuh.vuw in the way that we had to provide halide.hexagon.trunc_satub.vuh below.
+define weak_odr <64 x i16> @halide.hexagon.pack_satuh.vuw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
+ %lo = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
+ %hi = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
+ %deal_dv = call <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32> %hi, <32 x i32> %lo, i32 -4)
+ %e = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %deal_dv)
+ %o = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %deal_dv)
+ %r_32 = call <32 x i32> @llvm.hexagon.V6.vsatuwuh.128B(<32 x i32> %o, <32 x i32> %e)
+ %r = bitcast <32 x i32> %r_32 to <64 x i16>
+ ret <64 x i16> %r
+}
+
+declare <32 x i32> @llvm.hexagon.V6.vasruhubsat.128B(<32 x i32>, <32 x i32>, i32)
+; This is the same as halide.hexagon.pack_satub.vuh except it interleaves its input.
+define weak_odr <128 x i8> @halide.hexagon.trunc_satub.vuh(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
+ %e = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
+ %o = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
+ %r_32 = call <32 x i32> @llvm.hexagon.V6.vasruhubsat.128B(<32 x i32> %o, <32 x i32> %e, i32 0)
+ %r = bitcast <32 x i32> %r_32 to <128 x i8>
+ ret <128 x i8> %r
+}
+
 declare void @llvm.hexagon.V6.vgathermh.128B(i8*, i32, i32, <32 x i32>)
 declare void @llvm.hexagon.V6.vgathermw.128B(i8*, i32, i32, <32 x i32>)
 

diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp
@@ -301,8 +301,14 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
  check("vlut16(v*.b,v*.h,r*)", hvx_width / 2, in_u32(clamp(u16_1, 0, 15)));
 
  check("v*.ub = vpack(v*.h,v*.h):sat", hvx_width / 1, u8_sat(i16_1));
+ check("v*.b = vpacke(v*.h,v*.h)", hvx_width / 1, u8_sat(u16_1));
  check("v*.b = vpack(v*.h,v*.h):sat", hvx_width / 1, i8_sat(i16_1));
  check("v*.uh = vpack(v*.w,v*.w):sat", hvx_width / 2, u16_sat(i32_1));
+ // Due to the unavailability of an unsigned word "min" operation in HVX,
+ // we deinterleave a vector pair and then do a saturating downcast that interleaves
+ // (intrinsic:vsatuwuh). See halide.hexagon.pack_satuh.vuw in hvx_128.ll
+ // for a more detailed explanation.
+ check("v*.uh = vsat(v*.uw,v*.uw)", hvx_width / 2, u16_sat(u32_1));
  check("v*.h = vpack(v*.w,v*.w):sat", hvx_width / 2, i16_sat(i32_1));
 
  // vpack doesn't interleave its inputs, which means it doesn't