[AArch64][SVE] Add codegen support for partial reduction lowering to wide add instructions (#114406)

For partial reductions where the number of elements is halved (the input has twice as many elements as the accumulator), a pair of wide add instructions can be used.
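The transform can be illustrated with a minimal scalar model in plain C++ (not part of this commit; the helper names addwb/addwt and the fixed 128-bit vector length are assumptions, modelling the signed nxv4i32 to nxv2i64 case): the wide-add-bottom step folds the even-indexed input elements into the accumulator and the wide-add-top step folds the odd-indexed ones, which is a valid realisation of the partial reduction because the intrinsic does not pin input elements to particular accumulator lanes.

// A minimal scalar model (plain C++, not LLVM code) of the lowering, assuming
// a fixed 128-bit vector length for the signed nxv4i32 -> nxv2i64 case.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

using Acc = std::array<int64_t, 2>;  // models <vscale x 2 x i64> at 128-bit VL
using In  = std::array<int32_t, 4>;  // models <vscale x 4 x i32> at 128-bit VL

// Models a wide-add-bottom (saddwb): widen the even-indexed ("bottom") input
// elements and add them to the accumulator lanes.
static Acc addwb(Acc acc, const In &in) {
  for (std::size_t i = 0; i < acc.size(); ++i)
    acc[i] += static_cast<int64_t>(in[2 * i]);
  return acc;
}

// Models a wide-add-top (saddwt): widen the odd-indexed ("top") input
// elements and add them to the accumulator lanes.
static Acc addwt(Acc acc, const In &in) {
  for (std::size_t i = 0; i < acc.size(); ++i)
    acc[i] += static_cast<int64_t>(in[2 * i + 1]);
  return acc;
}

int main() {
  Acc acc = {100, 200};
  In in = {1, 2, 3, 4};

  // The lowered form: one wide-add-bottom followed by one wide-add-top.
  Acc lowered = addwt(addwb(acc, in), in);

  // The partial reduction only requires that every input element be added
  // into some accumulator lane exactly once, so the overall sum must match.
  int64_t total = 0;
  for (int64_t v : lowered)
    total += v;
  assert(total == 100 + 200 + 1 + 2 + 3 + 4);
  return 0;
}

The even/odd split applies element-wise across the whole register, so the same argument carries over unchanged to scalable vector lengths.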
1 parent e05d91b, commit c3c2e1e
Showing 2 changed files with 199 additions and 2 deletions.
llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll (141 additions, 0 deletions)
@@ -0,0 +1,141 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s

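; The cases below use a full SVE input vector with an accumulator of half the
; element count at twice the element width; they lower to a wide-add-bottom /
; wide-add-top pair.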
define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
; CHECK-LABEL: signed_wide_add_nxv4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddwb z0.d, z0.d, z1.s
; CHECK-NEXT:    saddwt z0.d, z0.d, z1.s
; CHECK-NEXT:    ret
entry:
  %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
; CHECK-LABEL: unsigned_wide_add_nxv4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddwb z0.d, z0.d, z1.s
; CHECK-NEXT:    uaddwt z0.d, z0.d, z1.s
; CHECK-NEXT:    ret
entry:
  %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
; CHECK-LABEL: signed_wide_add_nxv8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddwb z0.s, z0.s, z1.h
; CHECK-NEXT:    saddwt z0.s, z0.s, z1.h
; CHECK-NEXT:    ret
entry:
  %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
; CHECK-LABEL: unsigned_wide_add_nxv8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddwb z0.s, z0.s, z1.h
; CHECK-NEXT:    uaddwt z0.s, z0.s, z1.h
; CHECK-NEXT:    ret
entry:
  %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
; CHECK-LABEL: signed_wide_add_nxv16i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddwb z0.h, z0.h, z1.b
; CHECK-NEXT:    saddwt z0.h, z0.h, z1.b
; CHECK-NEXT:    ret
entry:
  %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
  %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
  ret <vscale x 8 x i16> %partial.reduce
}

define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
; CHECK-LABEL: unsigned_wide_add_nxv16i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddwb z0.h, z0.h, z1.b
; CHECK-NEXT:    uaddwt z0.h, z0.h, z1.b
; CHECK-NEXT:    ret
entry:
  %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
  %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
  ret <vscale x 8 x i16> %partial.reduce
}

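; The cases below use types narrower than a full SVE register, so the extend
; is materialised and the reduction is lowered with unpacks and adds instead
; of the wide-add pair.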
define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
; CHECK-LABEL: signed_wide_add_nxv4i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sxth z1.s, p0/m, z1.s
; CHECK-NEXT:    uunpklo z2.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %input.wide = sext <vscale x 4 x i16> %input to <vscale x 4 x i32>
  %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
  ret <vscale x 2 x i32> %partial.reduce
}

define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
; CHECK-LABEL: unsigned_wide_add_nxv4i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.s, z1.s, #0xffff
; CHECK-NEXT:    uunpklo z2.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %input.wide = zext <vscale x 4 x i16> %input to <vscale x 4 x i32>
  %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
  ret <vscale x 2 x i32> %partial.reduce
}

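; The cases below use types that span two SVE registers; the operation is
; split and lowered with unpacks and adds rather than wide adds.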
define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
; CHECK-LABEL: signed_wide_add_nxv8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sunpkhi z4.d, z2.s
; CHECK-NEXT:    sunpklo z2.d, z2.s
; CHECK-NEXT:    sunpkhi z5.d, z3.s
; CHECK-NEXT:    sunpklo z3.d, z3.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z4.d
; CHECK-NEXT:    add z0.d, z3.d, z0.d
; CHECK-NEXT:    add z1.d, z5.d, z1.d
; CHECK-NEXT:    ret
entry:
  %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
; CHECK-LABEL: unsigned_wide_add_nxv8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uunpkhi z4.d, z2.s
; CHECK-NEXT:    uunpklo z2.d, z2.s
; CHECK-NEXT:    uunpkhi z5.d, z3.s
; CHECK-NEXT:    uunpklo z3.d, z3.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z4.d
; CHECK-NEXT:    add z0.d, z3.d, z0.d
; CHECK-NEXT:    add z1.d, z5.d, z1.d
; CHECK-NEXT:    ret
entry:
  %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
  ret <vscale x 4 x i64> %partial.reduce
}