Skip to content

Commit ec941a4

Browse files
authored
[NVPTX] Legalize ctpop and ctlz in operation legalization (llvm#130668)
By pulling the truncates and extensions out of operations during operation legalization, we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS), in real programs the exposure of these truncates can allow for more optimization.
1 parent 48b1991 commit ec941a4

File tree

6 files changed

+324
-189
lines changed

6 files changed

+324
-189
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -5113,7 +5113,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
51135113
DAG.getConstant(NVT.getSizeInBits() -
51145114
OVT.getSizeInBits(), dl, NVT));
51155115
}
5116-
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
5116+
Results.push_back(
5117+
DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1, SDNodeFlags::NoWrap));
51175118
break;
51185119
}
51195120
case ISD::CTLZ_ZERO_UNDEF: {

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+22-9
Original file line numberDiff line numberDiff line change
@@ -764,16 +764,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
764764
// Custom handling for i8 intrinsics
765765
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
766766

767-
for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
768-
setOperationAction(ISD::ABS, Ty, Legal);
769-
setOperationAction(ISD::SMIN, Ty, Legal);
770-
setOperationAction(ISD::SMAX, Ty, Legal);
771-
setOperationAction(ISD::UMIN, Ty, Legal);
772-
setOperationAction(ISD::UMAX, Ty, Legal);
767+
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
768+
{MVT::i16, MVT::i32, MVT::i64}, Legal);
773769

774-
setOperationAction(ISD::CTPOP, Ty, Legal);
775-
setOperationAction(ISD::CTLZ, Ty, Legal);
776-
}
770+
setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i16,
771+
Promote);
772+
setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
773+
setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom);
777774

778775
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
779776
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2748,6 +2745,19 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
27482745
return Op;
27492746
}
27502747

2748+
// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2749+
// Lower these into a node returning the correct type which is zero-extended
2750+
// back to the correct size.
2751+
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
2752+
SDValue V = Op->getOperand(0);
2753+
assert(V.getValueType() == MVT::i64 &&
2754+
"Unexpected CTLZ/CTPOP type to legalize");
2755+
2756+
SDLoc DL(Op);
2757+
SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2758+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2759+
}
2760+
27512761
SDValue
27522762
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
27532763
switch (Op.getOpcode()) {
@@ -2833,6 +2843,9 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
28332843
case ISD::FMUL:
28342844
// Used only for bf16 on SM80, where we select fma for non-ftz operation
28352845
return PromoteBinOpIfF32FTZ(Op, DAG);
2846+
case ISD::CTPOP:
2847+
case ISD::CTLZ:
2848+
return lowerCTLZCTPOP(Op, DAG);
28362849

28372850
default:
28382851
llvm_unreachable("Custom lowering not defined for operation");

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

+11-60
Original file line numberDiff line numberDiff line change
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
32673267
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
32683268
(SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
32693269

3270-
// Count leading zeros
32713270
let hasSideEffects = false in {
3272-
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3273-
"clz.b32 \t$d, $a;", []>;
3274-
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3275-
"clz.b64 \t$d, $a;", []>;
3271+
foreach RT = [I32RT, I64RT] in {
3272+
// Count leading zeros
3273+
def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
3274+
"clz.b" # RT.Size # " \t$d, $a;",
3275+
[(set i32:$d, (ctlz RT.Ty:$a))]>;
3276+
3277+
// Population count
3278+
def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
3279+
"popc.b" # RT.Size # " \t$d, $a;",
3280+
[(set i32:$d, (ctpop RT.Ty:$a))]>;
3281+
}
32763282
}
32773283

3278-
// 32-bit has a direct PTX instruction
3279-
def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
3280-
3281-
// The return type of the ctlz ISD node is the same as its input, but the PTX
3282-
// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
3283-
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
3284-
// truncating back down to 32 bits.
3285-
def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
3286-
def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
3287-
3288-
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
3289-
// result back to 16-bits if necessary. We also need to subtract 16 because
3290-
// the high-order 16 zeros were counted.
3291-
//
3292-
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3293-
// use to save one SASS instruction (on sm_35 anyway):
3294-
//
3295-
// mov.b32 $tmp, {0xffff, $a}
3296-
// ctlz.b32 $result, $tmp
3297-
//
3298-
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3299-
// and then ctlz that value. This way we don't have to subtract 16 from the
3300-
// result. Unfortunately today we don't have a way to generate
3301-
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3302-
def : Pat<(i16 (ctlz i16:$a)),
3303-
(SUBi16ri (CVT_u16_u32
3304-
(CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
3305-
def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
3306-
(SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
3307-
3308-
// Population count
3309-
let hasSideEffects = false in {
3310-
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3311-
"popc.b32 \t$d, $a;", []>;
3312-
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3313-
"popc.b64 \t$d, $a;", []>;
3314-
}
3315-
3316-
// 32-bit has a direct PTX instruction
3317-
def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
3318-
3319-
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3320-
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
3321-
// pattern that avoids the type conversion if we're truncating the result to
3322-
// i32 anyway.
3323-
def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
3324-
def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
3325-
3326-
// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3327-
// If we know that we're storing into an i32, we can avoid the final trunc.
3328-
def : Pat<(ctpop i16:$a),
3329-
(CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
3330-
def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
3331-
(POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
3332-
33333284
// fpround f32 -> f16
33343285
def : Pat<(f16 (fpround f32:$a)),
33353286
(CVT_f16_f32 $a, CvtRN)>;

llvm/test/CodeGen/NVPTX/ctlz.ll

+105-62
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
23
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
34

@@ -10,67 +11,95 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
1011
; There should be no difference between llvm.ctlz.i32(%a, true) and
1112
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
1213

13-
; CHECK-LABEL: myctlz(
1414
define i32 @myctlz(i32 %a) {
15-
; CHECK: ld.param.
16-
; CHECK-NEXT: clz.b32
17-
; CHECK-NEXT: st.param.
18-
; CHECK-NEXT: ret;
15+
; CHECK-LABEL: myctlz(
16+
; CHECK: {
17+
; CHECK-NEXT: .reg .b32 %r<3>;
18+
; CHECK-EMPTY:
19+
; CHECK-NEXT: // %bb.0:
20+
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
21+
; CHECK-NEXT: clz.b32 %r2, %r1;
22+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
23+
; CHECK-NEXT: ret;
1924
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
2025
ret i32 %val
2126
}
22-
; CHECK-LABEL: myctlz_2(
2327
define i32 @myctlz_2(i32 %a) {
24-
; CHECK: ld.param.
25-
; CHECK-NEXT: clz.b32
26-
; CHECK-NEXT: st.param.
27-
; CHECK-NEXT: ret;
28+
; CHECK-LABEL: myctlz_2(
29+
; CHECK: {
30+
; CHECK-NEXT: .reg .b32 %r<3>;
31+
; CHECK-EMPTY:
32+
; CHECK-NEXT: // %bb.0:
33+
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
34+
; CHECK-NEXT: clz.b32 %r2, %r1;
35+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
36+
; CHECK-NEXT: ret;
2837
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
2938
ret i32 %val
3039
}
3140

3241
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
3342
; value, so here we have to zero-extend it.
34-
; CHECK-LABEL: myctlz64(
3543
define i64 @myctlz64(i64 %a) {
36-
; CHECK: ld.param.
37-
; CHECK-NEXT: clz.b64
38-
; CHECK-NEXT: cvt.u64.u32
39-
; CHECK-NEXT: st.param.
40-
; CHECK-NEXT: ret;
44+
; CHECK-LABEL: myctlz64(
45+
; CHECK: {
46+
; CHECK-NEXT: .reg .b32 %r<2>;
47+
; CHECK-NEXT: .reg .b64 %rd<3>;
48+
; CHECK-EMPTY:
49+
; CHECK-NEXT: // %bb.0:
50+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
51+
; CHECK-NEXT: clz.b64 %r1, %rd1;
52+
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
53+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
54+
; CHECK-NEXT: ret;
4155
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
4256
ret i64 %val
4357
}
44-
; CHECK-LABEL: myctlz64_2(
4558
define i64 @myctlz64_2(i64 %a) {
46-
; CHECK: ld.param.
47-
; CHECK-NEXT: clz.b64
48-
; CHECK-NEXT: cvt.u64.u32
49-
; CHECK-NEXT: st.param.
50-
; CHECK-NEXT: ret;
59+
; CHECK-LABEL: myctlz64_2(
60+
; CHECK: {
61+
; CHECK-NEXT: .reg .b32 %r<2>;
62+
; CHECK-NEXT: .reg .b64 %rd<3>;
63+
; CHECK-EMPTY:
64+
; CHECK-NEXT: // %bb.0:
65+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
66+
; CHECK-NEXT: clz.b64 %r1, %rd1;
67+
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
68+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
69+
; CHECK-NEXT: ret;
5170
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
5271
ret i64 %val
5372
}
5473

5574
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
5675
; natural return width of ptx's clz.b64 instruction. No conversions should be
5776
; necessary in the PTX.
58-
; CHECK-LABEL: myctlz64_as_32(
5977
define i32 @myctlz64_as_32(i64 %a) {
60-
; CHECK: ld.param.
61-
; CHECK-NEXT: clz.b64
62-
; CHECK-NEXT: st.param.
63-
; CHECK-NEXT: ret;
78+
; CHECK-LABEL: myctlz64_as_32(
79+
; CHECK: {
80+
; CHECK-NEXT: .reg .b32 %r<2>;
81+
; CHECK-NEXT: .reg .b64 %rd<2>;
82+
; CHECK-EMPTY:
83+
; CHECK-NEXT: // %bb.0:
84+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
85+
; CHECK-NEXT: clz.b64 %r1, %rd1;
86+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
87+
; CHECK-NEXT: ret;
6488
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
6589
%trunc = trunc i64 %val to i32
6690
ret i32 %trunc
6791
}
68-
; CHECK-LABEL: myctlz64_as_32_2(
6992
define i32 @myctlz64_as_32_2(i64 %a) {
70-
; CHECK: ld.param.
71-
; CHECK-NEXT: clz.b64
72-
; CHECK-NEXT: st.param.
73-
; CHECK-NEXT: ret;
93+
; CHECK-LABEL: myctlz64_as_32_2(
94+
; CHECK: {
95+
; CHECK-NEXT: .reg .b32 %r<2>;
96+
; CHECK-NEXT: .reg .b64 %rd<2>;
97+
; CHECK-EMPTY:
98+
; CHECK-NEXT: // %bb.0:
99+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
100+
; CHECK-NEXT: clz.b64 %r1, %rd1;
101+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
102+
; CHECK-NEXT: ret;
74103
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
75104
%trunc = trunc i64 %val to i32
76105
ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
80109
; and then truncating the result back down to i16. But the NVPTX ABI
81110
; zero-extends i16 return values to i32, so the final truncation doesn't appear
82111
; in this function.
83-
; CHECK-LABEL: myctlz_ret16(
84112
define i16 @myctlz_ret16(i16 %a) {
85-
; CHECK: ld.param.
86-
; CHECK-NEXT: cvt.u32.u16
87-
; CHECK-NEXT: clz.b32
88-
; CHECK-NEXT: sub.
89-
; CHECK-NEXT: st.param.
90-
; CHECK-NEXT: ret;
113+
; CHECK-LABEL: myctlz_ret16(
114+
; CHECK: {
115+
; CHECK-NEXT: .reg .b32 %r<4>;
116+
; CHECK-EMPTY:
117+
; CHECK-NEXT: // %bb.0:
118+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
119+
; CHECK-NEXT: clz.b32 %r2, %r1;
120+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
121+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
122+
; CHECK-NEXT: ret;
91123
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
92124
ret i16 %val
93125
}
94-
; CHECK-LABEL: myctlz_ret16_2(
95126
define i16 @myctlz_ret16_2(i16 %a) {
96-
; CHECK: ld.param.
97-
; CHECK-NEXT: cvt.u32.u16
98-
; CHECK-NEXT: clz.b32
99-
; CHECK-NEXT: sub.
100-
; CHECK-NEXT: st.param.
101-
; CHECK-NEXT: ret;
127+
; CHECK-LABEL: myctlz_ret16_2(
128+
; CHECK: {
129+
; CHECK-NEXT: .reg .b32 %r<4>;
130+
; CHECK-EMPTY:
131+
; CHECK-NEXT: // %bb.0:
132+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
133+
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
134+
; CHECK-NEXT: clz.b32 %r3, %r2;
135+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
136+
; CHECK-NEXT: ret;
102137
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
103138
ret i16 %val
104139
}
105140

106141
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
107142
; remain.
108-
; CHECK-LABEL: myctlz_store16(
109143
define void @myctlz_store16(i16 %a, ptr %b) {
110-
; CHECK: ld.param.
111-
; CHECK-NEXT: cvt.u32.u16
112-
; CHECK-NEXT: clz.b32
113-
; CHECK-DAG: cvt.u16.u32
114-
; CHECK-DAG: sub.
115-
; CHECK: st.{{[a-z]}}16
116-
; CHECK: ret;
144+
; CHECK-LABEL: myctlz_store16(
145+
; CHECK: {
146+
; CHECK-NEXT: .reg .b32 %r<4>;
147+
; CHECK-NEXT: .reg .b64 %rd<2>;
148+
; CHECK-EMPTY:
149+
; CHECK-NEXT: // %bb.0:
150+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
151+
; CHECK-NEXT: clz.b32 %r2, %r1;
152+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
153+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
154+
; CHECK-NEXT: st.u16 [%rd1], %r3;
155+
; CHECK-NEXT: ret;
117156
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
118157
store i16 %val, ptr %b
119158
ret void
120159
}
121-
; CHECK-LABEL: myctlz_store16_2(
122160
define void @myctlz_store16_2(i16 %a, ptr %b) {
123-
; CHECK: ld.param.
124-
; CHECK-NEXT: cvt.u32.u16
125-
; CHECK-NEXT: clz.b32
126-
; CHECK-DAG: cvt.u16.u32
127-
; CHECK-DAG: sub.
128-
; CHECK: st.{{[a-z]}}16
129-
; CHECK: ret;
161+
; CHECK-LABEL: myctlz_store16_2(
162+
; CHECK: {
163+
; CHECK-NEXT: .reg .b32 %r<4>;
164+
; CHECK-NEXT: .reg .b64 %rd<2>;
165+
; CHECK-EMPTY:
166+
; CHECK-NEXT: // %bb.0:
167+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
168+
; CHECK-NEXT: clz.b32 %r2, %r1;
169+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
170+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
171+
; CHECK-NEXT: st.u16 [%rd1], %r3;
172+
; CHECK-NEXT: ret;
130173
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
131174
store i16 %val, ptr %b
132175
ret void

0 commit comments

Comments
 (0)