Skip to content

Commit ec941a4

Browse files
authored Mar 12, 2025
[NVPTX] Legalize ctpop and ctlz in operation legalization (#130668)
By pulling the truncates and extensions out of operations during operation legalization, we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS), in real programs the exposure of these truncates can allow for more optimization.
1 parent 48b1991 commit ec941a4

File tree

6 files changed

+324
-189
lines changed

6 files changed

+324
-189
lines changed
 

‎llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -5113,7 +5113,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
51135113
DAG.getConstant(NVT.getSizeInBits() -
51145114
OVT.getSizeInBits(), dl, NVT));
51155115
}
5116-
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
5116+
Results.push_back(
5117+
DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1, SDNodeFlags::NoWrap));
51175118
break;
51185119
}
51195120
case ISD::CTLZ_ZERO_UNDEF: {

‎llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+22-9
Original file line numberDiff line numberDiff line change
@@ -764,16 +764,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
764764
// Custom handling for i8 intrinsics
765765
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
766766

767-
for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
768-
setOperationAction(ISD::ABS, Ty, Legal);
769-
setOperationAction(ISD::SMIN, Ty, Legal);
770-
setOperationAction(ISD::SMAX, Ty, Legal);
771-
setOperationAction(ISD::UMIN, Ty, Legal);
772-
setOperationAction(ISD::UMAX, Ty, Legal);
767+
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
768+
{MVT::i16, MVT::i32, MVT::i64}, Legal);
773769

774-
setOperationAction(ISD::CTPOP, Ty, Legal);
775-
setOperationAction(ISD::CTLZ, Ty, Legal);
776-
}
770+
setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i16,
771+
Promote);
772+
setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
773+
setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom);
777774

778775
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
779776
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2748,6 +2745,19 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
27482745
return Op;
27492746
}
27502747

2748+
// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2749+
// Lower these into a node returning the correct type which is zero-extended
2750+
// back to the correct size.
2751+
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
2752+
SDValue V = Op->getOperand(0);
2753+
assert(V.getValueType() == MVT::i64 &&
2754+
"Unexpected CTLZ/CTPOP type to legalize");
2755+
2756+
SDLoc DL(Op);
2757+
SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2758+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2759+
}
2760+
27512761
SDValue
27522762
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
27532763
switch (Op.getOpcode()) {
@@ -2833,6 +2843,9 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
28332843
case ISD::FMUL:
28342844
// Used only for bf16 on SM80, where we select fma for non-ftz operation
28352845
return PromoteBinOpIfF32FTZ(Op, DAG);
2846+
case ISD::CTPOP:
2847+
case ISD::CTLZ:
2848+
return lowerCTLZCTPOP(Op, DAG);
28362849

28372850
default:
28382851
llvm_unreachable("Custom lowering not defined for operation");

‎llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

+11-60
Original file line numberDiff line numberDiff line change
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
32673267
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
32683268
(SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
32693269

3270-
// Count leading zeros
32713270
let hasSideEffects = false in {
3272-
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3273-
"clz.b32 \t$d, $a;", []>;
3274-
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3275-
"clz.b64 \t$d, $a;", []>;
3271+
foreach RT = [I32RT, I64RT] in {
3272+
// Count leading zeros
3273+
def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
3274+
"clz.b" # RT.Size # " \t$d, $a;",
3275+
[(set i32:$d, (ctlz RT.Ty:$a))]>;
3276+
3277+
// Population count
3278+
def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
3279+
"popc.b" # RT.Size # " \t$d, $a;",
3280+
[(set i32:$d, (ctpop RT.Ty:$a))]>;
3281+
}
32763282
}
32773283

3278-
// 32-bit has a direct PTX instruction
3279-
def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
3280-
3281-
// The return type of the ctlz ISD node is the same as its input, but the PTX
3282-
// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
3283-
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
3284-
// truncating back down to 32 bits.
3285-
def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
3286-
def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
3287-
3288-
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
3289-
// result back to 16-bits if necessary. We also need to subtract 16 because
3290-
// the high-order 16 zeros were counted.
3291-
//
3292-
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3293-
// use to save one SASS instruction (on sm_35 anyway):
3294-
//
3295-
// mov.b32 $tmp, {0xffff, $a}
3296-
// ctlz.b32 $result, $tmp
3297-
//
3298-
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3299-
// and then ctlz that value. This way we don't have to subtract 16 from the
3300-
// result. Unfortunately today we don't have a way to generate
3301-
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3302-
def : Pat<(i16 (ctlz i16:$a)),
3303-
(SUBi16ri (CVT_u16_u32
3304-
(CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
3305-
def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
3306-
(SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
3307-
3308-
// Population count
3309-
let hasSideEffects = false in {
3310-
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3311-
"popc.b32 \t$d, $a;", []>;
3312-
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3313-
"popc.b64 \t$d, $a;", []>;
3314-
}
3315-
3316-
// 32-bit has a direct PTX instruction
3317-
def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
3318-
3319-
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3320-
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
3321-
// pattern that avoids the type conversion if we're truncating the result to
3322-
// i32 anyway.
3323-
def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
3324-
def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
3325-
3326-
// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3327-
// If we know that we're storing into an i32, we can avoid the final trunc.
3328-
def : Pat<(ctpop i16:$a),
3329-
(CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
3330-
def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
3331-
(POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
3332-
33333284
// fpround f32 -> f16
33343285
def : Pat<(f16 (fpround f32:$a)),
33353286
(CVT_f16_f32 $a, CvtRN)>;

‎llvm/test/CodeGen/NVPTX/ctlz.ll

+105-62
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
23
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
34

@@ -10,67 +11,95 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
1011
; There should be no difference between llvm.ctlz.i32(%a, true) and
1112
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
1213

13-
; CHECK-LABEL: myctlz(
1414
define i32 @myctlz(i32 %a) {
15-
; CHECK: ld.param.
16-
; CHECK-NEXT: clz.b32
17-
; CHECK-NEXT: st.param.
18-
; CHECK-NEXT: ret;
15+
; CHECK-LABEL: myctlz(
16+
; CHECK: {
17+
; CHECK-NEXT: .reg .b32 %r<3>;
18+
; CHECK-EMPTY:
19+
; CHECK-NEXT: // %bb.0:
20+
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
21+
; CHECK-NEXT: clz.b32 %r2, %r1;
22+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
23+
; CHECK-NEXT: ret;
1924
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
2025
ret i32 %val
2126
}
22-
; CHECK-LABEL: myctlz_2(
2327
define i32 @myctlz_2(i32 %a) {
24-
; CHECK: ld.param.
25-
; CHECK-NEXT: clz.b32
26-
; CHECK-NEXT: st.param.
27-
; CHECK-NEXT: ret;
28+
; CHECK-LABEL: myctlz_2(
29+
; CHECK: {
30+
; CHECK-NEXT: .reg .b32 %r<3>;
31+
; CHECK-EMPTY:
32+
; CHECK-NEXT: // %bb.0:
33+
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
34+
; CHECK-NEXT: clz.b32 %r2, %r1;
35+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
36+
; CHECK-NEXT: ret;
2837
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
2938
ret i32 %val
3039
}
3140

3241
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
3342
; value, so here we have to zero-extend it.
34-
; CHECK-LABEL: myctlz64(
3543
define i64 @myctlz64(i64 %a) {
36-
; CHECK: ld.param.
37-
; CHECK-NEXT: clz.b64
38-
; CHECK-NEXT: cvt.u64.u32
39-
; CHECK-NEXT: st.param.
40-
; CHECK-NEXT: ret;
44+
; CHECK-LABEL: myctlz64(
45+
; CHECK: {
46+
; CHECK-NEXT: .reg .b32 %r<2>;
47+
; CHECK-NEXT: .reg .b64 %rd<3>;
48+
; CHECK-EMPTY:
49+
; CHECK-NEXT: // %bb.0:
50+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
51+
; CHECK-NEXT: clz.b64 %r1, %rd1;
52+
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
53+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
54+
; CHECK-NEXT: ret;
4155
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
4256
ret i64 %val
4357
}
44-
; CHECK-LABEL: myctlz64_2(
4558
define i64 @myctlz64_2(i64 %a) {
46-
; CHECK: ld.param.
47-
; CHECK-NEXT: clz.b64
48-
; CHECK-NEXT: cvt.u64.u32
49-
; CHECK-NEXT: st.param.
50-
; CHECK-NEXT: ret;
59+
; CHECK-LABEL: myctlz64_2(
60+
; CHECK: {
61+
; CHECK-NEXT: .reg .b32 %r<2>;
62+
; CHECK-NEXT: .reg .b64 %rd<3>;
63+
; CHECK-EMPTY:
64+
; CHECK-NEXT: // %bb.0:
65+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
66+
; CHECK-NEXT: clz.b64 %r1, %rd1;
67+
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
68+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
69+
; CHECK-NEXT: ret;
5170
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
5271
ret i64 %val
5372
}
5473

5574
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
5675
; natural return width of ptx's clz.b64 instruction. No conversions should be
5776
; necessary in the PTX.
58-
; CHECK-LABEL: myctlz64_as_32(
5977
define i32 @myctlz64_as_32(i64 %a) {
60-
; CHECK: ld.param.
61-
; CHECK-NEXT: clz.b64
62-
; CHECK-NEXT: st.param.
63-
; CHECK-NEXT: ret;
78+
; CHECK-LABEL: myctlz64_as_32(
79+
; CHECK: {
80+
; CHECK-NEXT: .reg .b32 %r<2>;
81+
; CHECK-NEXT: .reg .b64 %rd<2>;
82+
; CHECK-EMPTY:
83+
; CHECK-NEXT: // %bb.0:
84+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
85+
; CHECK-NEXT: clz.b64 %r1, %rd1;
86+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
87+
; CHECK-NEXT: ret;
6488
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
6589
%trunc = trunc i64 %val to i32
6690
ret i32 %trunc
6791
}
68-
; CHECK-LABEL: myctlz64_as_32_2(
6992
define i32 @myctlz64_as_32_2(i64 %a) {
70-
; CHECK: ld.param.
71-
; CHECK-NEXT: clz.b64
72-
; CHECK-NEXT: st.param.
73-
; CHECK-NEXT: ret;
93+
; CHECK-LABEL: myctlz64_as_32_2(
94+
; CHECK: {
95+
; CHECK-NEXT: .reg .b32 %r<2>;
96+
; CHECK-NEXT: .reg .b64 %rd<2>;
97+
; CHECK-EMPTY:
98+
; CHECK-NEXT: // %bb.0:
99+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
100+
; CHECK-NEXT: clz.b64 %r1, %rd1;
101+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
102+
; CHECK-NEXT: ret;
74103
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
75104
%trunc = trunc i64 %val to i32
76105
ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
80109
; and then truncating the result back down to i16. But the NVPTX ABI
81110
; zero-extends i16 return values to i32, so the final truncation doesn't appear
82111
; in this function.
83-
; CHECK-LABEL: myctlz_ret16(
84112
define i16 @myctlz_ret16(i16 %a) {
85-
; CHECK: ld.param.
86-
; CHECK-NEXT: cvt.u32.u16
87-
; CHECK-NEXT: clz.b32
88-
; CHECK-NEXT: sub.
89-
; CHECK-NEXT: st.param.
90-
; CHECK-NEXT: ret;
113+
; CHECK-LABEL: myctlz_ret16(
114+
; CHECK: {
115+
; CHECK-NEXT: .reg .b32 %r<4>;
116+
; CHECK-EMPTY:
117+
; CHECK-NEXT: // %bb.0:
118+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
119+
; CHECK-NEXT: clz.b32 %r2, %r1;
120+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
121+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
122+
; CHECK-NEXT: ret;
91123
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
92124
ret i16 %val
93125
}
94-
; CHECK-LABEL: myctlz_ret16_2(
95126
define i16 @myctlz_ret16_2(i16 %a) {
96-
; CHECK: ld.param.
97-
; CHECK-NEXT: cvt.u32.u16
98-
; CHECK-NEXT: clz.b32
99-
; CHECK-NEXT: sub.
100-
; CHECK-NEXT: st.param.
101-
; CHECK-NEXT: ret;
127+
; CHECK-LABEL: myctlz_ret16_2(
128+
; CHECK: {
129+
; CHECK-NEXT: .reg .b32 %r<4>;
130+
; CHECK-EMPTY:
131+
; CHECK-NEXT: // %bb.0:
132+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
133+
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
134+
; CHECK-NEXT: clz.b32 %r3, %r2;
135+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
136+
; CHECK-NEXT: ret;
102137
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
103138
ret i16 %val
104139
}
105140

106141
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
107142
; remain.
108-
; CHECK-LABEL: myctlz_store16(
109143
define void @myctlz_store16(i16 %a, ptr %b) {
110-
; CHECK: ld.param.
111-
; CHECK-NEXT: cvt.u32.u16
112-
; CHECK-NEXT: clz.b32
113-
; CHECK-DAG: cvt.u16.u32
114-
; CHECK-DAG: sub.
115-
; CHECK: st.{{[a-z]}}16
116-
; CHECK: ret;
144+
; CHECK-LABEL: myctlz_store16(
145+
; CHECK: {
146+
; CHECK-NEXT: .reg .b32 %r<4>;
147+
; CHECK-NEXT: .reg .b64 %rd<2>;
148+
; CHECK-EMPTY:
149+
; CHECK-NEXT: // %bb.0:
150+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
151+
; CHECK-NEXT: clz.b32 %r2, %r1;
152+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
153+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
154+
; CHECK-NEXT: st.u16 [%rd1], %r3;
155+
; CHECK-NEXT: ret;
117156
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
118157
store i16 %val, ptr %b
119158
ret void
120159
}
121-
; CHECK-LABEL: myctlz_store16_2(
122160
define void @myctlz_store16_2(i16 %a, ptr %b) {
123-
; CHECK: ld.param.
124-
; CHECK-NEXT: cvt.u32.u16
125-
; CHECK-NEXT: clz.b32
126-
; CHECK-DAG: cvt.u16.u32
127-
; CHECK-DAG: sub.
128-
; CHECK: st.{{[a-z]}}16
129-
; CHECK: ret;
161+
; CHECK-LABEL: myctlz_store16_2(
162+
; CHECK: {
163+
; CHECK-NEXT: .reg .b32 %r<4>;
164+
; CHECK-NEXT: .reg .b64 %rd<2>;
165+
; CHECK-EMPTY:
166+
; CHECK-NEXT: // %bb.0:
167+
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
168+
; CHECK-NEXT: clz.b32 %r2, %r1;
169+
; CHECK-NEXT: add.s32 %r3, %r2, -16;
170+
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
171+
; CHECK-NEXT: st.u16 [%rd1], %r3;
172+
; CHECK-NEXT: ret;
130173
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
131174
store i16 %val, ptr %b
132175
ret void

‎llvm/test/CodeGen/NVPTX/intrinsics.ll

+184-56
Original file line numberDiff line numberDiff line change
@@ -1,175 +1,303 @@
1-
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
2-
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK32
3+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK64
34
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
45
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
56

6-
; CHECK-LABEL: test_fabsf(
77
define float @test_fabsf(float %f) {
8-
; CHECK: abs.f32
8+
; CHECK-LABEL: test_fabsf(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .f32 %f<3>;
11+
; CHECK-EMPTY:
12+
; CHECK-NEXT: // %bb.0:
13+
; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0];
14+
; CHECK-NEXT: abs.f32 %f2, %f1;
15+
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
16+
; CHECK-NEXT: ret;
917
%x = call float @llvm.fabs.f32(float %f)
1018
ret float %x
1119
}
1220

13-
; CHECK-LABEL: test_fabs(
1421
define double @test_fabs(double %d) {
15-
; CHECK: abs.f64
22+
; CHECK-LABEL: test_fabs(
23+
; CHECK: {
24+
; CHECK-NEXT: .reg .f64 %fd<3>;
25+
; CHECK-EMPTY:
26+
; CHECK-NEXT: // %bb.0:
27+
; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0];
28+
; CHECK-NEXT: abs.f64 %fd2, %fd1;
29+
; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
30+
; CHECK-NEXT: ret;
1631
%x = call double @llvm.fabs.f64(double %d)
1732
ret double %x
1833
}
1934

20-
; CHECK-LABEL: test_nvvm_sqrt(
2135
define float @test_nvvm_sqrt(float %a) {
22-
; CHECK: sqrt.rn.f32
36+
; CHECK-LABEL: test_nvvm_sqrt(
37+
; CHECK: {
38+
; CHECK-NEXT: .reg .f32 %f<3>;
39+
; CHECK-EMPTY:
40+
; CHECK-NEXT: // %bb.0:
41+
; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
42+
; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
43+
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
44+
; CHECK-NEXT: ret;
2345
%val = call float @llvm.nvvm.sqrt.f(float %a)
2446
ret float %val
2547
}
2648

27-
; CHECK-LABEL: test_llvm_sqrt(
2849
define float @test_llvm_sqrt(float %a) {
29-
; CHECK: sqrt.rn.f32
50+
; CHECK-LABEL: test_llvm_sqrt(
51+
; CHECK: {
52+
; CHECK-NEXT: .reg .f32 %f<3>;
53+
; CHECK-EMPTY:
54+
; CHECK-NEXT: // %bb.0:
55+
; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0];
56+
; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
57+
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
58+
; CHECK-NEXT: ret;
3059
%val = call float @llvm.sqrt.f32(float %a)
3160
ret float %val
3261
}
3362

34-
; CHECK-LABEL: test_bitreverse32(
3563
define i32 @test_bitreverse32(i32 %a) {
36-
; CHECK: brev.b32
64+
; CHECK-LABEL: test_bitreverse32(
65+
; CHECK: {
66+
; CHECK-NEXT: .reg .b32 %r<3>;
67+
; CHECK-EMPTY:
68+
; CHECK-NEXT: // %bb.0:
69+
; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0];
70+
; CHECK-NEXT: brev.b32 %r2, %r1;
71+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
72+
; CHECK-NEXT: ret;
3773
%val = call i32 @llvm.bitreverse.i32(i32 %a)
3874
ret i32 %val
3975
}
4076

41-
; CHECK-LABEL: test_bitreverse64(
4277
define i64 @test_bitreverse64(i64 %a) {
43-
; CHECK: brev.b64
78+
; CHECK-LABEL: test_bitreverse64(
79+
; CHECK: {
80+
; CHECK-NEXT: .reg .b64 %rd<3>;
81+
; CHECK-EMPTY:
82+
; CHECK-NEXT: // %bb.0:
83+
; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0];
84+
; CHECK-NEXT: brev.b64 %rd2, %rd1;
85+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
86+
; CHECK-NEXT: ret;
4487
%val = call i64 @llvm.bitreverse.i64(i64 %a)
4588
ret i64 %val
4689
}
4790

48-
; CHECK-LABEL: test_popc32(
4991
define i32 @test_popc32(i32 %a) {
50-
; CHECK: popc.b32
92+
; CHECK-LABEL: test_popc32(
93+
; CHECK: {
94+
; CHECK-NEXT: .reg .b32 %r<3>;
95+
; CHECK-EMPTY:
96+
; CHECK-NEXT: // %bb.0:
97+
; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0];
98+
; CHECK-NEXT: popc.b32 %r2, %r1;
99+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
100+
; CHECK-NEXT: ret;
51101
%val = call i32 @llvm.ctpop.i32(i32 %a)
52102
ret i32 %val
53103
}
54104

55-
; CHECK-LABEL: test_popc64
56105
define i64 @test_popc64(i64 %a) {
57-
; CHECK: popc.b64
58-
; CHECK: cvt.u64.u32
106+
; CHECK-LABEL: test_popc64(
107+
; CHECK: {
108+
; CHECK-NEXT: .reg .b32 %r<2>;
109+
; CHECK-NEXT: .reg .b64 %rd<3>;
110+
; CHECK-EMPTY:
111+
; CHECK-NEXT: // %bb.0:
112+
; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0];
113+
; CHECK-NEXT: popc.b64 %r1, %rd1;
114+
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
115+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
116+
; CHECK-NEXT: ret;
59117
%val = call i64 @llvm.ctpop.i64(i64 %a)
60118
ret i64 %val
61119
}
62120

63121
; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
64122
; if this function returns an i32, there's no need to do any type conversions
65123
; in the ptx.
66-
; CHECK-LABEL: test_popc64_trunc
67124
define i32 @test_popc64_trunc(i64 %a) {
68-
; CHECK: popc.b64
69-
; CHECK-NOT: cvt.
125+
; CHECK-LABEL: test_popc64_trunc(
126+
; CHECK: {
127+
; CHECK-NEXT: .reg .b32 %r<2>;
128+
; CHECK-NEXT: .reg .b64 %rd<2>;
129+
; CHECK-EMPTY:
130+
; CHECK-NEXT: // %bb.0:
131+
; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0];
132+
; CHECK-NEXT: popc.b64 %r1, %rd1;
133+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
134+
; CHECK-NEXT: ret;
70135
%val = call i64 @llvm.ctpop.i64(i64 %a)
71136
%trunc = trunc i64 %val to i32
72137
ret i32 %trunc
73138
}
74139

75140
; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
76141
; then converting back to i16.
77-
; CHECK-LABEL: test_popc16
78142
define void @test_popc16(i16 %a, ptr %b) {
79-
; CHECK: cvt.u32.u16
80-
; CHECK: popc.b32
81-
; CHECK: cvt.u16.u32
143+
; CHECK32-LABEL: test_popc16(
144+
; CHECK32: {
145+
; CHECK32-NEXT: .reg .b32 %r<4>;
146+
; CHECK32-EMPTY:
147+
; CHECK32-NEXT: // %bb.0:
148+
; CHECK32-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
149+
; CHECK32-NEXT: popc.b32 %r2, %r1;
150+
; CHECK32-NEXT: ld.param.u32 %r3, [test_popc16_param_1];
151+
; CHECK32-NEXT: st.u16 [%r3], %r2;
152+
; CHECK32-NEXT: ret;
153+
;
154+
; CHECK64-LABEL: test_popc16(
155+
; CHECK64: {
156+
; CHECK64-NEXT: .reg .b32 %r<3>;
157+
; CHECK64-NEXT: .reg .b64 %rd<2>;
158+
; CHECK64-EMPTY:
159+
; CHECK64-NEXT: // %bb.0:
160+
; CHECK64-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
161+
; CHECK64-NEXT: popc.b32 %r2, %r1;
162+
; CHECK64-NEXT: ld.param.u64 %rd1, [test_popc16_param_1];
163+
; CHECK64-NEXT: st.u16 [%rd1], %r2;
164+
; CHECK64-NEXT: ret;
82165
%val = call i16 @llvm.ctpop.i16(i16 %a)
83166
store i16 %val, ptr %b
84167
ret void
85168
}
86169

87170
; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need
88171
; to do any conversions after calling popc.b32, because that returns an i32.
89-
; CHECK-LABEL: test_popc16_to_32
90172
define i32 @test_popc16_to_32(i16 %a) {
91-
; CHECK: cvt.u32.u16
92-
; CHECK: popc.b32
93-
; CHECK-NOT: cvt.
173+
; CHECK-LABEL: test_popc16_to_32(
174+
; CHECK: {
175+
; CHECK-NEXT: .reg .b32 %r<3>;
176+
; CHECK-EMPTY:
177+
; CHECK-NEXT: // %bb.0:
178+
; CHECK-NEXT: ld.param.u16 %r1, [test_popc16_to_32_param_0];
179+
; CHECK-NEXT: popc.b32 %r2, %r1;
180+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
181+
; CHECK-NEXT: ret;
94182
%val = call i16 @llvm.ctpop.i16(i16 %a)
95183
%zext = zext i16 %val to i32
96184
ret i32 %zext
97185
}
98186

99187
; Most of nvvm.read.ptx.sreg.* intrinsics always return the same value and may
100188
; be CSE'd.
101-
; CHECK-LABEL: test_tid
102189
define i32 @test_tid() {
103-
; CHECK: mov.u32 %r{{.*}}, %tid.x;
190+
; CHECK-LABEL: test_tid(
191+
; CHECK: {
192+
; CHECK-NEXT: .reg .b32 %r<3>;
193+
; CHECK-EMPTY:
194+
; CHECK-NEXT: // %bb.0:
195+
; CHECK-NEXT: mov.u32 %r1, %tid.x;
196+
; CHECK-NEXT: add.s32 %r2, %r1, %r1;
197+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
198+
; CHECK-NEXT: ret;
104199
%a = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
105-
; CHECK-NOT: mov.u32 %r{{.*}}, %tid.x;
106200
%b = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
107201
%ret = add i32 %a, %b
108-
; CHECK: ret
109202
ret i32 %ret
110203
}
111204

112205
; reading clock() or clock64() should not be CSE'd as each read may return
113206
; different value.
114-
; CHECK-LABEL: test_clock
115207
define i32 @test_clock() {
116-
; CHECK: mov.u32 %r{{.*}}, %clock;
208+
; CHECK-LABEL: test_clock(
209+
; CHECK: {
210+
; CHECK-NEXT: .reg .b32 %r<4>;
211+
; CHECK-EMPTY:
212+
; CHECK-NEXT: // %bb.0:
213+
; CHECK-NEXT: mov.u32 %r1, %clock;
214+
; CHECK-NEXT: mov.u32 %r2, %clock;
215+
; CHECK-NEXT: add.s32 %r3, %r1, %r2;
216+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
217+
; CHECK-NEXT: ret;
117218
%a = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
118-
; CHECK: mov.u32 %r{{.*}}, %clock;
119219
%b = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
120220
%ret = add i32 %a, %b
121-
; CHECK: ret
122221
ret i32 %ret
123222
}
124223

125-
; CHECK-LABEL: test_clock64
126224
define i64 @test_clock64() {
127-
; CHECK: mov.u64 %r{{.*}}, %clock64;
225+
; CHECK-LABEL: test_clock64(
226+
; CHECK: {
227+
; CHECK-NEXT: .reg .b64 %rd<4>;
228+
; CHECK-EMPTY:
229+
; CHECK-NEXT: // %bb.0:
230+
; CHECK-NEXT: mov.u64 %rd1, %clock64;
231+
; CHECK-NEXT: mov.u64 %rd2, %clock64;
232+
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
233+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
234+
; CHECK-NEXT: ret;
128235
%a = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
129-
; CHECK: mov.u64 %r{{.*}}, %clock64;
130236
%b = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
131237
%ret = add i64 %a, %b
132-
; CHECK: ret
133238
ret i64 %ret
134239
}
135240

136-
; CHECK-LABEL: test_exit
137241
define void @test_exit() {
138-
; CHECK: exit;
242+
; CHECK-LABEL: test_exit(
243+
; CHECK: {
244+
; CHECK-EMPTY:
245+
; CHECK-EMPTY:
246+
; CHECK-NEXT: // %bb.0:
247+
; CHECK-NEXT: exit;
248+
; CHECK-NEXT: ret;
139249
call void @llvm.nvvm.exit()
140250
ret void
141251
}
142252

143-
; CHECK-LABEL: test_globaltimer
144253
define i64 @test_globaltimer() {
145-
; CHECK: mov.u64 %r{{.*}}, %globaltimer;
254+
; CHECK-LABEL: test_globaltimer(
255+
; CHECK: {
256+
; CHECK-NEXT: .reg .b64 %rd<4>;
257+
; CHECK-EMPTY:
258+
; CHECK-NEXT: // %bb.0:
259+
; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
260+
; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
261+
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
262+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
263+
; CHECK-NEXT: ret;
146264
%a = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
147-
; CHECK: mov.u64 %r{{.*}}, %globaltimer;
148265
%b = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
149266
%ret = add i64 %a, %b
150-
; CHECK: ret
151267
ret i64 %ret
152268
}
153269

154-
; CHECK-LABEL: test_cyclecounter
155270
define i64 @test_cyclecounter() {
156-
; CHECK: mov.u64 %r{{.*}}, %clock64;
271+
; CHECK-LABEL: test_cyclecounter(
272+
; CHECK: {
273+
; CHECK-NEXT: .reg .b64 %rd<4>;
274+
; CHECK-EMPTY:
275+
; CHECK-NEXT: // %bb.0:
276+
; CHECK-NEXT: mov.u64 %rd1, %clock64;
277+
; CHECK-NEXT: mov.u64 %rd2, %clock64;
278+
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
279+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
280+
; CHECK-NEXT: ret;
157281
%a = tail call i64 @llvm.readcyclecounter()
158-
; CHECK: mov.u64 %r{{.*}}, %clock64;
159282
%b = tail call i64 @llvm.readcyclecounter()
160283
%ret = add i64 %a, %b
161-
; CHECK: ret
162284
ret i64 %ret
163285
}
164286

165-
; CHECK-LABEL: test_steadycounter
166287
define i64 @test_steadycounter() {
167-
; CHECK: mov.u64 %r{{.*}}, %globaltimer;
288+
; CHECK-LABEL: test_steadycounter(
289+
; CHECK: {
290+
; CHECK-NEXT: .reg .b64 %rd<4>;
291+
; CHECK-EMPTY:
292+
; CHECK-NEXT: // %bb.0:
293+
; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
294+
; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
295+
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
296+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
297+
; CHECK-NEXT: ret;
168298
%a = tail call i64 @llvm.readsteadycounter()
169-
; CHECK: mov.u64 %r{{.*}}, %globaltimer;
170299
%b = tail call i64 @llvm.readsteadycounter()
171300
%ret = add i64 %a, %b
172-
; CHECK: ret
173301
ret i64 %ret
174302
}
175303

‎llvm/test/CodeGen/VE/Scalar/ctlz.ll

-1
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,6 @@ define zeroext i32 @func32zx(i32 zeroext %p) {
200200
; CHECK: # %bb.0:
201201
; CHECK-NEXT: ldz %s0, %s0
202202
; CHECK-NEXT: lea %s0, -32(, %s0)
203-
; CHECK-NEXT: and %s0, %s0, (32)0
204203
; CHECK-NEXT: b.l.t (, %s10)
205204
%r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 false)
206205
ret i32 %r

0 commit comments

Comments
 (0)
Please sign in to comment.