-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Legalize ctpop and ctlz in operation legalization #130668
[NVPTX] Legalize ctpop and ctlz in operation legalization #130668
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) Changes: By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization. Patch is 25.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130668.diff 4 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3e755c25fd91a..004acbab58dab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -766,16 +766,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
- for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::ABS, Ty, Legal);
- setOperationAction(ISD::SMIN, Ty, Legal);
- setOperationAction(ISD::SMAX, Ty, Legal);
- setOperationAction(ISD::UMIN, Ty, Legal);
- setOperationAction(ISD::UMAX, Ty, Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
+ {MVT::i16, MVT::i32, MVT::i64}, Legal);
- setOperationAction(ISD::CTPOP, Ty, Legal);
- setOperationAction(ISD::CTLZ, Ty, Legal);
- }
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, {MVT::i16, MVT::i64}, Custom);
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2750,6 +2745,42 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+static SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Zext);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, CT, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTPOP type to legalize");
+}
+
+static SDValue lowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, Zext);
+ SDValue Sub =
+ DAG.getNode(ISD::ADD, DL, MVT::i32, CT,
+ DAG.getConstant(APInt(32, -16, true), DL, MVT::i32),
+ SDNodeFlags::NoSignedWrap);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTLZ type to legalize");
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2835,6 +2866,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
+ case ISD::CTPOP:
+ return lowerCTPOP(Op, DAG);
+ case ISD::CTLZ:
+ return lowerCTLZ(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f94d7099f1b0e..3c88551d7b23c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
(SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
-// Count leading zeros
let hasSideEffects = false in {
- def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32 \t$d, $a;", []>;
- def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64 \t$d, $a;", []>;
+ foreach RT = [I32RT, I64RT] in {
+ // Count leading zeros
+ def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "clz.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctlz RT.Ty:$a))]>;
+
+ // Population count
+ def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "popc.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctpop RT.Ty:$a))]>;
+ }
}
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
-
-// The return type of the ctlz ISD node is the same as its input, but the PTX
-// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
-// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
-// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary. We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-// mov.b32 $tmp, {0xffff, $a}
-// ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value. This way we don't have to subtract 16 from the
-// result. Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz i16:$a)),
- (SUBi16ri (CVT_u16_u32
- (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
- (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
-
-// Population count
-let hasSideEffects = false in {
- def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32 \t$d, $a;", []>;
- def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop i16:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
- (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
-
// fpround f32 -> f16
def : Pat<(f16 (fpround f32:$a)),
(CVT_f16_f32 $a, CvtRN)>;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 9f91504ad9966..4f414d600f248 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -10,44 +11,62 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
-; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
ret i32 %val
}
-; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
ret i32 %val
}
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
; value, so here we have to zero-extend it.
-; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
ret i64 %val
}
-; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
ret i64 %val
}
@@ -55,22 +74,32 @@ define i64 @myctlz64_2(i64 %a) {
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
; natural return width of ptx's clz.b64 instruction. No conversions should be
; necessary in the PTX.
-; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
}
-; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
; and then truncating the result back down to i16. But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
-; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
ret i16 %val
}
-; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
ret i16 %val
}
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
; remain.
-; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
}
-; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index e424e72ecc8f5..57493558419c5 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,61 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
-; CHECK-LABEL: test_fabsf(
define float @test_fabsf(float %f) {
-; CHECK: abs.f32
+; CHECK-LABEL: test_fabsf(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0];
+; CHECK-NEXT: abs.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.fabs.f32(float %f)
ret float %x
}
-; CHECK-LABEL: test_fabs(
define double @test_fabs(double %d) {
-; CHECK: abs.f64
+; CHECK-LABEL: test_fabs(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x
}
-; CHECK-LABEL: test_nvvm_sqrt(
define float @test_nvvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_nvvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.nvvm.sqrt.f(float %a)
ret float %val
}
-; CHECK-LABEL: test_llvm_sqrt(
define float @test_llvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_llvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.sqrt.f32(float %a)
ret float %val
}
-; CHECK-LABEL: test_bitreverse32(
define i32 @test_bitreverse32(i32 %a) {
-; CHECK: brev.b32
+; CHECK-LABEL: test_bitreverse32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0];
+; CHECK-NEXT: brev.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_bitreverse64(
define i64 @test_bitreverse64(i64 %a) {
-; CHECK: brev.b64
+; CHECK-LABEL: test_bitreverse64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0];
+; CHECK-NEXT: brev.b64 %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %val
}
-; CHECK-LABEL: test_popc32(
define i32 @test_popc32(i32 %a) {
-; CHECK: popc.b32
+; CHECK-LABEL: test_popc32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0];
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_popc64
define i64 @test_popc64(i64 %a) {
-; CHECK: popc.b64
-; CHECK: cvt.u64.u32
+; CHECK-LABEL: test_popc64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
ret i64 %val
}
@@ -63,10 +121,17 @@ define i64 @test_popc64(i64 %a) {
; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
; if this function returns an i32, there's no need to do any type conversions
; in the ptx.
-; CHECK-LABEL: test_popc64_trunc
define i32 @test_popc64_trunc(i64 %a) {
-; CHECK: popc.b64
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc64_trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -74,11 +139,7 @@ define i32 @test_popc64_trunc(i64 %a) {
; llvm.ctpop.i16 is implemenented by converting to i32, running popc.b32, and
; then converting back to i16.
-; CHECK-LABEL: test_popc16
define void @test_popc16(i16 %a, ptr %b) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK: cvt.u16.u32
%val = call i16 @llvm.ctpop.i16(i16 %a)
store i16 %val, ptr %b
ret void
@@ -86,11 +147,16 @@ define void @test_popc16...
[truncated]
|
if (V.getValueType() == MVT::i64) { | ||
SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V); | ||
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may be worth a comment that CTPOP node here is intentionally created with 64-bit input and 32-bit output and that we'll later pattern-match it to popc
instruction.
Without the explanation this code looks as if we accidentally run CTPOP on only half of the bits of the input value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ended up doing some more refactoring of this change, but I've added a comment to the new lowering function which hopefully explains what is going on.
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap); | ||
} | ||
if (V.getValueType() == MVT::i64) { | ||
SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ditto.
; CHECK-LABEL: test_popc16 | ||
define void @test_popc16(i16 %a, ptr %b) { | ||
; CHECK: cvt.u32.u16 | ||
; CHECK: popc.b32 | ||
; CHECK: cvt.u16.u32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is odd. Where did the checks for this function go?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, this happened because different CHECKs were needed for nvptx vs nvptx64. I've updated the check-prefixes and now checks are generated for this test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes me wonder why we need to store the result through a pointer at all. I suspect it used to be needed to avoid collapsing trunc
as we'd promote the return value to i32. In this case, we generate a bit better code by using a truncating store. I suspect it will also happen if we'd just store the result in the return value.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant. If there are no other pointer-using test cases in the file, we could skip the 32-bit run.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant.
It looks like the return type just gets promoted and the 32-bit value gets stored:
// .globl test_popc16 // -- Begin function test_popc16
.visible .func (.param .b32 func_retval0) test_popc16(
.param .b32 test_popc16_param_0,
.param .b32 test_popc16_param_1
) // @test_popc16
{
.reg .b32 %r<3>;
// %bb.0:
ld.param.u16 %r1, [test_popc16_param_0];
popc.b32 %r2, %r1;
st.param.b32 [func_retval0], %r2;
ret;
// -- End function
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we could return <2 x i16>
, with one element set to a constant, and the other one with the result of popc. That would keep result truncate in place.
Up to you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I'll just leave as is for simplicity.
e2a60bb
to
554db43
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes themselves are LGTM.
The test may benefit from a few more tweaks.
; CHECK-LABEL: test_popc16 | ||
define void @test_popc16(i16 %a, ptr %b) { | ||
; CHECK: cvt.u32.u16 | ||
; CHECK: popc.b32 | ||
; CHECK: cvt.u16.u32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes me wonder why we need to store the result as a pointer at all. I suspect it used to be needed to avoid collapsing trunc
as we'd promote return value to i32. In this case, we generate a bit better code by using truncating store. I suspect it will also happen if we'd just store the result in the return value.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant. If there are no other pointer-using test cases in the file, we could skip the 32-bit run.
By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization.
By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization.