-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Legalize ctpop and ctlz in operation legalization #130668
[NVPTX] Legalize ctpop and ctlz in operation legalization #130668
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) Changes: By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization. Patch is 25.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130668.diff 4 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3e755c25fd91a..004acbab58dab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -766,16 +766,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
- for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::ABS, Ty, Legal);
- setOperationAction(ISD::SMIN, Ty, Legal);
- setOperationAction(ISD::SMAX, Ty, Legal);
- setOperationAction(ISD::UMIN, Ty, Legal);
- setOperationAction(ISD::UMAX, Ty, Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
+ {MVT::i16, MVT::i32, MVT::i64}, Legal);
- setOperationAction(ISD::CTPOP, Ty, Legal);
- setOperationAction(ISD::CTLZ, Ty, Legal);
- }
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, {MVT::i16, MVT::i64}, Custom);
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2750,6 +2745,42 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+static SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Zext);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, CT, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTPOP type to legalize");
+}
+
+static SDValue lowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, Zext);
+ SDValue Sub =
+ DAG.getNode(ISD::ADD, DL, MVT::i32, CT,
+ DAG.getConstant(APInt(32, -16, true), DL, MVT::i32),
+ SDNodeFlags::NoSignedWrap);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTLZ type to legalize");
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2835,6 +2866,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
+ case ISD::CTPOP:
+ return lowerCTPOP(Op, DAG);
+ case ISD::CTLZ:
+ return lowerCTLZ(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f94d7099f1b0e..3c88551d7b23c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
(SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
-// Count leading zeros
let hasSideEffects = false in {
- def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32 \t$d, $a;", []>;
- def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64 \t$d, $a;", []>;
+ foreach RT = [I32RT, I64RT] in {
+ // Count leading zeros
+ def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "clz.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctlz RT.Ty:$a))]>;
+
+ // Population count
+ def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "popc.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctpop RT.Ty:$a))]>;
+ }
}
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
-
-// The return type of the ctlz ISD node is the same as its input, but the PTX
-// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
-// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
-// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary. We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-// mov.b32 $tmp, {0xffff, $a}
-// ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value. This way we don't have to subtract 16 from the
-// result. Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz i16:$a)),
- (SUBi16ri (CVT_u16_u32
- (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
- (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
-
-// Population count
-let hasSideEffects = false in {
- def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32 \t$d, $a;", []>;
- def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop i16:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
- (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
-
// fpround f32 -> f16
def : Pat<(f16 (fpround f32:$a)),
(CVT_f16_f32 $a, CvtRN)>;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 9f91504ad9966..4f414d600f248 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -10,44 +11,62 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
-; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
ret i32 %val
}
-; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
ret i32 %val
}
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
; value, so here we have to zero-extend it.
-; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
ret i64 %val
}
-; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
ret i64 %val
}
@@ -55,22 +74,32 @@ define i64 @myctlz64_2(i64 %a) {
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
; natural return width of ptx's clz.b64 instruction. No conversions should be
; necessary in the PTX.
-; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
}
-; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
; and then truncating the result back down to i16. But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
-; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
ret i16 %val
}
-; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
ret i16 %val
}
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
; remain.
-; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
}
-; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index e424e72ecc8f5..57493558419c5 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,61 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
-; CHECK-LABEL: test_fabsf(
define float @test_fabsf(float %f) {
-; CHECK: abs.f32
+; CHECK-LABEL: test_fabsf(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0];
+; CHECK-NEXT: abs.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.fabs.f32(float %f)
ret float %x
}
-; CHECK-LABEL: test_fabs(
define double @test_fabs(double %d) {
-; CHECK: abs.f64
+; CHECK-LABEL: test_fabs(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x
}
-; CHECK-LABEL: test_nvvm_sqrt(
define float @test_nvvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_nvvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.nvvm.sqrt.f(float %a)
ret float %val
}
-; CHECK-LABEL: test_llvm_sqrt(
define float @test_llvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_llvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.sqrt.f32(float %a)
ret float %val
}
-; CHECK-LABEL: test_bitreverse32(
define i32 @test_bitreverse32(i32 %a) {
-; CHECK: brev.b32
+; CHECK-LABEL: test_bitreverse32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0];
+; CHECK-NEXT: brev.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_bitreverse64(
define i64 @test_bitreverse64(i64 %a) {
-; CHECK: brev.b64
+; CHECK-LABEL: test_bitreverse64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0];
+; CHECK-NEXT: brev.b64 %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %val
}
-; CHECK-LABEL: test_popc32(
define i32 @test_popc32(i32 %a) {
-; CHECK: popc.b32
+; CHECK-LABEL: test_popc32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0];
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_popc64
define i64 @test_popc64(i64 %a) {
-; CHECK: popc.b64
-; CHECK: cvt.u64.u32
+; CHECK-LABEL: test_popc64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
ret i64 %val
}
@@ -63,10 +121,17 @@ define i64 @test_popc64(i64 %a) {
; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
; if this function returns an i32, there's no need to do any type conversions
; in the ptx.
-; CHECK-LABEL: test_popc64_trunc
define i32 @test_popc64_trunc(i64 %a) {
-; CHECK: popc.b64
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc64_trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -74,11 +139,7 @@ define i32 @test_popc64_trunc(i64 %a) {
; llvm.ctpop.i16 is implemenented by converting to i32, running popc.b32, and
; then converting back to i16.
-; CHECK-LABEL: test_popc16
define void @test_popc16(i16 %a, ptr %b) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK: cvt.u16.u32
%val = call i16 @llvm.ctpop.i16(i16 %a)
store i16 %val, ptr %b
ret void
@@ -86,11 +147,16 @@ define void @test_popc16...
[truncated]
|
if (V.getValueType() == MVT::i64) { | ||
SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V); | ||
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may be worth a comment that CTPOP node here is intentionally created with 64-bit input and 32-bit output and that we'll later pattern-match it to popc
instruction.
Without the explanation this code looks as if we accidentally run CTPOP on only half of the bits of the input value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ended up doing some more refactoring of this change, but I've added a comment to the new lowering function which hopefully explains what is going on.
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap); | ||
} | ||
if (V.getValueType() == MVT::i64) { | ||
SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ditto.
; CHECK-LABEL: test_popc16 | ||
define void @test_popc16(i16 %a, ptr %b) { | ||
; CHECK: cvt.u32.u16 | ||
; CHECK: popc.b32 | ||
; CHECK: cvt.u16.u32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is odd. Where did the checks for this function go?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, this happened because different CHECKs were needed for nvptx vs nvptx64. I've updated the check-prefixes and now checks are generated for this test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes me wonder why we need to store the result through a pointer at all. I suspect it used to be needed to avoid collapsing trunc
as we'd promote the return value to i32. In this case, we generate a bit better code by using a truncating store. I suspect it will also happen if we'd just store the result in the return value.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant. If there are no other pointer-using test cases in the file, we could skip the 32-bit run.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant.
It looks like the return type just gets promoted and the 32-bit value gets stored:
// .globl test_popc16 // -- Begin function test_popc16
.visible .func (.param .b32 func_retval0) test_popc16(
.param .b32 test_popc16_param_0,
.param .b32 test_popc16_param_1
) // @test_popc16
{
.reg .b32 %r<3>;
// %bb.0:
ld.param.u16 %r1, [test_popc16_param_0];
popc.b32 %r2, %r1;
st.param.b32 [func_retval0], %r2;
ret;
// -- End function
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we could return <2 x i16>
, with one element set to a constant, and the other one with the result of popc. That would keep result truncate in place.
Up to you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I'll just leave as is for simplicity.
e2a60bb
to
554db43
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes themselves are LGTM.
The test may benefit from a few more tweaks.
; CHECK-LABEL: test_popc16 | ||
define void @test_popc16(i16 %a, ptr %b) { | ||
; CHECK: cvt.u32.u16 | ||
; CHECK: popc.b32 | ||
; CHECK: cvt.u16.u32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes me wonder why we need to store the result as a pointer at all. I suspect it used to be needed to avoid collapsing trunc
as we'd promote return value to i32. In this case, we generate a bit better code by using truncating store. I suspect it will also happen if we'd just store the result in the return value.
If you change the test function to return i16, and it does clearly show that we're storing only 16 bits, then we can remove the pointer and collapse this test checks to just one variant. If there are no other pointer-using test cases in the file, we could skip the 32-bit run.
By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization.
By pulling the truncates and extensions out of operations during operation legalization we enable more optimization via DAGCombiner. While the test cases show only cosmetic improvements (unlikely to impact the final SASS) in real programs the exposure of these truncates can allow for more optimization.