diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 9e8227de261f1..1cacab9528caa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5113,7 +5113,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
                       DAG.getConstant(NVT.getSizeInBits() - OVT.getSizeInBits(),
                                       dl, NVT));
     }
-    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+    Results.push_back(
+        DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1, SDNodeFlags::NoWrap));
     break;
   }
   case ISD::CTLZ_ZERO_UNDEF: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b62c15ddb97d3..b768725b04256 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -764,16 +764,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // Custom handling for i8 intrinsics
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
-  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
-    setOperationAction(ISD::ABS, Ty, Legal);
-    setOperationAction(ISD::SMIN, Ty, Legal);
-    setOperationAction(ISD::SMAX, Ty, Legal);
-    setOperationAction(ISD::UMIN, Ty, Legal);
-    setOperationAction(ISD::UMAX, Ty, Legal);
+  setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
+                     {MVT::i16, MVT::i32, MVT::i64}, Legal);
 
-    setOperationAction(ISD::CTPOP, Ty, Legal);
-    setOperationAction(ISD::CTLZ, Ty, Legal);
-  }
+  setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i16,
+                     Promote);
+  setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
+  setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom);
 
   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2748,6 +2745,19 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
   return Op;
 }
 
+// PTX supports 64-bit CTLZ and CTPOP, but the instructions return a 32-bit
+// value. Lower these into a node that returns i32, then zero-extend the
+// result back to the original 64-bit width.
+static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
+  SDValue V = Op->getOperand(0);
+  assert(V.getValueType() == MVT::i64 &&
+         "Unexpected CTLZ/CTPOP type to legalize");
+
+  SDLoc DL(Op);
+  SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
+}
+
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2833,6 +2843,9 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
     // Used only for bf16 on SM80, where we select fma for non-ftz operation
     return PromoteBinOpIfF32FTZ(Op, DAG);
+  case ISD::CTPOP:
+  case ISD::CTLZ:
+    return lowerCTLZCTPOP(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f94d7099f1b0e..3c88551d7b23c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
           (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
 
-// Count leading zeros
 let hasSideEffects = false in {
-  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
-                         "clz.b32 \t$d, $a;", []>;
-  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-                         "clz.b64 \t$d, $a;", []>;
+  foreach RT = [I32RT, I64RT] in {
+    // Count leading zeros
+    def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+                                   "clz.b" # RT.Size # " \t$d, $a;",
+                                   [(set i32:$d, (ctlz RT.Ty:$a))]>;
+
+    // Population count
+    def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+                                    "popc.b" # RT.Size # " \t$d, $a;",
+                                    [(set i32:$d, (ctpop RT.Ty:$a))]>;
+  }
 }
 
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
-
-// The return type of the ctlz ISD node is the same as its input, but the PTX
-// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
-// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
-// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary. We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-//   mov.b32 $tmp, {0xffff, $a}
-//   ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value. This way we don't have to subtract 16 from the
-// result. Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz i16:$a)),
-          (SUBi16ri (CVT_u16_u32
-           (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
-          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
-
-// Population count
-let hasSideEffects = false in {
-  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
-                          "popc.b32 \t$d, $a;", []>;
-  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-                          "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop i16:$a),
-          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
-          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
-
 // fpround f32 -> f16
 def : Pat<(f16 (fpround f32:$a)), (CVT_f16_f32 $a, CvtRN)>;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 9f91504ad9966..1443e5c46346c 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
@@ -10,44 +11,62 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
 ; There should be no difference between llvm.ctlz.i32(%a, true) and
 ; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
-; CHECK-LABEL: myctlz(
 define i32 @myctlz(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
   ret i32 %val
 }
 
-; CHECK-LABEL: myctlz_2(
 define i32 @myctlz_2(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
   ret i32 %val
 }
 
 ; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
 ; value, so here we have to zero-extend it.
-; CHECK-LABEL: myctlz64(
 define i64 @myctlz64(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
   ret i64 %val
 }
 
-; CHECK-LABEL: myctlz64_2(
 define i64 @myctlz64_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
   ret i64 %val
 }
@@ -55,22 +74,32 @@ define i64 @myctlz64_2(i64 %a) {
 ; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
 ; natural return width of ptx's clz.b64 instruction. No conversions should be
 ; necessary in the PTX.
-; CHECK-LABEL: myctlz64_as_32(
 define i32 @myctlz64_as_32(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
   %trunc = trunc i64 %val to i32
   ret i32 %trunc
 }
 
-; CHECK-LABEL: myctlz64_as_32_2(
 define i32 @myctlz64_as_32_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
   %trunc = trunc i64 %val to i32
   ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
 ; and then truncating the result back down to i16. But the NVPTX ABI
 ; zero-extends i16 return values to i32, so the final truncation doesn't appear
 ; in this function.
-; CHECK-LABEL: myctlz_ret16(
 define i16 @myctlz_ret16(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   ret i16 %val
 }
 
-; CHECK-LABEL: myctlz_ret16_2(
 define i16 @myctlz_ret16_2(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: clz.b32 %r3, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
   ret i16 %val
 }
 
 ; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
 ; remain.
-; CHECK-LABEL: myctlz_store16(
 define void @myctlz_store16(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
   ret void
 }
 
-; CHECK-LABEL: myctlz_store16_2(
 define void @myctlz_store16_2(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %r3;
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index e424e72ecc8f5..cc6af060d6c0a 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,61 +1,119 @@
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK64
 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
 
-; CHECK-LABEL: test_fabsf(
 define float @test_fabsf(float %f) {
-; CHECK: abs.f32
+; CHECK-LABEL: test_fabsf(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0];
+; CHECK-NEXT: abs.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
   %x = call float @llvm.fabs.f32(float %f)
   ret float %x
 }
 
-; CHECK-LABEL: test_fabs(
 define double @test_fabs(double %d) {
-; CHECK: abs.f64
+; CHECK-LABEL: test_fabs(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT: ret;
   %x = call double @llvm.fabs.f64(double %d)
   ret double %x
 }
 
-; CHECK-LABEL: test_nvvm_sqrt(
 define float @test_nvvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_nvvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
   %val = call float @llvm.nvvm.sqrt.f(float %a)
   ret float %val
 }
 
-; CHECK-LABEL: test_llvm_sqrt(
 define float @test_llvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_llvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
   %val = call float @llvm.sqrt.f32(float %a)
   ret float %val
 }
 
-; CHECK-LABEL: test_bitreverse32(
 define i32 @test_bitreverse32(i32 %a) {
-; CHECK: brev.b32
+; CHECK-LABEL: test_bitreverse32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0];
+; CHECK-NEXT: brev.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %val
 }
 
-; CHECK-LABEL: test_bitreverse64(
 define i64 @test_bitreverse64(i64 %a) {
-; CHECK: brev.b64
+; CHECK-LABEL: test_bitreverse64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0];
+; CHECK-NEXT: brev.b64 %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %val
 }
 
-; CHECK-LABEL: test_popc32(
 define i32 @test_popc32(i32 %a) {
-; CHECK: popc.b32
+; CHECK-LABEL: test_popc32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0];
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctpop.i32(i32 %a)
   ret i32 %val
 }
 
-; CHECK-LABEL: test_popc64
 define i64 @test_popc64(i64 %a) {
-; CHECK: popc.b64
-; CHECK: cvt.u64.u32
+; CHECK-LABEL: test_popc64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctpop.i64(i64 %a)
   ret i64 %val
 }
@@ -63,10 +121,17 @@ define i64 @test_popc64(i64 %a) {
 
 ; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
 ; if this function returns an i32, there's no need to do any type conversions
 ; in the ptx.
-; CHECK-LABEL: test_popc64_trunc
 define i32 @test_popc64_trunc(i64 %a) {
-; CHECK: popc.b64
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc64_trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
   %val = call i64 @llvm.ctpop.i64(i64 %a)
   %trunc = trunc i64 %val to i32
   ret i32 %trunc
@@ -74,11 +139,29 @@ define i32 @test_popc64_trunc(i64 %a) {
 
 ; llvm.ctpop.i16 is implemenented by converting to i32, running popc.b32, and
 ; then converting back to i16.
-; CHECK-LABEL: test_popc16
 define void @test_popc16(i16 %a, ptr %b) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK: cvt.u16.u32
+; CHECK32-LABEL: test_popc16(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b32 %r<4>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0:
+; CHECK32-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK32-NEXT: popc.b32 %r2, %r1;
+; CHECK32-NEXT: ld.param.u32 %r3, [test_popc16_param_1];
+; CHECK32-NEXT: st.u16 [%r3], %r2;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: test_popc16(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<2>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0:
+; CHECK64-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK64-NEXT: popc.b32 %r2, %r1;
+; CHECK64-NEXT: ld.param.u64 %rd1, [test_popc16_param_1];
+; CHECK64-NEXT: st.u16 [%rd1], %r2;
+; CHECK64-NEXT: ret;
   %val = call i16 @llvm.ctpop.i16(i16 %a)
   store i16 %val, ptr %b
   ret void
@@ -86,11 +169,16 @@ define void @test_popc16(i16 %a, ptr %b) {
 
 ; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need
 ; to do any conversions after calling popc.b32, because that returns an i32.
-; CHECK-LABEL: test_popc16_to_32
 define i32 @test_popc16_to_32(i16 %a) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc16_to_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [test_popc16_to_32_param_0];
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctpop.i16(i16 %a)
   %zext = zext i16 %val to i32
   ret i32 %zext
@@ -98,78 +186,118 @@ define i32 @test_popc16_to_32(i16 %a) {
 
 ; Most of nvvm.read.ptx.sreg.* intrinsics always return the same value and may
 ; be CSE'd.
-; CHECK-LABEL: test_tid
 define i32 @test_tid() {
-; CHECK: mov.u32 %r{{.*}}, %tid.x;
+; CHECK-LABEL: test_tid(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %tid.x;
+; CHECK-NEXT: add.s32 %r2, %r1, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
   %a = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; CHECK-NOT: mov.u32 %r{{.*}}, %tid.x;
   %b = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %ret = add i32 %a, %b
-; CHECK: ret
   ret i32 %ret
 }
 
 ; reading clock() or clock64() should not be CSE'd as each read may return
 ; different value.
-; CHECK-LABEL: test_clock
 define i32 @test_clock() {
-; CHECK: mov.u32 %r{{.*}}, %clock;
+; CHECK-LABEL: test_clock(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %clock;
+; CHECK-NEXT: mov.u32 %r2, %clock;
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
   %a = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
-; CHECK: mov.u32 %r{{.*}}, %clock;
   %b = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
   %ret = add i32 %a, %b
-; CHECK: ret
   ret i32 %ret
 }
 
-; CHECK-LABEL: test_clock64
 define i64 @test_clock64() {
-; CHECK: mov.u64 %r{{.*}}, %clock64;
+; CHECK-LABEL: test_clock64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %clock64;
+; CHECK-NEXT: mov.u64 %rd2, %clock64;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
   %a = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
-; CHECK: mov.u64 %r{{.*}}, %clock64;
   %b = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
   %ret = add i64 %a, %b
-; CHECK: ret
   ret i64 %ret
 }
 
-; CHECK-LABEL: test_exit
 define void @test_exit() {
-; CHECK: exit;
+; CHECK-LABEL: test_exit(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: exit;
+; CHECK-NEXT: ret;
   call void @llvm.nvvm.exit()
   ret void
 }
 
-; CHECK-LABEL: test_globaltimer
 define i64 @test_globaltimer() {
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+; CHECK-LABEL: test_globaltimer(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
+; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
   %a = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
   %b = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
   %ret = add i64 %a, %b
-; CHECK: ret
   ret i64 %ret
 }
 
-; CHECK-LABEL: test_cyclecounter
 define i64 @test_cyclecounter() {
-; CHECK: mov.u64 %r{{.*}}, %clock64;
+; CHECK-LABEL: test_cyclecounter(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %clock64;
+; CHECK-NEXT: mov.u64 %rd2, %clock64;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
   %a = tail call i64 @llvm.readcyclecounter()
-; CHECK: mov.u64 %r{{.*}}, %clock64;
   %b = tail call i64 @llvm.readcyclecounter()
   %ret = add i64 %a, %b
-; CHECK: ret
   ret i64 %ret
 }
 
-; CHECK-LABEL: test_steadycounter
 define i64 @test_steadycounter() {
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+; CHECK-LABEL: test_steadycounter(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
+; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
   %a = tail call i64 @llvm.readsteadycounter()
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
   %b = tail call i64 @llvm.readsteadycounter()
   %ret = add i64 %a, %b
-; CHECK: ret
   ret i64 %ret
 }
diff --git a/llvm/test/CodeGen/VE/Scalar/ctlz.ll b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
index 602b9a86bf032..c2af9753f8bb6 100644
--- a/llvm/test/CodeGen/VE/Scalar/ctlz.ll
+++ b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
@@ -200,7 +200,6 @@ define zeroext i32 @func32zx(i32 zeroext %p) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: ldz %s0, %s0
 ; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: and %s0, %s0, (32)0
 ; CHECK-NEXT: b.l.t (, %s10)
   %r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 false)
   ret i32 %r
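
Note on the combined effect of the changes above: lowerCTLZCTPOP() emits
(zext nneg (ctlz/ctpop i32 ...)), so the i64 narrowing case that the removed
(i32 (trunc (i64 (ctpop i64:$a)))) .td pattern used to special-case is now
handled generically in the DAG, where the zext/trunc pair cancels. A minimal
standalone sketch of that case (the function name is made up for illustration;
the updated ctlz.ll and intrinsics.ll tests above verify the same shape):

  ; Compile with: llc -mtriple=nvptx64 -mcpu=sm_20
  declare i64 @llvm.ctpop.i64(i64)

  define i32 @popc64_narrow(i64 %x) {
    ; custom-lowered to (zext nneg (ctpop ...)) returning i32
    %c = call i64 @llvm.ctpop.i64(i64 %x)
    ; the truncate cancels the zero-extension, leaving a bare popc.b64
    %t = trunc i64 %c to i32
    ret i32 %t
  }

The expected PTX should contain popc.b64 followed directly by the i32
parameter store, with no cvt.u64.u32, matching test_popc64_trunc above.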