Skip to content

Commit

Permalink
[RISCV] Emit VP strided loads/stores in RISCVGatherScatterLowering
Browse files Browse the repository at this point in the history
RISCVGatherScatterLowering is the main user of the riscv_masked_strided_{load,store} intrinsics, which we can remove once they are replaced with their VP equivalents.

Submitting early as a draft to show the regressions in the test diff, which llvm#97800 and llvm#97798 (or the CGP version) are needed to fix.
  • Loading branch information
lukel97 committed Jul 9, 2024
1 parent 3f83a69 commit d0899ba
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 132 deletions.
18 changes: 12 additions & 6 deletions llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,17 +515,23 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,

Builder.SetInsertPoint(II);

Value *EVL = Builder.CreateElementCount(
IntegerType::get(Ctx, 32), cast<VectorType>(DataType)->getElementCount());

CallInst *Call;
if (II->getIntrinsicID() == Intrinsic::masked_gather)
if (II->getIntrinsicID() == Intrinsic::masked_gather) {
Call = Builder.CreateIntrinsic(
Intrinsic::riscv_masked_strided_load,
Intrinsic::experimental_vp_strided_load,
{DataType, BasePtr->getType(), Stride->getType()},
{II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
else
{BasePtr, Stride, II->getArgOperand(2), EVL});
Call = Builder.CreateIntrinsic(
Intrinsic::vp_select, {DataType},
{II->getOperand(2), Call, II->getArgOperand(3), EVL});
} else
Call = Builder.CreateIntrinsic(
Intrinsic::riscv_masked_strided_store,
Intrinsic::experimental_vp_strided_store,
{DataType, BasePtr->getType(), Stride->getType()},
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL});

Call->takeName(II);
II->replaceAllUsesWith(Call);
Expand Down
87 changes: 27 additions & 60 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,9 @@ define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: .LBB3_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a3, 0(a1)
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a3
; CHECK-NEXT: vlse8.v v8, (a1), zero
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
Expand Down Expand Up @@ -182,9 +182,9 @@ define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapt
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB4_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lw a3, 0(a1)
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a3
; CHECK-NEXT: vlse32.v v8, (a1), zero
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 8
; CHECK-NEXT: addi a1, a1, 160
Expand Down Expand Up @@ -214,57 +214,22 @@ for.cond.cleanup: ; preds = %vector.body
}

define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V: # %bb.0: # %entry
; V-NEXT: addi a2, a0, 1024
; V-NEXT: li a3, 32
; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT: .LBB5_1: # %vector.body
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: vlse8.v v8, (a1), zero
; V-NEXT: vle8.v v9, (a0)
; V-NEXT: vdivu.vv v8, v8, v9
; V-NEXT: vse8.v v8, (a0)
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bne a0, a2, .LBB5_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: addi a2, a0, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT: .LBB5_1: # %vector.body
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: vlse8.v v8, (a1), zero
; ZVE32F-NEXT: vle8.v v9, (a0)
; ZVE32F-NEXT: vdivu.vv v8, v8, v9
; ZVE32F-NEXT: vse8.v v8, (a0)
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
; ZVE32F-NEXT: bne a0, a2, .LBB5_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
;
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
; NOT-OPTIMIZED: # %bb.0: # %entry
; NOT-OPTIMIZED-NEXT: addi a2, a0, 1024
; NOT-OPTIMIZED-NEXT: li a3, 32
; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; NOT-OPTIMIZED-NEXT: .LBB5_1: # %vector.body
; NOT-OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1
; NOT-OPTIMIZED-NEXT: lbu a3, 0(a1)
; NOT-OPTIMIZED-NEXT: vle8.v v8, (a0)
; NOT-OPTIMIZED-NEXT: vmv.v.x v9, a3
; NOT-OPTIMIZED-NEXT: vdivu.vv v8, v9, v8
; NOT-OPTIMIZED-NEXT: vse8.v v8, (a0)
; NOT-OPTIMIZED-NEXT: addi a0, a0, 32
; NOT-OPTIMIZED-NEXT: addi a1, a1, 160
; NOT-OPTIMIZED-NEXT: bne a0, a2, .LBB5_1
; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
; NOT-OPTIMIZED-NEXT: ret
; CHECK-LABEL: gather_zero_stride_unfold:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: .LBB5_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a1), zero
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vdivu.vv v8, v8, v9
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB5_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
br label %vector.body

Expand Down Expand Up @@ -962,9 +927,9 @@ define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocaptu
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB16_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flw fa5, 0(a1)
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfadd.vf v8, v8, fa5
; CHECK-NEXT: vlse32.v v8, (a1), zero
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: vfadd.vv v8, v9, v8
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: addi a1, a1, 640
Expand Down Expand Up @@ -992,3 +957,5 @@ vector.body: ; preds = %vector.body, %entry
for.cond.cleanup: ; preds = %vector.body
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; NOT-OPTIMIZED: {{.*}}
Loading

0 comments on commit d0899ba

Please sign in to comment.