Skip to content

Commit c3a3d20

Browse files
committed
[LV] Add analysis remark for mixed precision conversions
Floating point conversions inside vectorized loops have performance implications but are very subtle. The user could specify a floating point constant, or call a function without realizing that it will force a change in the vector width. An example of this behaviour is seen in https://godbolt.org/z/M3nT6c . The vectorizer should indicate when this happens because it is most likely unintended behaviour. This patch adds a simple check for this behaviour by following floating point stores in the original loop and checking if a floating point conversion operation occurs. Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D95539
1 parent 00c4e0a commit c3a3d20

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9441,6 +9441,51 @@ static bool processLoopInVPlanNativePath(
94419441
return true;
94429442
}
94439443

9444+
// Emit a remark if there are stores to floats that required a floating point
9445+
// extension. If the vectorized loop was generated with floating point there
9446+
// will be a performance penalty from the conversion overhead and the change in
9447+
// the vector width.
9448+
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9449+
SmallVector<Instruction *, 4> Worklist;
9450+
for (BasicBlock *BB : L->getBlocks()) {
9451+
for (Instruction &Inst : *BB) {
9452+
if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9453+
if (S->getValueOperand()->getType()->isFloatTy())
9454+
Worklist.push_back(S);
9455+
}
9456+
}
9457+
}
9458+
9459+
// Traverse the floating point stores upwards searching, for floating point
9460+
// conversions.
9461+
SmallPtrSet<const Instruction *, 4> Visited;
9462+
SmallPtrSet<const Instruction *, 4> EmittedRemark;
9463+
while (!Worklist.empty()) {
9464+
auto *I = Worklist.pop_back_val();
9465+
if (!L->contains(I))
9466+
continue;
9467+
if (!Visited.insert(I).second)
9468+
continue;
9469+
9470+
// Emit a remark if the floating point store required a floating
9471+
// point conversion.
9472+
// TODO: More work could be done to identify the root cause such as a
9473+
// constant or a function return type and point the user to it.
9474+
if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9475+
ORE->emit([&]() {
9476+
return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9477+
I->getDebugLoc(), L->getHeader())
9478+
<< "floating point conversion changes vector width. "
9479+
<< "Mixed floating point precision requires an up/down "
9480+
<< "cast that will negatively impact performance.";
9481+
});
9482+
9483+
for (Use &Op : I->operands())
9484+
if (auto *OpI = dyn_cast<Instruction>(Op))
9485+
Worklist.push_back(OpI);
9486+
}
9487+
}
9488+
94449489
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
94459490
: InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
94469491
!EnableLoopInterleaving),
@@ -9759,6 +9804,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
97599804
<< NV("VectorizationFactor", VF.Width)
97609805
<< ", interleaved count: " << NV("InterleaveCount", IC) << ")";
97619806
});
9807+
9808+
if (ORE->allowExtraAnalysis(LV_NAME))
9809+
checkMixedPrecision(L, ORE);
97629810
}
97639811

97649812
Optional<MDNode *> RemainderLoopID =
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
; RUN: opt -force-vector-interleave=2 -force-vector-width=4 -loop-vectorize -pass-remarks-analysis=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
2+
3+
; CHECK: remark: mixed-precision.c:3:26: floating point conversion changes vector width. Mixed floating point precision requires an up/down cast that will negatively impact performance.
4+
define void @f(float* noalias nocapture %X, i64 %N) {
5+
entry:
6+
br label %for.body
7+
8+
for.cond.cleanup:
9+
ret void
10+
11+
for.body:
12+
%i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
13+
%arrayidx = getelementptr inbounds float, float* %X, i64 %i
14+
%0 = load float, float* %arrayidx, align 4
15+
%conv = fpext float %0 to double, !dbg !9
16+
%mul = fmul double %conv, 0x3FD5555555555555
17+
%conv3 = fptrunc double %mul to float
18+
store float %conv3, float* %arrayidx, align 4
19+
%inc = add nuw i64 %i, 1
20+
%exitcond.not = icmp eq i64 %inc, %N
21+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
22+
}
23+
24+
; CHECK: remark: mixed-precision.c:8:8: floating point conversion changes vector width. Mixed floating point precision requires an up/down cast that will negatively impact performance.
25+
; CHECK: remark: mixed-precision.c:7:16: floating point conversion changes vector width. Mixed floating point precision requires an up/down cast that will negatively impact performance.
26+
; CHECK-NOT: remark: mixed-precision.c:7:16: floating point conversion changes vector width. Mixed floating point precision requires an up/down cast that will negatively impact performance.
27+
define void @g(float* noalias nocapture %X, float* noalias nocapture %Y, i64 %N) {
28+
entry:
29+
%pi = alloca double
30+
store double 0x400921FB54442D18, double* %pi
31+
%fac = load double, double* %pi
32+
br label %for.body
33+
34+
for.body:
35+
%i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
36+
%arrayidx = getelementptr inbounds float, float* %X, i64 %i
37+
%0 = load float, float* %arrayidx, align 4
38+
%conv = fpext float %0 to double, !dbg !10
39+
%mul = fmul double %conv, %fac
40+
%conv1 = fptrunc double %mul to float
41+
store float %conv1, float* %arrayidx, align 4
42+
%arrayidx5 = getelementptr inbounds float, float* %Y, i64 %i
43+
%1 = load float, float* %arrayidx5, align 4
44+
%conv2 = fpext float %1 to double, !dbg !11
45+
%mul2 = fmul double %conv2, %fac
46+
%conv3 = fptrunc double %mul2 to float
47+
store float %conv3, float* %arrayidx5, align 4
48+
%inc = add nuw nsw i64 %i, 1
49+
%exitcond.not = icmp eq i64 %inc, %N
50+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
51+
52+
for.cond.cleanup:
53+
ret void
54+
}
55+
56+
!llvm.dbg.cu = !{!0}
57+
!llvm.module.flags = !{!3, !4}
58+
59+
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
60+
!1 = !DIFile(filename: "mixed-precision.c", directory: "/tmp/mixed-precision.c")
61+
!2 = !{}
62+
!3 = !{i32 2, !"Debug Info Version", i32 3}
63+
!4 = !{i32 1, !"wchar_size", i32 4}
64+
!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
65+
!7 = distinct !DISubprogram(name: "g", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
66+
!8 = !DISubroutineType(types: !2)
67+
!9 = !DILocation(line: 3, column: 26, scope: !6)
68+
!10 = !DILocation(line: 7, column: 16, scope: !7)
69+
!11 = !DILocation(line: 8, column: 8, scope: !7)

0 commit comments

Comments
 (0)