Skip to content

Commit

Permalink
BlockFrequencyInfoImpl: Avoid big numbers, increase precision for sma…
Browse files Browse the repository at this point in the history
…ll spreads

BlockFrequencyInfo calculates block frequencies as Scaled64 numbers but as a last step converts them to unsigned 64bit integers (`BlockFrequency`). This improves the factors picked for this conversion so that:

* Avoid big numbers close to UINT64_MAX to avoid users overflowing/saturating when adding multiple frequencies together or when multiplying with integers. This leaves the topmost 10 bits unused to allow for some room.
* Spread the difference between hottest/coldest block as much as possible to increase precision.
* If the hot/cold spread cannot be represented, lose precision at the lower end, but keep the frequencies at the upper end differentiable for hot blocks.
  • Loading branch information
MatzeB authored Oct 25, 2023
1 parent 69ade08 commit e3cf80c
Show file tree
Hide file tree
Showing 90 changed files with 1,822 additions and 1,616 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,25 @@
// CHECK-NEXT: -: 4:
// CHECK-NEXT: 1: 5: int a = 1;
// CHECK-NEXT: 1: 6: if (a) {
// CHECK-NEXT:branch 0 taken 1
// CHECK-NEXT:branch 1 taken 0
// CHECK-NEXT:branch 0 taken 0
// CHECK-NEXT:branch 1 taken 1
// CHECK-NEXT: 1: 7: var++;
// CHECK-NEXT: 1: 8: }
// CHECK-NEXT: -: 9:
// CHECK-NEXT: 1: 10: if (a) {}
// CHECK-NEXT:branch 0 taken 1
// CHECK-NEXT:branch 1 taken 0
// CHECK-NEXT:branch 0 taken 0
// CHECK-NEXT:branch 1 taken 1
// CHECK-NEXT: -: 11:
// CHECK-NEXT: 1: 12: int b = 0;
// CHECK-NEXT: 1: 13: if (b) {
// CHECK-NEXT:branch 0 taken 0
// CHECK-NEXT:branch 1 taken 1
// CHECK-NEXT:branch 0 taken 1
// CHECK-NEXT:branch 1 taken 0
// CHECK-NEXT: #####: 14: var++;
// CHECK-NEXT: #####: 15: }
// CHECK-NEXT: -: 16:
// CHECK-NEXT: 1: 17: if (b) {}
// CHECK-NEXT:branch 0 taken 0
// CHECK-NEXT:branch 1 taken 1
// CHECK-NEXT:branch 0 taken 1
// CHECK-NEXT:branch 1 taken 0
// CHECK-NEXT: -: 18:
// CHECK-NEXT: 1: 19: return 0;
// CHECK-NEXT: -: 20:}
34 changes: 14 additions & 20 deletions llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,30 +481,24 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,

static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
const Scaled64 &Min, const Scaled64 &Max) {
// Scale the Factor to a size that creates integers. Ideally, integers would
// be scaled so that Max == UINT64_MAX so that they can be best
// differentiated. However, in the presence of large frequency values, small
// frequencies are scaled down to 1, making it impossible to differentiate
// small, unequal numbers. When the spread between Min and Max frequencies
// fits well within MaxBits, we make the scale be at least 8.
const unsigned MaxBits = 64;
const unsigned SpreadBits = (Max / Min).lg();
Scaled64 ScalingFactor;
if (SpreadBits <= MaxBits - 3) {
// If the values are small enough, make the scaling factor at least 8 to
// allow distinguishing small values.
ScalingFactor = Min.inverse();
ScalingFactor <<= 3;
} else {
// If the values need more than MaxBits to be represented, saturate small
// frequency values down to 1 by using a scaling factor that benefits large
// frequency values.
ScalingFactor = Scaled64(1, MaxBits) / Max;
}
// Scale the Factor to a size that creates integers. If possible scale
// integers so that Max == UINT64_MAX so that they can be best differentiated.
// It is possible that the range between min and max cannot be accurately
// represented in a 64bit integer without either losing precision for small
// values (so small unequal numbers all map to 1) or saturating big numbers,
// losing precision for big numbers (so unequal big numbers may map to
// UINT64_MAX). We choose to lose precision for small numbers.
const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
// Users often add up multiple BlockFrequency values or multiply them with
// things like instruction costs. Leave some room to avoid saturating
// operations reaching UINT64_MAX too early.
const unsigned Slack = 10;
Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;

// Translate the floats to integers.
LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
<< ", factor = " << ScalingFactor << "\n");
(void)Min;
for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
Expand Down
19 changes: 10 additions & 9 deletions llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)

; CHECK: Printing analysis {{.*}} for function 'main':
; CHECK-NEXT: block-frequency-info: main
define i32 @main() {
define i32 @main() !prof !6 {
entry:
%retval = alloca i32, align 4
%i = alloca i32, align 4
Expand Down Expand Up @@ -93,7 +93,7 @@ for.cond4: ; preds = %for.inc, %for.body3
%cmp5 = icmp slt i32 %2, 100
br i1 %cmp5, label %for.body6, label %for.end, !prof !3

; CHECK: - for.body6: float = 500000.5, int = 4000004
; CHECK: - for.body6: float = 1000000.0,{{.*}}count = 1000000
for.body6: ; preds = %for.cond4
call void @bar()
br label %for.inc
Expand Down Expand Up @@ -143,7 +143,7 @@ for.cond16: ; preds = %for.inc19, %for.bod
%cmp17 = icmp slt i32 %8, 10000
br i1 %cmp17, label %for.body18, label %for.end21, !prof !4

; CHECK: - for.body18: float = 499999.9, int = 3999998
; CHECK: - for.body18: float = 999999.5,{{.*}}count = 1000000
for.body18: ; preds = %for.cond16
call void @bar()
br label %for.inc19
Expand Down Expand Up @@ -175,7 +175,7 @@ for.cond26: ; preds = %for.inc29, %for.end
%cmp27 = icmp slt i32 %12, 1000000
br i1 %cmp27, label %for.body28, label %for.end31, !prof !5

; CHECK: - for.body28: float = 499995.2, int = 3999961
; CHECK: - for.body28: float = 1000224.3,{{.*}}count = 1000224
for.body28: ; preds = %for.cond26
call void @bar()
br label %for.inc29
Expand All @@ -197,8 +197,9 @@ for.end31: ; preds = %for.cond26
!llvm.ident = !{!0}

!0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
!1 = !{!"branch_weights", i32 101, i32 2}
!2 = !{!"branch_weights", i32 10001, i32 101}
!3 = !{!"branch_weights", i32 1000001, i32 10001}
!4 = !{!"branch_weights", i32 1000001, i32 101}
!5 = !{!"branch_weights", i32 1000001, i32 2}
!1 = !{!"branch_weights", i32 100, i32 1}
!2 = !{!"branch_weights", i32 10000, i32 100}
!3 = !{!"branch_weights", i32 1000000, i32 10000}
!4 = !{!"branch_weights", i32 1000000, i32 100}
!5 = !{!"branch_weights", i32 1000000, i32 1}
!6 = !{!"function_entry_count", i32 1}
43 changes: 43 additions & 0 deletions llvm/test/Analysis/BlockFrequencyInfo/precision.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
; Sanity check precision for small-ish min/max spread.

@g = global i32 0

; CHECK-LABEL: block-frequency-info: func0
; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
; CHECK: - join: float = 1.0, {{.*}}, count = 1000

; func0: a two-level diamond used to inspect block frequencies when the
; spread between the hottest and coldest block is small. The function entry
; count (!0) and the branch weights on the two conditional branches (!1, !2)
; are defined in the metadata at the end of this file. Every block performs a
; volatile store of a distinct constant to @g (presumably so that each block
; stays distinct and is reported separately -- confirm).
define void @func0(i32 %a0, i32 %a1) !prof !0 {
entry:
; First-level split on %a0 != 0, weighted by !1 (400 taken : 600 not taken).
%cmp0 = icmp ne i32 %a0, 0
br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1

cmp0_true:
store volatile i32 1, ptr @g
; Second-level split on %a1 != 0, weighted by !2 (1 taken : 3 not taken).
%cmp1 = icmp ne i32 %a1, 0
br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2

cmp0_false:
store volatile i32 2, ptr @g
br label %join

cmp1_true:
store volatile i32 3, ptr @g
br label %join

cmp1_false:
store volatile i32 4, ptr @g
br label %join

; All three paths (cmp0_false, cmp1_true, cmp1_false) merge here.
join:
store volatile i32 5, ptr @g
ret void
}

!0 = !{!"function_entry_count", i64 1000}
!1 = !{!"branch_weights", i32 400, i32 600}
!2 = !{!"branch_weights", i32 1, i32 3}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
; RUN: -pass-remarks-with-hotness -pass-remarks-hotness-threshold=1 \
; RUN: 2>&1 | FileCheck -check-prefix=THRESHOLD %s

; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.187500e+01 total spills cost 1 reloads 3.187500e+01 total reloads cost generated in loop{{$}}
; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.200000e+01 total spills cost 1 reloads 3.200000e+01 total reloads cost generated in loop{{$}}
; THRESHOLD-NOT: remark

define void @fpr128(ptr %p) nounwind ssp {
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AArch64/cfi-fixup.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ define i32 @f0(i32 %x) #0 {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: .cfi_remember_state
; CHECK-NEXT: cbz w0, .LBB0_4
; CHECK-NEXT: cbz w0, .LBB0_5
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: cmp w0, #2
; CHECK-NEXT: b.eq .LBB0_5
; CHECK-NEXT: b.eq .LBB0_4
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: cmp w0, #1
; CHECK-NEXT: b.ne .LBB0_6
Expand All @@ -22,20 +22,20 @@ define i32 @f0(i32 %x) #0 {
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: .LBB0_4: // %if.then5
; CHECK-NEXT: .cfi_restore_state
; CHECK-NEXT: .cfi_remember_state
; CHECK-NEXT: mov w0, #1
; CHECK-NEXT: bl g0
; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: sub w0, w8, w0
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_5: // %if.then5
; CHECK-NEXT: .LBB0_5:
; CHECK-NEXT: .cfi_restore_state
; CHECK-NEXT: .cfi_remember_state
; CHECK-NEXT: bl g0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: sub w0, w8, w0
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
Expand Down Expand Up @@ -115,7 +115,7 @@ define i32 @f2(i32 %x) #0 {
; CHECK-NEXT: cbz w0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %if.end
; CHECK-NEXT: bl g1
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: sub w0, w8, w0
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ define i32 @test(i32 %input, i32 %n, i32 %a) {
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %bb.0
; CHECK-NEXT: add w8, w0, w1
; CHECK-NEXT: mov w0, #100
; CHECK-NEXT: mov w0, #100 // =0x64
; CHECK-NEXT: cmp w8, #4
; CHECK-NEXT: b.hi .LBB0_5
; CHECK-NEXT: // %bb.3: // %bb.0
Expand All @@ -25,19 +25,19 @@ define i32 @test(i32 %input, i32 %n, i32 %a) {
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_5: // %bb.0
; CHECK-NEXT: cmp w8, #200
; CHECK-NEXT: b.ne .LBB0_10
; CHECK-NEXT: b.ne .LBB0_9
; CHECK-NEXT: // %bb.6: // %sw.bb7
; CHECK-NEXT: add w0, w2, #7
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_7: // %sw.bb1
; CHECK-NEXT: add w0, w2, #3
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_8: // %sw.bb3
; CHECK-NEXT: .LBB0_7: // %sw.bb3
; CHECK-NEXT: add w0, w2, #4
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_9: // %sw.bb5
; CHECK-NEXT: .LBB0_8: // %sw.bb5
; CHECK-NEXT: add w0, w2, #5
; CHECK-NEXT: .LBB0_10: // %return
; CHECK-NEXT: .LBB0_9: // %return
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_10: // %sw.bb1
; CHECK-NEXT: add w0, w2, #3
; CHECK-NEXT: ret
entry:
%b = add nsw i32 %input, %n
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/win64-jumptable.ll
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ declare void @g(i32, i32)
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LJTI0_0:
; CHECK: .word .LBB0_2-.Ltmp0
; CHECK: .word .LBB0_5-.Ltmp0
; CHECK: .word .LBB0_3-.Ltmp0
; CHECK: .word .LBB0_4-.Ltmp0
; CHECK: .word .LBB0_5-.Ltmp0
; CHECK: .text
; CHECK: .seh_endproc

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/wineh-bti.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ lbl4:

; CHECK: .LBB0_3:
; CHECK-NEXT: hint #36
; CHECK-NEXT: mov w0, #2
; CHECK-NEXT: mov w0, #4

; CHECK: .LBB0_4:
; CHECK-NEXT: hint #36
; CHECK-NEXT: mov w0, #4
; CHECK-NEXT: mov w0, #2

; CHECK: .LBB0_5:
; CHECK-NEXT: hint #36
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ machineFunctionInfo:
body: |
; GCN-LABEL: name: ra_introduces_vreg_def
; GCN: [[COPY_V0:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY_V0]]:vgpr_32 =
; GCN: [[COPY_V1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
Expand Down
Loading

0 comments on commit e3cf80c

Please sign in to comment.