BlockFrequencyInfoImpl: Avoid big numbers, increase precision for sma…

…ll spreads BlockFrequencyInfo calculates block frequencies as Scaled64 numbers but as a last step converts them to unsigned 64bit integers (`BlockFrequency`). This improves the factors picked for this conversion so that: * Avoid big numbers close to UINT64_MAX to avoid users overflowing/saturating when adding multiply frequencies together or when multiplying with integers. This leaves the topmost 10 bits unused to allow for some room. * Spread the difference between hottest/coldest block as much as possible to increase precision. * If the hot/cold spread cannot be represented loose precision at the lower end, but keep the frequencies at the upper end for hot blocks differentiable.
llvm · Oct 25, 2023 · e3cf80c · e3cf80c
1 parent 69ade08
commit e3cf80c
Show file tree

Hide file tree

Showing 90 changed files with 1,822 additions and 1,616 deletions.
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
@@ -10,25 +10,25 @@
 // CHECK-NEXT:        -:    4:
 // CHECK-NEXT:        1:    5:  int a = 1;
 // CHECK-NEXT:        1:    6:  if (a) {
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        1:    7:    var++;
 // CHECK-NEXT:        1:    8:  }
 // CHECK-NEXT:        -:    9:
 // CHECK-NEXT:        1:   10:  if (a) {}
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        -:   11:
 // CHECK-NEXT:        1:   12:  int b = 0;
 // CHECK-NEXT:        1:   13:  if (b) {
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:    #####:   14:    var++;
 // CHECK-NEXT:    #####:   15:  }
 // CHECK-NEXT:        -:   16:
 // CHECK-NEXT:        1:   17:  if (b) {}
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:        -:   18:
 // CHECK-NEXT:        1:   19:  return 0;
 // CHECK-NEXT:        -:   20:}
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -481,30 +481,24 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
-  // Scale the Factor to a size that creates integers.  Ideally, integers would
-  // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, in the presence of large frequency values, small
-  // frequencies are scaled down to 1, making it impossible to differentiate
-  // small, unequal numbers. When the spread between Min and Max frequencies
-  // fits well within MaxBits, we make the scale be at least 8.
-  const unsigned MaxBits = 64;
-  const unsigned SpreadBits = (Max / Min).lg();
-  Scaled64 ScalingFactor;
-  if (SpreadBits <= MaxBits - 3) {
-    // If the values are small enough, make the scaling factor at least 8 to
-    // allow distinguishing small values.
-    ScalingFactor = Min.inverse();
-    ScalingFactor <<= 3;
-  } else {
-    // If the values need more than MaxBits to be represented, saturate small
-    // frequency values down to 1 by using a scaling factor that benefits large
-    // frequency values.
-    ScalingFactor = Scaled64(1, MaxBits) / Max;
-  }
+  // Scale the Factor to a size that creates integers.  If possible scale
+  // integers so that Max == UINT64_MAX so that they can be best differentiated.
+  // Is is possible that the range between min and max cannot be accurately
+  // represented in a 64bit integer without either loosing precision for small
+  // values (so small unequal numbers all map to 1) or saturaturing big numbers
+  // loosing precision for big numbers (so unequal big numbers may map to
+  // UINT64_MAX). We choose to loose precision for small numbers.
+  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
+  // Users often add up multiple BlockFrequency values or multiply them with
+  // things like instruction costs. Leave some room to avoid saturating
+  // operations reaching UIN64_MAX too early.
+  const unsigned Slack = 10;
+  Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;
 
   // Translate the floats to integers.
   LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
                     << ", factor = " << ScalingFactor << "\n");
+  (void)Min;
   for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
     Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
     BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)
 
 ; CHECK: Printing analysis {{.*}} for function 'main':
 ; CHECK-NEXT: block-frequency-info: main
-define i32 @main() {
+define i32 @main() !prof !6 {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -93,7 +93,7 @@ for.cond4:                                        ; preds = %for.inc, %for.body3
   %cmp5 = icmp slt i32 %2, 100
   br i1 %cmp5, label %for.body6, label %for.end, !prof !3
 
-; CHECK: - for.body6: float = 500000.5, int = 4000004
+; CHECK: - for.body6: float = 1000000.0,{{.*}}count = 1000000
 for.body6:                                        ; preds = %for.cond4
   call void @bar()
   br label %for.inc
@@ -143,7 +143,7 @@ for.cond16:                                       ; preds = %for.inc19, %for.bod
   %cmp17 = icmp slt i32 %8, 10000
   br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
 
-; CHECK: - for.body18: float = 499999.9, int = 3999998
+; CHECK: - for.body18: float = 999999.5,{{.*}}count = 1000000
 for.body18:                                       ; preds = %for.cond16
   call void @bar()
   br label %for.inc19
@@ -175,7 +175,7 @@ for.cond26:                                       ; preds = %for.inc29, %for.end
   %cmp27 = icmp slt i32 %12, 1000000
   br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
 
-; CHECK: - for.body28: float = 499995.2, int = 3999961
+; CHECK: - for.body28: float = 1000224.3,{{.*}}count = 1000224
 for.body28:                                       ; preds = %for.cond26
   call void @bar()
   br label %for.inc29
@@ -197,8 +197,9 @@ for.end31:                                        ; preds = %for.cond26
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
-!1 = !{!"branch_weights", i32 101, i32 2}
-!2 = !{!"branch_weights", i32 10001, i32 101}
-!3 = !{!"branch_weights", i32 1000001, i32 10001}
-!4 = !{!"branch_weights", i32 1000001, i32 101}
-!5 = !{!"branch_weights", i32 1000001, i32 2}
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 10000, i32 100}
+!3 = !{!"branch_weights", i32 1000000, i32 10000}
+!4 = !{!"branch_weights", i32 1000000, i32 100}
+!5 = !{!"branch_weights", i32 1000000, i32 1}
+!6 = !{!"function_entry_count", i32 1}
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
+; Sanity check precision for small-ish min/max spread.
+
+@g = global i32 0
+
+; CHECK-LABEL: block-frequency-info: func0
+; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
+; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
+; CHECK: - join: float = 1.0, {{.*}}, count = 1000
+
+define void @func0(i32 %a0, i32 %a1) !prof !0 {
+entry:
+  %cmp0 = icmp ne i32 %a0, 0
+  br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1
+
+cmp0_true:
+  store volatile i32 1, ptr @g
+  %cmp1 = icmp ne i32 %a1, 0
+  br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2
+
+cmp0_false:
+  store volatile i32 2, ptr @g
+  br label %join
+
+cmp1_true:
+  store volatile i32 3, ptr @g
+  br label %join
+
+cmp1_false:
+  store volatile i32 4, ptr @g
+  br label %join
+
+join:
+  store volatile i32 5, ptr @g
+  ret void
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 400, i32 600}
+!2 = !{!"branch_weights", i32 1, i32 3}
diff --git a/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll b/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
@@ -5,7 +5,7 @@
 ; RUN:       -pass-remarks-with-hotness -pass-remarks-hotness-threshold=1 \
 ; RUN:       2>&1 | FileCheck -check-prefix=THRESHOLD %s
 
-; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.187500e+01 total spills cost 1 reloads 3.187500e+01 total reloads cost generated in loop{{$}}
+; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.200000e+01 total spills cost 1 reloads 3.200000e+01 total reloads cost generated in loop{{$}}
 ; THRESHOLD-NOT: remark
 
 define void @fpr128(ptr %p) nounwind ssp {

diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup.ll b/llvm/test/CodeGen/AArch64/cfi-fixup.ll
@@ -8,10 +8,10 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    cbz w0, .LBB0_4
+; CHECK-NEXT:    cbz w0, .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    cmp w0, #2
-; CHECK-NEXT:    b.eq .LBB0_5
+; CHECK-NEXT:    b.eq .LBB0_4
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    cmp w0, #1
 ; CHECK-NEXT:    b.ne .LBB0_6
@@ -22,20 +22,20 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:  .LBB0_4: // %if.then5
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    bl g0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_5: // %if.then5
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    bl g0
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    sub w0, w8, w0
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
@@ -115,7 +115,7 @@ define i32 @f2(i32 %x) #0 {
 ; CHECK-NEXT:    cbz w0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %if.end
 ; CHECK-NEXT:    bl g1
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0

diff --git a/llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll b/llvm/test/CodeGen/AArch64/redundant-mov-from-zero-extend.ll
@@ -10,7 +10,7 @@ define i32 @test(i32 %input, i32 %n, i32 %a) {
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %bb.0
 ; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    mov w0, #100
+; CHECK-NEXT:    mov w0, #100 // =0x64
 ; CHECK-NEXT:    cmp w8, #4
 ; CHECK-NEXT:    b.hi .LBB0_5
 ; CHECK-NEXT:  // %bb.3: // %bb.0
@@ -25,19 +25,19 @@ define i32 @test(i32 %input, i32 %n, i32 %a) {
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_5: // %bb.0
 ; CHECK-NEXT:    cmp w8, #200
-; CHECK-NEXT:    b.ne .LBB0_10
+; CHECK-NEXT:    b.ne .LBB0_9
 ; CHECK-NEXT:  // %bb.6: // %sw.bb7
 ; CHECK-NEXT:    add w0, w2, #7
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_7: // %sw.bb1
-; CHECK-NEXT:    add w0, w2, #3
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_8: // %sw.bb3
+; CHECK-NEXT:  .LBB0_7: // %sw.bb3
 ; CHECK-NEXT:    add w0, w2, #4
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_9: // %sw.bb5
+; CHECK-NEXT:  .LBB0_8: // %sw.bb5
 ; CHECK-NEXT:    add w0, w2, #5
-; CHECK-NEXT:  .LBB0_10: // %return
+; CHECK-NEXT:  .LBB0_9: // %return
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_10: // %sw.bb1
+; CHECK-NEXT:    add w0, w2, #3
 ; CHECK-NEXT:    ret
 entry:
   %b = add nsw i32 %input, %n

diff --git a/llvm/test/CodeGen/AArch64/win64-jumptable.ll b/llvm/test/CodeGen/AArch64/win64-jumptable.ll
@@ -42,9 +42,9 @@ declare void @g(i32, i32)
 ; CHECK-NEXT: .p2align  2
 ; CHECK-NEXT: .LJTI0_0:
 ; CHECK:    .word .LBB0_2-.Ltmp0
+; CHECK:    .word .LBB0_5-.Ltmp0
 ; CHECK:    .word .LBB0_3-.Ltmp0
 ; CHECK:    .word .LBB0_4-.Ltmp0
-; CHECK:    .word .LBB0_5-.Ltmp0
 ; CHECK:    .text
 ; CHECK:    .seh_endproc
 

diff --git a/llvm/test/CodeGen/AArch64/wineh-bti.ll b/llvm/test/CodeGen/AArch64/wineh-bti.ll
@@ -47,11 +47,11 @@ lbl4:
 
 ; CHECK:      .LBB0_3:
 ; CHECK-NEXT: hint #36
-; CHECK-NEXT: mov w0, #2
+; CHECK-NEXT: mov w0, #4
 
 ; CHECK:      .LBB0_4:
 ; CHECK-NEXT: hint #36
-; CHECK-NEXT: mov w0, #4
+; CHECK-NEXT: mov w0, #2
 
 ; CHECK:      .LBB0_5:
 ; CHECK-NEXT: hint #36

diff --git a/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
@@ -15,7 +15,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: ra_introduces_vreg_def
   ; GCN: [[COPY_V0:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN: [[COPY_V0]]:vgpr_32 =
+  ; GCN: [[COPY_V1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   bb.0:
     liveins: $vgpr0, $vgpr1
     %0:vgpr_32 = COPY $vgpr0