-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[SimplifyCFG] Fold the contiguous wrapping cases into ICmp. #161000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-coroutines @llvm/pr-subscribers-llvm-transforms Author: dianqk (dianqk) ChangesFixes #157113. Take the following IR as an example; we know the destination of the define i32 @<!-- -->src(i8 range(i8 0, 6) %arg) {
switch i8 %arg, label %else [
i8 0, label %if
i8 4, label %if
i8 5, label %if
]
if:
ret i32 0
else:
ret i32 1
} We can first try the non-wrapping range for both destinations, but I don't see how that would be any better. Proof: https://alive2.llvm.org/ce/z/acdWD4. Full diff: https://github.com/llvm/llvm-project/pull/161000.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 5e719c6c8cbb7..b02aed3f8f5ea 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5761,15 +5761,49 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
return Changed;
}
-static bool casesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+static bool casesAreContiguous(Value *Condition,
+ SmallVectorImpl<ConstantInt *> &Cases,
+ ConstantInt *&ContiguousCasesMin,
+ ConstantInt *&ContiguousCasesMax,
+ bool &IsWrapping) {
assert(Cases.size() >= 1);
array_pod_sort(Cases.begin(), Cases.end(), constantIntSortPredicate);
- for (size_t I = 1, E = Cases.size(); I != E; ++I) {
- if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
+ auto Min = Cases.back()->getValue();
+ auto Max = Cases.front()->getValue();
+ auto Offset = Max - Min;
+ auto ContiguousOffset = Cases.size() - 1;
+ if (Offset == ContiguousOffset) {
+ ContiguousCasesMin = Cases.back();
+ ContiguousCasesMax = Cases.front();
+ return true;
+ }
+ ConstantRange CR = computeConstantRange(Condition, /*ForSigned=*/false);
+ // If this is a wrapping contiguous range, that is, [Min, OtherMin] +
+ // [OtherMax, Max] (also [OtherMax, OtherMin]), [OtherMin+1, OtherMax-1] is a
+ // contiguous range for the other destination. N.B. If CR is not a full range,
+ // Max+1 is not equal to Min. It's not continuous in arithmetic.
+ if (Max == CR.getUnsignedMax() && Min == CR.getUnsignedMin()) {
+ assert(Cases.size() >= 2);
+ auto *It =
+ std::adjacent_find(Cases.begin(), Cases.end(), [](auto L, auto R) {
+ return L->getValue() != R->getValue() + 1;
+ });
+ if (It == Cases.end())
return false;
+ auto *OtherMax = *It;
+ auto *OtherMin = *(It + 1);
+ if ((Max - OtherMax->getValue()) + (OtherMin->getValue() - Min) ==
+ Cases.size() - 2) {
+ ContiguousCasesMin = cast<ConstantInt>(
+ ConstantInt::get(OtherMin->getType(), OtherMin->getValue() + 1));
+ ContiguousCasesMax = cast<ConstantInt>(
+ ConstantInt::get(OtherMax->getType(), OtherMax->getValue() - 1));
+ IsWrapping = true;
+ return true;
+ }
}
- return true;
+ return false;
}
static void createUnreachableSwitchDefault(SwitchInst *Switch,
@@ -5840,25 +5874,34 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
assert(!CasesA.empty() || HasDefault);
// Figure out if one of the sets of cases form a contiguous range.
- SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
+ ConstantInt *ContiguousCasesMin = nullptr;
+ ConstantInt *ContiguousCasesMax = nullptr;
BasicBlock *ContiguousDest = nullptr;
BasicBlock *OtherDest = nullptr;
- if (!CasesA.empty() && casesAreContiguous(CasesA)) {
- ContiguousCases = &CasesA;
+ bool IsWrapping = false;
+ if (!CasesA.empty() &&
+ casesAreContiguous(SI->getCondition(), CasesA, ContiguousCasesMin,
+ ContiguousCasesMax, IsWrapping)) {
ContiguousDest = DestA;
OtherDest = DestB;
- } else if (casesAreContiguous(CasesB)) {
- ContiguousCases = &CasesB;
+ } else if (casesAreContiguous(SI->getCondition(), CasesB, ContiguousCasesMin,
+ ContiguousCasesMax, IsWrapping)) {
ContiguousDest = DestB;
OtherDest = DestA;
} else
return false;
+ if (IsWrapping)
+ std::swap(ContiguousDest, OtherDest);
+
// Start building the compare and branch.
- Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
- Constant *NumCases =
- ConstantInt::get(Offset->getType(), ContiguousCases->size());
+ auto ContiguousCasesSize =
+ (ContiguousCasesMax->getValue() - ContiguousCasesMin->getValue())
+ .getZExtValue() +
+ 1;
+ Constant *Offset = ConstantExpr::getNeg(ContiguousCasesMin);
+ Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCasesSize);
Value *Sub = SI->getCondition();
if (!Offset->isNullValue())
@@ -5866,7 +5909,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
Value *Cmp;
// If NumCases overflowed, then all possible values jump to the successor.
- if (NumCases->isNullValue() && !ContiguousCases->empty())
+ if (NumCases->isNullValue() && ContiguousCasesSize != 0)
Cmp = ConstantInt::getTrue(SI->getContext());
else
Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
@@ -5895,14 +5938,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
// Prune obsolete incoming values off the successors' PHI nodes.
for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = ContiguousCases->size();
+ unsigned PreviousEdges = ContiguousCasesSize;
if (ContiguousDest == SI->getDefaultDest())
++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+ unsigned PreviousEdges = SI->getNumCases() - ContiguousCasesSize;
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
index 8f2ae2d054f1e..c6904fe273f09 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
@@ -188,4 +188,97 @@ exit:
ret void
}
+define i32 @wrapping_known_range(i8 range(i8 0, 6) %arg) {
+; CHECK-LABEL: @wrapping_known_range(
+; CHECK-NEXT: [[ARG_OFF:%.*]] = add i8 [[ARG:%.*]], -1
+; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[ARG_OFF]], 3
+; CHECK-NEXT: br i1 [[SWITCH]], label [[ELSE:%.*]], label [[IF:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[I0:%.*]], [[IF]] ], [ [[I1:%.*]], [[ELSE]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: if:
+; CHECK-NEXT: [[I0]] = call i32 @f(i32 0)
+; CHECK-NEXT: br label [[COMMON_RET:%.*]]
+; CHECK: else:
+; CHECK-NEXT: [[I1]] = call i32 @f(i32 1)
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+ switch i8 %arg, label %else [
+ i8 0, label %if
+ i8 4, label %if
+ i8 5, label %if
+ ]
+
+if:
+ %i0 = call i32 @f(i32 0)
+ ret i32 %i0
+
+else:
+ %i1 = call i32 @f(i32 1)
+ ret i32 %i1
+}
+
+define i32 @wrapping_range(i8 %arg) {
+; CHECK-LABEL: @wrapping_range(
+; CHECK-NEXT: [[ARG_OFF:%.*]] = add i8 [[ARG:%.*]], -1
+; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[ARG_OFF]], -4
+; CHECK-NEXT: br i1 [[SWITCH]], label [[ELSE:%.*]], label [[IF:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[I0:%.*]], [[IF]] ], [ [[I1:%.*]], [[ELSE]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: if:
+; CHECK-NEXT: [[I0]] = call i32 @f(i32 0)
+; CHECK-NEXT: br label [[COMMON_RET:%.*]]
+; CHECK: else:
+; CHECK-NEXT: [[I1]] = call i32 @f(i32 1)
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+ switch i8 %arg, label %else [
+ i8 0, label %if
+ i8 -3, label %if
+ i8 -2, label %if
+ i8 -1, label %if
+ ]
+
+if:
+ %i0 = call i32 @f(i32 0)
+ ret i32 %i0
+
+else:
+ %i1 = call i32 @f(i32 1)
+ ret i32 %i1
+}
+
+define i32 @no_continuous_wrapping_range(i8 %arg) {
+; CHECK-LABEL: @no_continuous_wrapping_range(
+; CHECK-NEXT: switch i8 [[ARG:%.*]], label [[ELSE:%.*]] [
+; CHECK-NEXT: i8 0, label [[IF:%.*]]
+; CHECK-NEXT: i8 -3, label [[IF]]
+; CHECK-NEXT: i8 -1, label [[IF]]
+; CHECK-NEXT: ]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[I0:%.*]], [[IF]] ], [ [[I1:%.*]], [[ELSE]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: if:
+; CHECK-NEXT: [[I0]] = call i32 @f(i32 0)
+; CHECK-NEXT: br label [[COMMON_RET:%.*]]
+; CHECK: else:
+; CHECK-NEXT: [[I1]] = call i32 @f(i32 1)
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+ switch i8 %arg, label %else [
+ i8 0, label %if
+ i8 -3, label %if
+ i8 -1, label %if
+ ]
+
+if:
+ %i0 = call i32 @f(i32 0)
+ ret i32 %i0
+
+else:
+ %i1 = call i32 @f(i32 1)
+ ret i32 %i1
+}
+
declare void @bar(ptr nonnull dereferenceable(4))
|
@zyw-bot csmith-quick-fuzz |
Maybe limit this to >2 cases to start to avoid regressions? |
bb9720c
to
9864e17
Compare
This is still needed, some 2 cases have not been transformed: https://llvm.godbolt.org/z/Y71EovY4M. I think it's fine because it runs later in the passes pipeline. |
I improved this with only one case also. |
// Max+1 is not equal to Min. It's not continuous in arithmetic. | ||
if (Max == CR.getUnsignedMax() && Min == CR.getUnsignedMin()) { | ||
assert(Cases.size() >= 2); | ||
auto *It = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about something as follows before the logic here?
// Find the first non-consecutive pair, and ensure this pair
// happens to be unique.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(I meant this before the std::adjacent_find, which is not appearing)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is what you want. std::adjacent_find
finds one non-consecutive pair and checks their distance to check if this pair is unique.
auto Min = Cases.back()->getValue(); | ||
auto Max = Cases.front()->getValue(); | ||
auto Offset = Max - Min; | ||
auto ContiguousOffset = Cases.size() - 1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need for auto here, having the type explicit (const APInt&, unsigned) would make the code more readable.
auto *OtherMax = *It; | ||
auto *OtherMin = *(It + 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
auto *OtherMax = *It; | |
auto *OtherMin = *(It + 1); | |
auto [OtherMax, OtherMin] = std::make_pair(*It, *std::next(It)); |
|
||
static bool casesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { | ||
static bool casesAreContiguous(Value *Condition, | ||
SmallVectorImpl<ConstantInt *> &Cases, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the function name would deserve an update, e.g., findContiguousCases
(and maybe return a std::optional struct instead of the three values as out parameters)?
if (auto Result = findContiguousCases(SI->getCondition(), CasesA, CasesB, | ||
DestA, DestB)) | ||
ContiguousCases = *Result; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (auto Result = findContiguousCases(SI->getCondition(), CasesA, CasesB, | |
DestA, DestB)) | |
ContiguousCases = *Result; | |
ContiguousCases = findContiguousCases(SI->getCondition(), CasesA, CasesB, | |
DestA, DestB); |
if (auto Result = findContiguousCases(SI->getCondition(), CasesB, CasesA, | ||
DestB, DestA)) | ||
ContiguousCases = *Result; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (auto Result = findContiguousCases(SI->getCondition(), CasesB, CasesA, | |
DestB, DestA)) | |
ContiguousCases = *Result; | |
ContiguousCases = findContiguousCases(SI->getCondition(), CasesB, CasesA, | |
DestB, DestA); |
|
||
// Start building the compare and branch. | ||
// Correctness: Cases to the default destination cannot be contiguous cases. | ||
if (!ContiguousCases && !HasDefault && !CasesA.empty()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!ContiguousCases && !HasDefault && !CasesA.empty()) | |
else if (!HasDefault) |
!HasDefault
implies !CasesA.empty()
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. TBH the cases are unnecessary to be contiguous. There can be holes between them (unreachable cases).
This patch implements the todo discussed in llvm#158495 (comment). It also fixes a regression introduced by llvm#161000. See also dtcxzyw/llvm-opt-benchmark#2890 (comment). IR diff: dtcxzyw/llvm-opt-benchmark#2892
I am seeing a "Broken module" error when building the Linux kernel with full LTO (it shows up for AArch64 /
I was able to find two
If there is any other information I can provide, I am more than happy to do so. |
Reproduced. I will provide a smaller IR reproducer. |
…is unreachable (llvm#162677) Fixes llvm#162585. llvm#161000 changed `br i1 true, label %if, label %else` to `br label %if`, so we should remove one more incoming value.
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) torvalds#618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
…is unreachable (llvm#162677) Fixes llvm#162585. llvm#161000 changed `br i1 true, label %if, label %else` to `br label %if`, so we should remove one more incoming value.
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
With latest llvm22, I hit the verif_scale_strobemeta selftest failure below: $ ./test_progs -n 618 libbpf: prog 'on_event': BPF program load failed: -E2BIG libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG -- BPF program is too large. Processed 1000001 insn verification time 7019091 usec stack depth 488 processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'on_event': failed to load: -E2BIG libbpf: failed to load object 'strobemeta.bpf.o' scale_test:FAIL:expect_success unexpected error: -7 (errno 7) #618 verif_scale_strobemeta:FAIL But if I increase the verificaiton insn limit from 1M to 10M, the above test_progs run actually will succeed. The below is the result from veristat: $ ./veristat strobemeta.bpf.o Processing 'strobemeta.bpf.o'... File Program Verdict Duration (us) Insns States Program size Jited size ---------------- -------- ------- ------------- ------- ------ ------------ ---------- strobemeta.bpf.o on_event success 90250893 9777685 358230 15954 80794 ---------------- -------- ------- ------------- ------- ------ ------------ ---------- Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs. Further debugging shows the llvm commit [1] is responsible for the verificaiton failure as it tries to convert certain switch statement to if-condition. Such change may cause different transformation compared to original switch statement. In bpf program strobemeta.c case, the initial llvm ir for read_int_var() function is define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) #2 !dbg !535 { %6 = alloca ptr, align 8 %7 = alloca i64, align 8 %8 = alloca ptr, align 8 %9 = alloca ptr, align 8 %10 = alloca ptr, align 8 %11 = alloca ptr, align 8 %12 = alloca i32, align 4 ... %20 = icmp ne ptr %19, null, !dbg !561 br i1 %20, label %22, label %21, !dbg !562 21: ; preds = %5 store i32 1, ptr %12, align 4 br label %48, !dbg !563 22: %23 = load ptr, ptr %9, align 8, !dbg !564 ... 47: ; preds = %38, %22 store i32 0, ptr %12, align 4, !dbg !588 br label %48, !dbg !588 48: ; preds = %47, %21 call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588 %49 = load i32, ptr %12, align 4 switch i32 %49, label %51 [ i32 0, label %50 i32 1, label %50 ] 50: ; preds = %48, %48 ret void, !dbg !589 51: ; preds = %48 unreachable } Note that the above 'switch' statement is added by clang frontend. Without [1], the switch statement will survive until SelectionDag, so the switch statement acts like a 'barrier' and prevents some transformation involved with both 'before' and 'after' the switch statement. But with [1], the switch statement will be removed during middle end optimization and later middle end passes (esp. after inlining) have more freedom to reorder the code. The following is the related source code: static void *calc_location(struct strobe_value_loc *loc, void *tls_base): bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; In read_int_var() func, we have: void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) return; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); ... The static func calc_location() is called inside read_int_var(). The asm code without [1]: 77: .123....89 (85) call bpf_probe_read_user#112 78: ........89 (79) r1 = *(u64 *)(r10 -368) 79: .1......89 (79) r2 = *(u64 *)(r10 -8) 80: .12.....89 (bf) r3 = r2 81: .123....89 (0f) r3 += r1 82: ..23....89 (07) r2 += 1 83: ..23....89 (79) r4 = *(u64 *)(r10 -464) 84: ..234...89 (a5) if r2 < 0x2 goto pc+13 85: ...34...89 (15) if r3 == 0x0 goto pc+12 86: ...3....89 (bf) r1 = r10 87: .1.3....89 (07) r1 += -400 88: .1.3....89 (b4) w2 = 16 In this case, 'r2 < 0x2' and 'r3 == 0x0' go to null 'locaiton' place, so the verifier actually prefers to do verification first at 'r1 = r10' etc. The asm code with [1]: 119: .123....89 (85) call bpf_probe_read_user#112 120: ........89 (79) r1 = *(u64 *)(r10 -368) 121: .1......89 (79) r2 = *(u64 *)(r10 -8) 122: .12.....89 (bf) r3 = r2 123: .123....89 (0f) r3 += r1 124: ..23....89 (07) r2 += -1 125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6 126: ........89 (05) goto pc+17 ... 144: ........89 (b4) w1 = 0 145: .1......89 (6b) *(u16 *)(r8 +80) = r1 In this case, if 'r2 < 0xfffffffe' is true, the control will go to non-null 'location' branch, so 'goto pc+17' will actually go to null 'location' branch. This seems causing tremendous amount of verificaiton state. To fix the issue, rewrite the following code return tls_ptr && tls_ptr != (void *)-1 ? tls_ptr + tls_index.offset : NULL; to if/then statement and hopefully these explicit if/then statements are sticky during middle-end optimizations. Test with llvm20 and llvm21 as well and all strobemeta related selftests are passed. [1] llvm/llvm-project#161000 Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Link: https://lore.kernel.org/r/20251014051639.1996331-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes #157113.
Take the following IR as an example; we know the destination of the
[1, 3]
cases is%else
.We can first try the non-wrapping range for both destinations, but I don't see how that would be any better.
Proof: https://alive2.llvm.org/ce/z/acdWD4.