// Overview / Examples / API / FAQ
Performance
- branch is relatively stable through its life cycle
and/or
- branch is expensive to compute / require memory access
and/or
- branch is hard to learn by the branch predictor
Examples: logging, tracing, dispatching, fast path, devirtualization, ...
- Single header (https://raw.githubusercontent.com/qlibs/jmp/main/jmp - for integration see FAQ)
- Minimal API
- Verifies itself upon include (can be disabled with
-DNTEST
- see FAQ)
- C++20 (clang++10+, g++10+) / x86-64 / Linux
static_branch<bool>
(https://godbolt.org/z/v8W3Pzbxd)
/**
* constexpr minimal overhead static branch changed at run-time via code patching
*/
constexpr jmp::static_branch<bool> static_bool = false;
/**
* Note: `fn` can be inline/noinline/constexpr/etc.
*/
void fn() {
if (static_bool) { // Note: [[likely]], [[unlikely]] has no impact
std::puts("taken");
} else {
std::puts("not taken");
}
}
int main() {
if (not jmp::init()) { // enables run-time code patching
return errno;
}
fn(); // not taken
static_bool = true;
fn(); // taken
}
main: // $CXX -O3
# init
xor eax, eax # return 0
# fn(); // not taken
nop
.Ltmp0:
mov edi, OFFSET FLAT:.LC0
jmp puts
ret
.Ltmp1:
mov edi, OFFSET FLAT:.LC1
jmp puts
ret
# static_bool = true;
call static_bool.operator=(true)
# fn(); // taken
jmp .Ltmp1 // code patching (nop->jmp .Ltmp1)
.Ltmp0:
mov edi, OFFSET FLAT:.LC0
jmp puts
ret
.Ltmp1:
mov edi, OFFSET FLAT:.LC1
jmp puts
ret
.LC0: .asciz "not taken"
.LC1: .asciz "taken"
static_branch<bool> vs bool
(https://godbolt.org/z/jvKGdPMWK)
constexpr jmp::static_branch<bool> static_bool = false;
void fn() {
if (static_bool) {
throw;
} else {
std::puts("else");
}
}
fn():
// static_bool = false;
[1] [2] [3] [4] [5] [6] Instructions:
1 0 0.25 nop
1 1 0.33 lea rdi, [rip + .L.str]
1 1 0.50 jmp puts
1 1 1.00 * .LBB0_1: push rax
1 1 0.50 call __cxa_rethrow@PLT
// static_bool = true;
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.50 jmp .LBB0_1
1 1 0.50 lea rdi, [rip + .L.str]
1 1 0.50 jmp puts
3 2 1.00 * .LBB0_1: push rax
4 3 1.00 call __cxa_rethrow@PLT
[1]: #uOps [2]: Latency [3]: RThroughput
[4]: MayLoad [5]: MayStore [6]: HasSideEffects (U)
bool dynamic_bool = false;
void fn() {
if (dynamic_bool) {
throw;
} else {
std::puts("else");
}
}
fn():
// dynamic_bool = false;
[1] [2] [3] [4] [5] [6] Instructions:
2 5 0.50 * cmp byte ptr [rip + dynamic_bool], 1
1 1 0.25 je .LBB0_1
1 1 0.25 lea rdi, [rip + .L.str]
1 1 0.25 jmp puts
1 1 0.50 * .LBB0_1: push rax
1 1 0.25 call __cxa_rethrow@PLT
// dynamic_bool = true;
[1] [2] [3] [4] [5] [6] Instructions:
2 5 0.50 * cmp byte ptr [rip + dynamic_bool], 1
1 1 0.25 je .LBB0_1
1 1 0.25 lea rdi, [rip + .L.str]
1 1 0.25 jmp puts
1 1 0.50 * .LBB0_1: push rax
1 1 0.25 call __cxa_rethrow@PLT
[1]: #uOps [2]: Latency [3]: RThroughput
[4]: MayLoad [5]: MayStore [6]: HasSideEffects (U)
static_branch<T, T Min, T Max>
(https://godbolt.org/z/Tz4ox7ncv)
constexpr jmp::static_branch<int, 0, 2> static_int = 0; // range: <0, 2>
void fn() {
switch (static_int) {
default: std::unreachable();
case 0: std::puts("0"); return;
case 1: std::puts("1"); return;
case 2: std::puts("2"); return;
}
}
int main() {
if (not jmp::init()) { // enables run-time code patching
return errno;
}
fn(); // 0
static_int = 1;
fn(); // 1
static_int = 2;
fn(); // 2
}
fn: // $CXX -O3 -fno-inline
nop # code patching (nop->jmp .Ltmp1|.Ltmp2)
.Ltmp0:
mov edi, OFFSET FLAT:.LC0
jmp puts
ret
.Ltmp1:
mov edi, OFFSET FLAT:.LC1
jmp puts
ret
.Ltmp2:
mov edi, OFFSET FLAT:.LC2
jmp puts
ret
main:
// ... init
fn() // 0
call static_int.operator=(1)
fn() // 1
call static_int.operator=(2)
fn() // 2
.LC0: .asciz "0"
.LC1: .asciz "1"
.LC2: .asciz "2"
variant
(https://godbolt.org/z/TKPdYPv3P) | (https://wg21.link/P2996)
template<class... Ts>
class variant {
static constexpr jmp::static_branch<std::size_t, 0, sizeof...(Ts)> index_ = 0u;
public:
constexpr variant() = default;
template<class T> requires (not std::is_base_of_v<variant, std::remove_cvref_t<T>>)
constexpr explicit(false) variant(T&& t) {
constexpr auto index = [] {
std::array match{std::is_same_v<Ts, std::remove_cvref_t<T>>...};
return std::ranges::find(match, true) - match.begin();
}();
index_ = index;
std::construct_at(&storage_.[:
nonstatic_data_members_of(^storage)[index + 1u]
:], std::forward<T>(t));
}
constexpr ~variant()
requires (std::is_trivially_destructible_v<Ts> and ...) = default;
template<class Fn>
constexpr auto visit(Fn&& fn) const -> decltype(auto) {
return [&]<auto I = 0u>(this auto&& self) {
if constexpr (I == sizeof...(Ts)) {
std::unreachable();
} else {
switch (index_) {
default: return self.template operator()<I + 1u>();
case I: return std::invoke(std::forward<Fn>(fn), storage_.[:
nonstatic_data_members_of(^storage)[I + 1u]
:]);
}
}
}();
}
private:
union storage;
struct empty{ };
static_assert(is_type(define_class(^storage, {
std::meta::data_member_spec(^empty, {.name = "empty"}),
std::meta::data_member_spec(^Ts)...
})));
storage storage_{.empty={}};
};
void usage(const variant<bool, int, float>& v) {
v.visit(overload{
[](bool) { std::puts("bool"); },
[](int) { std::puts("int"); },
[](float) { std::puts("float"); },
});
}
int main() {
if (not jmp::init()) { // enables run-time code patching
return errno;
}
variant<bool, int, float> v{};
v = true;
usage(v);
v = 42;
usage(v);
v = 42.f;
usage(v);
}
usage(variant<bool, int, float> const&):
nop # code patching (nop->jmp .Ltmp1|.Ltmp2)
.Ltmp0:
mov edi, OFFSET FLAT:.LC0
jmp puts
ret
.Ltmp1:
mov edi, OFFSET FLAT:.LC1
jmp puts
ret
.Ltmp2:
mov edi, OFFSET FLAT:.LC2
jmp puts
ret
.LC0: .asciz "bool"
.LC1: .asciz "int"
.LC2: .asciz "float"
Dispatching techniques (https://godbolt.org/z/cfKP9E8W9)
auto f1() -> int { return 42; }
auto f2() -> int { return 77; }
auto f3() -> int { return 99; }
auto if_else(bool b) -> int { # if_else(bool):
if (b) { # testl %edi, %edi
return f1(); # movl $42, %ecx
} else { # movl $77, %eax
return f2(); # cmovnel %ecx, %eax # cmove or cmp
} # retq
} #
auto if_else_likely(bool b) -> int { # if_else_likely(bool):
if (b) [[likely]] { # movl $42, %eax # likely
return f1(); # testl %edi, %edi
} else { # je .LBB3_1
return f2(); # retq
} # .LBB3_1:
} # movl $77, %eax
# retq
auto ternary_op(bool b) -> int { # ternary_op(bool):
return b ? f1() : f2(); # testl %edi, %edi
} # movl $42, %ecx
# movl $77, %eax
# cmovnel %ecx, %eax # often cmove
# retq
auto jump_table(bool b) -> int { # jump_table(bool):
static constexpr int (*dispatch[])(){ # movl %edi, %eax
&f1, &f2 # leaq dispatch(%rip), %rcx
}; # jmpq *(%rcx,%rax,8) # or call
return dispatch[b](); #
} # dispatch:
# .quad f1()
# .quad f2()
auto jump_table_musttail(bool b) -> int { # jump_table_musttail(bool):
static constexpr int (*dispatch[])(bool){ # movl %edi, %eax
[](bool) { return f1(); }, # leaq dispatch(%rip), %rcx
[](bool) { return f2(); }, # jmpq *(%rcx,%rax,8) # always jmp
}; #
[[clang::musttail]] return dispatch[b](b); # dispatch:
} # .quad f1::__invoke(bool)
# .quad f2::__invoke(bool)
auto computed_goto(bool b) -> int { # computed_goto(bool):
static constexpr void* labels[]{&&L1, &&L2}; # movl %edi, %eax
goto *labels[b]; # leaq labels(%rip), %rcx
L1: return f1(); # jmpq *(%rcx,%rax,8)
L2: return f2(); # .Ltmp15:
} # movl $42, %eax
# retq
# .Ltmp17:
# movl $77, %eax
# retq
#
# labels:
# .quad .Ltmp15
# .quad .Ltmp17
jmp::static_branch<bool> branch = false; # jmp():
auto jmp() -> int { # nop|jmp .Ltmp1 # code patching
if (branch) { # .Ltmp0:
return f1(); # movl $42, %eax
} else { # retq
return f2(); # .Ltmp1:
} # movl $77, %eax
} # retq
auto if_else(int i) -> int { # if_else(int):
[[assume(i >= 0 and i <= 2)]]; # cmpl $1, %edi
if (i == 0) { # movl $77, %eax
return f1(); # movl $99, %ecx
} else if (i == 1) { # cmovel %eax, %ecx
return f2(); # testl %edi, %edi
} else if (i == 2) { # movl $42, %eax
return f3(); # cmovnel %ecx, %eax
} else { # retq
std::unreachable();
}
}
auto switch_case(int i) -> int { # switch_case(int):
[[assume(i >= 0 and i <= 2)]]; # movl %edi, %eax
switch (i) { # leaq .Lswitch.table(%rip), %rcx
default: std::unreachable(); # movl (%rcx,%rax,4), %eax
case 0: return f1(); # retq
case 1: return f2(); # .Lswitch.table(int):
case 2: return f3(); # .long 42
} # .long 77
} # .long 99
auto jump_table(int i) -> int { # jump_table(int):
[[assume(i >= 0 and i <= 2)]]; # movl %edi, %eax
static constexpr int (*dispatch[])(int){ # leaq dispatch(%rip), %rcx
[](int) { return f1(); }, # jmpq *(%rcx,%rax,8) # always jmp
[](int) { return f2(); }, # dispatch:
[](int) { return f3(); }, # .quad f1()
}; # .quad f2()
[[clang::musttail]] return dispatch[i](i); # .quad f3()
}
auto computed_goto(int i) -> int { # computed_goto(int):
[[assume(i >= 0 and i <= 2)]]; # movl %edi, %eax
static constexpr void* labels[]{ # leaq labels(%rip), %rcx
&&L1, &&L2, &&L3 # jmpq *(%rcx,%rax,8)
}; # .Ltmp35:
goto *labels[i]; # movl $42, %eax
L1: return f1(); # retq
L2: return f2(); # .Ltmp37:
L3: return f3(); # movl $99, %eax
} # retq
# .Ltmp39:
# movl $77, %eax
# retq
#
# labels:
# .quad .Ltmp35
# .quad .Ltmp37
# .quad .Ltmp39
jmp::static_branch<int, 0, 2> branch = 0; # jmp():
auto jmp() -> int { # jmp .LBB21_0|.LBB21_1|.LBB21_2
switch (branch) { # .LBB21_0:
default: std::unreachable(); # movl $42, %eax
case 0: return f1(); # retq
case 1: return f2(); # .LBB21_1:
case 2: return f3(); # movl $99, %eax
} # retq
} # .LBB21_2:
# movl $77, %eax
# retq
[[gnu::always_inline]] inline void fast_path() { std::puts("fast_path"); }
[[gnu::cold]] void slow_path() { std::puts("slow_path"); }
constexpr jmp::static_branch<bool> disarmed = false;
void trigger() {
if (not disarmed) { // { false: nop, true: jmp }
fast_path();
} else {
slow_path();
}
}
trigger(): // $CXX -O3
nop # code patching (nop->jmp .Ltmp1)
.Ltmp0: # fast path (inlined)
mov edi, OFFSET FLAT:.LC1
jmp puts
.Ltmp1: # slow path (cold)
jmp slow_path() # [clone .cold]
/**
* Minimal overhead (via code patching) static branch
*/
template<class T, T...> struct static_branch;
template<> struct static_branch<bool> final {
/**
* static_assert(sizeof(static_branch<bool>) == 1u)
* @param value initial branch value (false)
*/
constexpr explicit(false) static_branch(const bool value) noexcept;
constexpr static_branch(const static_branch&) noexcept = delete;
constexpr static_branch(static_branch&&) noexcept = delete;
constexpr static_branch& operator=(const static_branch&) noexcept = delete;
constexpr static_branch& operator=(static_branch&&) noexcept = delete;
/**
* Updates branch value
* @param value new branch value
*/
constexpr const auto& operator=(const bool value) const noexcept;
[[gnu::always_inline]] [[nodiscard]]
inline explicit(false) operator bool() const noexcept;
};
template<class T, T Min, T Max>
requires requires(T t) { reinterpret_cast<T>(t); } and
(Max - Min >= 2 and Max - Min <= 7)
struct static_branch<T, Min, Max> final {
/**
* static_assert(sizeof(static_branch<bool>) == 1u)
* @param value initial branch value (false)
*/
constexpr explicit(false) static_branch(const T value) noexcept;
constexpr static_branch(const static_branch&) noexcept = delete;
constexpr static_branch(static_branch&&) noexcept = delete;
constexpr static_branch& operator=(const static_branch&) noexcept = delete;
constexpr static_branch& operator=(static_branch&&) noexcept = delete;
/**
* Updates branch value
* @param value new branch value
*/
constexpr const auto& operator=(const T value) const noexcept;
[[gnu::always_inline]] [[nodiscard]]
inline explicit(false) operator T() const noexcept;
};
-
How does it work?
jmp
is using technique called code patching - which basically means that the code modifies itself.jmp::static_branch
is based on https://docs.kernel.org/staging/static-keys.html and it requiresasm goto
support (gcc, clang).jmp
currently supports x86-64 Linux, but other platforms can be added using the same technique.Walkthrough
constexpr jmp::static_branch<bool> b = false; if (b) { return 42; } else { return 0; }
Will emit...
main: .byte 15 31 68 0 0 # nop - https://www.felixcloutier.com/x86/nop .LBB0: xor eax, eax # return 0 ret .LBB1: mov eax, 42 # return 42 ret
Will effectively execute...
main: nop xor eax, eax # return 0 ret
If the branch value will be changed (at run-time)...
b = true; if (b) { return 42; } else { return 0; }
Will emit...
main: call b.operator=(true); # nop->jmp or jmp->nop jmp .LBB1: (nop->jmp - changed in the memory of the program) .LBB0: xor eax, eax # return 0 ret .LBB1: mov eax, 42 # return 42 ret
-
What platforms are supported?
Only x86_64 is currently supported but the technique is compatible with other platforms as proven by https://docs.kernel.org/staging/static-keys.html.
-
What is the cost of switching the branch?
Cost = number of inlined versions * memcpy (
5 bytes
forstatic_branch<bool>
or4 bytes
forstatic_branch<T>
). In case of[[gnu::noinline]]
the cost is a single memcpy otherwise it will be a loop over all inlined versions. -
How to integrate with CMake.FetchContent?
include(FetchContent) FetchContent_Declare( qlibs.jmp GIT_REPOSITORY https://github.com/qlibs/jmp GIT_TAG v5.0.1 ) FetchContent_MakeAvailable(qlibs.jmp)
target_link_libraries(${PROJECT_NAME} PUBLIC qlibs.jmp);
-
Acknowledgments
https://docs.kernel.org/staging/static-keys.html, https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, https://www.agner.org/optimize/instruction_tables.pdf, https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html, https://www.felixcloutier.com/documents/gcc-asm.html, https://www.felixcloutier.com/x86, https://uops.info/table.html, https://arxiv.org/abs/2308.14185, https://arxiv.org/abs/2011.13127