From d2c6967f1d2d05cb39fbd0919c4909dbc0be8eb8 Mon Sep 17 00:00:00 2001 From: Karl Wiberg Date: Tue, 29 Sep 2020 13:55:13 +0200 Subject: [PATCH] Optimize RoboCaller::AddReceiver() for code size Essentially, instead of having the inlined UntypedFunction::Create(f) return an UntypedFunction which is then passed as an argument to non-inlined RoboCallerReceivers::AddReceiverImpl(), we let UntypedFunction::PrepareArgs(f) return a few different kinds of trivial structs (depending on what sort of type f has) which are passed as arguments to non-inlined RoboCallerReceivers::AddReceiver() (which then converts them to UntypedFunction by calling UntypedFunction::Create()). These structs are smaller than UntypedFunction and optimized for argument passing, so many fewer instructions are needed. Example code: struct Foo { void Receive(int, float, int, float); void TestAddLambdaReceiver(); webrtc::RoboCaller rc; }; void Foo::TestAddLambdaReceiver() { rc.AddReceiver([this](int a, float b, int c, float d){ Receive(a, b, c, d);}); } On arm32, we get before this CL: Foo::TestAddLambdaReceiver(): push {r11, lr} mov r11, sp sub sp, sp, #24 ldr r1, .LCPI0_0 mov r2, #0 stm sp, {r0, r2} add r1, pc, r1 str r2, [sp, #20] str r1, [sp, #16] mov r1, sp bl RoboCallerReceivers::AddReceiverImpl mov sp, r11 pop {r11, pc} .LCPI0_0: .long CallInlineStorage CallInlineStorage: ldr r0, [r0] b Foo::Receive(int, float, int, float) After this CL: Foo::TestAddLambdaReceiver(): ldr r3, .LCPI0_0 mov r2, r0 add r3, pc, r3 b RoboCallerReceivers::AddReceiver<1u> .LCPI0_0: .long CallInlineStorage CallInlineStorage: ldr r0, [r0] b Foo::Receive(int, float, int, float) (Symbol names abbreviated so that they'll fit on one line.) So a reduction from 64 to 28 bytes. The improvements on arm64 and x86_64 are similar. Bug: webrtc:11943 Change-Id: I93fbba083be0235051c3279d3e3f6852a4a9fdad Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/185960 Commit-Queue: Karl Wiberg Reviewed-by: Mirko Bonadei Cr-Commit-Position: refs/heads/master@{#32244} --- rtc_base/BUILD.gn | 1 + rtc_base/robo_caller.cc | 17 ++- rtc_base/robo_caller.h | 25 +++-- rtc_base/untyped_function.h | 199 ++++++++++++++++++++++++++---------- 4 files changed, 176 insertions(+), 66 deletions(-) diff --git a/rtc_base/BUILD.gn b/rtc_base/BUILD.gn index a09c06ed75e..489a5c6056d 100644 --- a/rtc_base/BUILD.gn +++ b/rtc_base/BUILD.gn @@ -57,6 +57,7 @@ rtc_source_set("robo_caller") { ":untyped_function", "../api:function_view", "system:assume", + "system:inline", ] } diff --git a/rtc_base/robo_caller.cc b/rtc_base/robo_caller.cc index a97687a76a3..a7c35cbfdbc 100644 --- a/rtc_base/robo_caller.cc +++ b/rtc_base/robo_caller.cc @@ -16,10 +16,6 @@ namespace robo_caller_impl { RoboCallerReceivers::RoboCallerReceivers() = default; RoboCallerReceivers::~RoboCallerReceivers() = default; -void RoboCallerReceivers::AddReceiverImpl(UntypedFunction* f) { - receivers_.push_back(std::move(*f)); -} - void RoboCallerReceivers::Foreach( rtc::FunctionView fv) { for (auto& r : receivers_) { @@ -27,5 +23,18 @@ void RoboCallerReceivers::Foreach( } } +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<1>); +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<2>); +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<3>); +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<4>); +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::NontrivialUntypedFunctionArgs); +template void RoboCallerReceivers::AddReceiver( + UntypedFunction::FunctionPointerUntypedFunctionArgs); + } // namespace robo_caller_impl } // namespace webrtc diff --git a/rtc_base/robo_caller.h b/rtc_base/robo_caller.h index 573e7b64c8f..9df6a48a5e5 100644 --- a/rtc_base/robo_caller.h +++ b/rtc_base/robo_caller.h @@ -16,6 +16,7 @@ #include "api/function_view.h" #include "rtc_base/system/assume.h" +#include "rtc_base/system/inline.h" #include "rtc_base/untyped_function.h" namespace webrtc { @@ -30,20 +31,30 @@ class RoboCallerReceivers { RoboCallerReceivers& operator=(RoboCallerReceivers&&) = delete; ~RoboCallerReceivers(); - void AddReceiver(UntypedFunction&& f) { - AddReceiverImpl(&f); - // Assume that f was moved from and is now trivially destructible. - // This helps the compiler optimize away the destructor call. - RTC_ASSUME(f.IsTriviallyDestructible()); + template + RTC_NO_INLINE void AddReceiver(UntypedFunctionArgsT args) { + receivers_.push_back(UntypedFunction::Create(args)); } void Foreach(rtc::FunctionView fv); private: - void AddReceiverImpl(UntypedFunction* f); std::vector receivers_; }; +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<1>); +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<2>); +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<3>); +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::TrivialUntypedFunctionArgs<4>); +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::NontrivialUntypedFunctionArgs); +extern template void RoboCallerReceivers::AddReceiver( + UntypedFunction::FunctionPointerUntypedFunctionArgs); + } // namespace robo_caller_impl // A collection of receivers (callable objects) that can be called all at once. @@ -71,7 +82,7 @@ class RoboCaller { template void AddReceiver(F&& f) { receivers_.AddReceiver( - UntypedFunction::Create(std::forward(f))); + UntypedFunction::PrepareArgs(std::forward(f))); } // Calls all receivers with the given arguments. diff --git a/rtc_base/untyped_function.h b/rtc_base/untyped_function.h index 9220d3f9e40..16c5ba0c8eb 100644 --- a/rtc_base/untyped_function.h +++ b/rtc_base/untyped_function.h @@ -11,6 +11,8 @@ #ifndef RTC_BASE_UNTYPED_FUNCTION_H_ #define RTC_BASE_UNTYPED_FUNCTION_H_ +#include +#include #include #include #include @@ -25,9 +27,17 @@ using FunVoid = void(); union VoidUnion { void* void_ptr; FunVoid* fun_ptr; - typename std::aligned_storage<16>::type inline_storage; + typename std::aligned_storage<4 * sizeof(uintptr_t)>::type inline_storage; }; +// Returns the number of elements of the `inline_storage` array required to +// store an object of type T. +template +constexpr size_t InlineStorageSize() { + // sizeof(T) / sizeof(uintptr_t), but rounded up. + return (sizeof(T) + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); +} + template struct CallHelpers; template @@ -64,76 +74,155 @@ struct CallHelpers { // size. class UntypedFunction final { public: - // Create function for lambdas and other callables; it accepts every type of - // argument except those noted in its enable_if call. + // The *UntypedFunctionArgs structs are used to transfer arguments from + // PrepareArgs() to Create(). They are trivial, but may own heap allocations, + // so make sure to pass them to Create() exactly once! + // + // The point of doing Create(PrepareArgs(foo)) instead of just Create(foo) is + // to separate the code that has to be inlined (PrepareArgs) from the code + // that can be noninlined (Create); the *UntypedFunctionArgs types are + // designed to efficiently carry the required information from one to the + // other. + template + struct TrivialUntypedFunctionArgs { + // We use an uintptr_t array here instead of std::aligned_storage, because + // the former can be efficiently passed in registers when using + // TrivialUntypedFunctionArgs as a function argument. (We can't do the same + // in VoidUnion, because std::aligned_storage but not uintptr_t can be + // legally reinterpret_casted to arbitrary types. + // TrivialUntypedFunctionArgs, on the other hand, only needs to handle + // placement new and memcpy.) + alignas(std::max_align_t) uintptr_t inline_storage[N]; + webrtc_function_impl::FunVoid* call; + }; + struct NontrivialUntypedFunctionArgs { + void* void_ptr; + webrtc_function_impl::FunVoid* call; + void (*del)(webrtc_function_impl::VoidUnion*); + }; + struct FunctionPointerUntypedFunctionArgs { + webrtc_function_impl::FunVoid* fun_ptr; + webrtc_function_impl::FunVoid* call; + }; + + // Create function for lambdas and other callables that are trivial and small; + // it accepts every type of argument except those noted in its enable_if call. template < typename Signature, typename F, + typename F_deref = typename std::remove_reference::type, typename std::enable_if< // Not for function pointers; we have another overload for that below. - !std::is_function::type>::type>::value && + !std::is_function< + typename std::remove_pointer::type>::value && - // Not for nullptr; we have another overload for that below. + // Not for nullptr; we have a constructor for that below. !std::is_same::type>::value && - // Not for UntypedFunction objects; we have another overload for that. + // Not for UntypedFunction objects; use move construction or + // assignment. !std::is_same::type>::type>::value>::type* = nullptr> - static UntypedFunction Create(F&& f) { - using F_deref = typename std::remove_reference::type; - // TODO(C++17): Use `constexpr if` here. The compiler appears to do the - // right thing anyway w.r.t. resolving the branch statically and - // eliminating dead code, but it would be good for readability. - if (std::is_trivially_move_constructible::value && - std::is_trivially_destructible::value && - sizeof(F_deref) <= - sizeof(webrtc_function_impl::VoidUnion::inline_storage)) { - // The callable is trivial and small enough, so we just store its bytes - // in the inline storage. - webrtc_function_impl::VoidUnion vu; - new (&vu.inline_storage) F_deref(std::forward(f)); - return UntypedFunction( - vu, - reinterpret_cast( - webrtc_function_impl::CallHelpers< - Signature>::template CallInlineStorage), - nullptr); - } else { - // The callable is either nontrivial or too large, so we can't keep it - // in the inline storage; use the heap instead. - webrtc_function_impl::VoidUnion vu; - vu.void_ptr = new F_deref(std::forward(f)); - return UntypedFunction( - vu, - reinterpret_cast( - webrtc_function_impl::CallHelpers< - Signature>::template CallVoidPtr), - static_cast( - [](webrtc_function_impl::VoidUnion* vu) { - // Assuming that this pointer isn't null allows the - // compiler to eliminate a null check in the (inlined) - // delete operation. - RTC_ASSUME(vu->void_ptr != nullptr); - delete reinterpret_cast(vu->void_ptr); - })); - } + typename std::remove_cv::type>::value && + + // Only for trivial callables that will fit in + // VoidUnion::inline_storage. + std::is_trivially_move_constructible::value && + std::is_trivially_destructible::value && + sizeof(F_deref) <= + sizeof(webrtc_function_impl::VoidUnion::inline_storage)>::type* = + nullptr, + size_t InlineSize = webrtc_function_impl::InlineStorageSize()> + static TrivialUntypedFunctionArgs PrepareArgs(F&& f) { + // The callable is trivial and small enough, so we just store its bytes + // in the inline storage. + TrivialUntypedFunctionArgs args; + new (&args.inline_storage) F_deref(std::forward(f)); + args.call = reinterpret_cast( + webrtc_function_impl::CallHelpers< + Signature>::template CallInlineStorage); + return args; + } + template + static UntypedFunction Create(TrivialUntypedFunctionArgs args) { + webrtc_function_impl::VoidUnion vu; + std::memcpy(&vu.inline_storage, args.inline_storage, + sizeof(args.inline_storage)); + return UntypedFunction(vu, args.call, nullptr); + } + + // Create function for lambdas and other callables that are nontrivial or + // large; it accepts every type of argument except those noted in its + // enable_if call. + template < + typename Signature, + typename F, + typename F_deref = typename std::remove_reference::type, + typename std::enable_if< + // Not for function pointers; we have another overload for that below. + !std::is_function< + typename std::remove_pointer::type>::value && + + // Not for nullptr; we have a constructor for that below. + !std::is_same::type>::value && + + // Not for UntypedFunction objects; use move construction or + // assignment. + !std::is_same::type>::value && + + // Only for nontrivial callables, or callables that won't fit in + // VoidUnion::inline_storage. + !(std::is_trivially_move_constructible::value && + std::is_trivially_destructible::value && + sizeof(F_deref) <= sizeof(webrtc_function_impl::VoidUnion:: + inline_storage))>::type* = nullptr> + static NontrivialUntypedFunctionArgs PrepareArgs(F&& f) { + // The callable is either nontrivial or too large, so we can't keep it + // in the inline storage; use the heap instead. + NontrivialUntypedFunctionArgs args; + args.void_ptr = new F_deref(std::forward(f)); + args.call = reinterpret_cast( + webrtc_function_impl::CallHelpers::template CallVoidPtr< + F_deref>); + args.del = static_cast( + [](webrtc_function_impl::VoidUnion* vu) { + // Assuming that this pointer isn't null allows the + // compiler to eliminate a null check in the (inlined) + // delete operation. + RTC_ASSUME(vu->void_ptr != nullptr); + delete reinterpret_cast(vu->void_ptr); + }); + return args; + } + static UntypedFunction Create(NontrivialUntypedFunctionArgs args) { + webrtc_function_impl::VoidUnion vu; + vu.void_ptr = args.void_ptr; + return UntypedFunction(vu, args.call, args.del); } // Create function that accepts function pointers. If the argument is null, // the result is an empty UntypedFunction. template - static UntypedFunction Create(Signature* f) { + static FunctionPointerUntypedFunctionArgs PrepareArgs(Signature* f) { + FunctionPointerUntypedFunctionArgs args; + args.fun_ptr = reinterpret_cast(f); + args.call = reinterpret_cast( + webrtc_function_impl::CallHelpers::CallFunPtr); + return args; + } + static UntypedFunction Create(FunctionPointerUntypedFunctionArgs args) { webrtc_function_impl::VoidUnion vu; - vu.fun_ptr = reinterpret_cast(f); - return UntypedFunction( - vu, - f ? reinterpret_cast( - webrtc_function_impl::CallHelpers::CallFunPtr) - : nullptr, - nullptr); + vu.fun_ptr = args.fun_ptr; + return UntypedFunction(vu, args.fun_ptr == nullptr ? nullptr : args.call, + nullptr); + } + + // Prepares arguments and creates an UntypedFunction in one go. + template + static UntypedFunction Create(F&& f) { + return Create(PrepareArgs(std::forward(f))); } // Default constructor. Creates an empty UntypedFunction.