-
-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Merged by Bors] - Extract monomorphic get_insert_bundle_info function #1910
Conversation
38db0f0
to
31f8d87
Compare
Out of curiosity, which tools do you use to find which functions contribute the most to binary size? |
Nothing fancy. Just https://github.com/m4b/bingrep to look at which symbols are defined in the executable. It also lists the function size.
|
I'm also on a pretty noisy / unreliable laptop right now, so I'll hold off on merging this until I can bench it on my tower (next week). In theory I'm sold though! |
Just ran benchmarks on my tower and there were no significant perf changes here. Let's merge this! |
bors r+ |
This shrinks breakout from 316k to 310k when using `--feature dynamic`. I haven't run the ecs benchmark to test performance as my laptop is too noisy for reliable benchmarking.
Pull request successfully merged into main. Build succeeded: |
Based on #1910 This shrinks breakout from 310k to 293k. Most of the win is in outlining the drop glue of `App`. The other two commits save about 800 bytes total when using two empty systems and two simple resources. After this PR the full disassembly for ```rust fn main() { App::build().run(); } ``` is about as minimal as it gets, so pretty much all other costs scale linear in the amount of resources, systems, etc. ```asm 0000000000001100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE>: 1100: ff 25 52 21 00 00 jmpq *0x2152(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 1106: cc int3 1107: cc int3 1108: cc int3 1109: cc int3 110a: cc int3 110b: cc int3 110c: cc int3 110d: cc int3 110e: cc int3 110f: cc int3 0000000000001110 <_ZN8breakout4main17h7cbe07b319de1042E>: 1110: 53 push %rbx 1111: 48 81 ec 00 03 00 00 sub $0x300,%rsp 1118: 48 8d 5c 24 08 lea 0x8(%rsp),%rbx 111d: 48 89 df mov %rbx,%rdi 1120: ff 15 3a 21 00 00 callq *0x213a(%rip) # 3260 <_ZN8bevy_app3app3App5build17h8b0ea6be9050d6ccE@Base> 1126: 48 89 df mov %rbx,%rdi 1129: ff 15 39 21 00 00 callq *0x2139(%rip) # 3268 <_ZN8bevy_app11app_builder10AppBuilder3run17hfc8cf50692acdbdeE@Base> 112f: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 1134: ff 15 1e 21 00 00 callq *0x211e(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 113a: 48 81 c4 00 03 00 00 add $0x300,%rsp 1141: 5b pop %rbx 1142: c3 retq 1143: 48 89 c3 mov %rax,%rbx 1146: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 114b: e8 b0 ff ff ff callq 1100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE> 1150: 48 89 df mov %rbx,%rdi 1153: e8 18 01 00 00 callq 1270 <_Unwind_Resume@plt> 1158: 0f 0b ud2 115a: cc int3 115b: cc int3 115c: cc int3 115d: cc int3 115e: cc int3 115f: cc int3 0000000000001160 <main>: 1160: 48 83 ec 08 sub $0x8,%rsp 1164: 48 89 f1 mov %rsi,%rcx 1167: 48 
63 d7 movslq %edi,%rdx 116a: 48 8d 05 9f ff ff ff lea -0x61(%rip),%rax # 1110 <_ZN8breakout4main17h7cbe07b319de1042E> 1171: 48 89 04 24 mov %rax,(%rsp) 1175: 48 8d 35 94 1e 00 00 lea 0x1e94(%rip),%rsi # 3010 <__init_array_end> 117c: 48 89 e7 mov %rsp,%rdi 117f: ff 15 eb 20 00 00 callq *0x20eb(%rip) # 3270 <_ZN3std2rt19lang_start_internal17he77194431b0ee4a2E@Base> 1185: 59 pop %rcx 1186: c3 retq 1187: cc int3 1188: cc int3 1189: cc int3 118a: cc int3 118b: cc int3 118c: cc int3 118d: cc int3 118e: cc int3 118f: cc int3 0000000000001190 <_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h83a5b8d55f23dff8E.llvm.909376793398482062>: 1190: 48 83 ec 08 sub $0x8,%rsp 1194: 48 8b 3f mov (%rdi),%rdi 1197: e8 54 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 119c: 31 c0 xor %eax,%eax 119e: 59 pop %rcx 119f: c3 retq 00000000000011a0 <_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hb05d591cd29dea4fE.llvm.909376793398482062>: 11a0: 48 83 ec 08 sub $0x8,%rsp 11a4: 48 8b 3f mov (%rdi),%rdi 11a7: e8 44 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 11ac: 31 c0 xor %eax,%eax 11ae: 59 pop %rcx 11af: c3 retq 00000000000011b0 <_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17he9aeeba375093b99E.llvm.909376793398482062>: 11b0: c3 retq 11b1: cc int3 11b2: cc int3 11b3: cc int3 11b4: cc int3 11b5: cc int3 11b6: cc int3 11b7: cc int3 11b8: cc int3 11b9: cc int3 11ba: cc int3 11bb: cc int3 11bc: cc int3 11bd: cc int3 11be: cc int3 11bf: cc int3 ```
Based on bevyengine#1910 This shrinks breakout from 310k to 293k. Most of the win is in outlining the drop glue of `App`. The other two commits save about 800 bytes total when using two empty systems and two simple resources. After this PR the full disassembly for ```rust fn main() { App::build().run(); } ``` is about as minimal as it gets, so pretty much all other costs scale linear in the amount of resources, systems, etc. ```asm 0000000000001100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE>: 1100: ff 25 52 21 00 00 jmpq *0x2152(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 1106: cc int3 1107: cc int3 1108: cc int3 1109: cc int3 110a: cc int3 110b: cc int3 110c: cc int3 110d: cc int3 110e: cc int3 110f: cc int3 0000000000001110 <_ZN8breakout4main17h7cbe07b319de1042E>: 1110: 53 push %rbx 1111: 48 81 ec 00 03 00 00 sub $0x300,%rsp 1118: 48 8d 5c 24 08 lea 0x8(%rsp),%rbx 111d: 48 89 df mov %rbx,%rdi 1120: ff 15 3a 21 00 00 callq *0x213a(%rip) # 3260 <_ZN8bevy_app3app3App5build17h8b0ea6be9050d6ccE@Base> 1126: 48 89 df mov %rbx,%rdi 1129: ff 15 39 21 00 00 callq *0x2139(%rip) # 3268 <_ZN8bevy_app11app_builder10AppBuilder3run17hfc8cf50692acdbdeE@Base> 112f: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 1134: ff 15 1e 21 00 00 callq *0x211e(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 113a: 48 81 c4 00 03 00 00 add $0x300,%rsp 1141: 5b pop %rbx 1142: c3 retq 1143: 48 89 c3 mov %rax,%rbx 1146: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 114b: e8 b0 ff ff ff callq 1100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE> 1150: 48 89 df mov %rbx,%rdi 1153: e8 18 01 00 00 callq 1270 <_Unwind_Resume@plt> 1158: 0f 0b ud2 115a: cc int3 115b: cc int3 115c: cc int3 115d: cc int3 115e: cc int3 115f: cc int3 0000000000001160 <main>: 1160: 48 83 ec 08 sub $0x8,%rsp 1164: 48 89 f1 mov %rsi,%rcx 
1167: 48 63 d7 movslq %edi,%rdx 116a: 48 8d 05 9f ff ff ff lea -0x61(%rip),%rax # 1110 <_ZN8breakout4main17h7cbe07b319de1042E> 1171: 48 89 04 24 mov %rax,(%rsp) 1175: 48 8d 35 94 1e 00 00 lea 0x1e94(%rip),%rsi # 3010 <__init_array_end> 117c: 48 89 e7 mov %rsp,%rdi 117f: ff 15 eb 20 00 00 callq *0x20eb(%rip) # 3270 <_ZN3std2rt19lang_start_internal17he77194431b0ee4a2E@Base> 1185: 59 pop %rcx 1186: c3 retq 1187: cc int3 1188: cc int3 1189: cc int3 118a: cc int3 118b: cc int3 118c: cc int3 118d: cc int3 118e: cc int3 118f: cc int3 0000000000001190 <_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h83a5b8d55f23dff8E.llvm.909376793398482062>: 1190: 48 83 ec 08 sub $0x8,%rsp 1194: 48 8b 3f mov (%rdi),%rdi 1197: e8 54 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 119c: 31 c0 xor %eax,%eax 119e: 59 pop %rcx 119f: c3 retq 00000000000011a0 <_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hb05d591cd29dea4fE.llvm.909376793398482062>: 11a0: 48 83 ec 08 sub $0x8,%rsp 11a4: 48 8b 3f mov (%rdi),%rdi 11a7: e8 44 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 11ac: 31 c0 xor %eax,%eax 11ae: 59 pop %rcx 11af: c3 retq 00000000000011b0 <_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17he9aeeba375093b99E.llvm.909376793398482062>: 11b0: c3 retq 11b1: cc int3 11b2: cc int3 11b3: cc int3 11b4: cc int3 11b5: cc int3 11b6: cc int3 11b7: cc int3 11b8: cc int3 11b9: cc int3 11ba: cc int3 11bb: cc int3 11bc: cc int3 11bd: cc int3 11be: cc int3 11bf: cc int3 ```
This shrinks breakout from 316k to 310k when using `--feature dynamic`. I haven't run the ecs benchmark to test performance as my laptop is too noisy for reliable benchmarking.
Based on bevyengine#1910 This shrinks breakout from 310k to 293k. Most of the win is in outlining the drop glue of `App`. The other two commits save about 800 bytes total when using two empty systems and two simple resources. After this PR the full disassembly for ```rust fn main() { App::build().run(); } ``` is about as minimal as it gets, so pretty much all other costs scale linear in the amount of resources, systems, etc. ```asm 0000000000001100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE>: 1100: ff 25 52 21 00 00 jmpq *0x2152(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 1106: cc int3 1107: cc int3 1108: cc int3 1109: cc int3 110a: cc int3 110b: cc int3 110c: cc int3 110d: cc int3 110e: cc int3 110f: cc int3 0000000000001110 <_ZN8breakout4main17h7cbe07b319de1042E>: 1110: 53 push %rbx 1111: 48 81 ec 00 03 00 00 sub $0x300,%rsp 1118: 48 8d 5c 24 08 lea 0x8(%rsp),%rbx 111d: 48 89 df mov %rbx,%rdi 1120: ff 15 3a 21 00 00 callq *0x213a(%rip) # 3260 <_ZN8bevy_app3app3App5build17h8b0ea6be9050d6ccE@Base> 1126: 48 89 df mov %rbx,%rdi 1129: ff 15 39 21 00 00 callq *0x2139(%rip) # 3268 <_ZN8bevy_app11app_builder10AppBuilder3run17hfc8cf50692acdbdeE@Base> 112f: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 1134: ff 15 1e 21 00 00 callq *0x211e(%rip) # 3258 <_ZN60_$LT$bevy_app..app..App$u20$as$u20$core..ops..drop..Drop$GT$4drop17h67d177ae549d917bE@Base> 113a: 48 81 c4 00 03 00 00 add $0x300,%rsp 1141: 5b pop %rbx 1142: c3 retq 1143: 48 89 c3 mov %rax,%rbx 1146: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi 114b: e8 b0 ff ff ff callq 1100 <_ZN4core3ptr54drop_in_place$LT$bevy_app..app_builder..AppBuilder$GT$17h76850422c20653deE> 1150: 48 89 df mov %rbx,%rdi 1153: e8 18 01 00 00 callq 1270 <_Unwind_Resume@plt> 1158: 0f 0b ud2 115a: cc int3 115b: cc int3 115c: cc int3 115d: cc int3 115e: cc int3 115f: cc int3 0000000000001160 <main>: 1160: 48 83 ec 08 sub $0x8,%rsp 1164: 48 89 f1 mov %rsi,%rcx 
1167: 48 63 d7 movslq %edi,%rdx 116a: 48 8d 05 9f ff ff ff lea -0x61(%rip),%rax # 1110 <_ZN8breakout4main17h7cbe07b319de1042E> 1171: 48 89 04 24 mov %rax,(%rsp) 1175: 48 8d 35 94 1e 00 00 lea 0x1e94(%rip),%rsi # 3010 <__init_array_end> 117c: 48 89 e7 mov %rsp,%rdi 117f: ff 15 eb 20 00 00 callq *0x20eb(%rip) # 3270 <_ZN3std2rt19lang_start_internal17he77194431b0ee4a2E@Base> 1185: 59 pop %rcx 1186: c3 retq 1187: cc int3 1188: cc int3 1189: cc int3 118a: cc int3 118b: cc int3 118c: cc int3 118d: cc int3 118e: cc int3 118f: cc int3 0000000000001190 <_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h83a5b8d55f23dff8E.llvm.909376793398482062>: 1190: 48 83 ec 08 sub $0x8,%rsp 1194: 48 8b 3f mov (%rdi),%rdi 1197: e8 54 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 119c: 31 c0 xor %eax,%eax 119e: 59 pop %rcx 119f: c3 retq 00000000000011a0 <_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hb05d591cd29dea4fE.llvm.909376793398482062>: 11a0: 48 83 ec 08 sub $0x8,%rsp 11a4: 48 8b 3f mov (%rdi),%rdi 11a7: e8 44 ff ff ff callq 10f0 <_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h6e238af75680eb28E> 11ac: 31 c0 xor %eax,%eax 11ae: 59 pop %rcx 11af: c3 retq 00000000000011b0 <_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17he9aeeba375093b99E.llvm.909376793398482062>: 11b0: c3 retq 11b1: cc int3 11b2: cc int3 11b3: cc int3 11b4: cc int3 11b5: cc int3 11b6: cc int3 11b7: cc int3 11b8: cc int3 11b9: cc int3 11ba: cc int3 11bb: cc int3 11bc: cc int3 11bd: cc int3 11be: cc int3 11bf: cc int3 ```
This shrinks breakout from 316k to 310k when using `--feature dynamic`. I haven't run the ecs benchmark to test performance as my laptop is too noisy for reliable benchmarking.