-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Perf -35%] System.Memory.Span<Char>.IndexOfValue #39722
Comments
This one is a bit noisy, but @AndyAyersMS and @kunalspathak wanted to take a look at it. |
I compared JIT output of baseline vs. test and didn't find anything noticeably different:
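I also saw that we are eliminating redundant […] (the remainder of this sentence was lost when the page was extracted). Below is the baseline and test output I produced from a local build with the following settings:
COMPlus_JitDiffableDasm=1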
COMPlus_JitDisasm=*
COMPlus_TieredCompilation=0 Update:
|
@DrewScoggins thanks for making sure the diff link is against the runtime repo. However, this still produces a unicorn, as it covers July 14-July 21. From eyeballing the graph, shouldn't the SHAs be about July 14-July 15? (I realize that viewing even the tightly bracketed diff in GitHub is often not going to be feasible, but having well-bracketed SHAs to examine offline is certainly important.) |
Yes, getting a tight bracket is very important, and it is something that we are thinking about how to solve in a consistent, accurate, and automated fashion. In the meantime, we believed that having a link that can at least bracket to the time of the report, in this case a week, was still useful as a starting point. For what it is worth, when I clicked the link I was able to get the comparison and didn't see an error, although I have had that happen plenty of times. For now, when a bug is copied over, we will use the tools in the generated report to try to tighten the window for the diff link as best we can. It might not be perfect, but hopefully it will be a start. |
@DrewScoggins makes sense. Sounds good. I hope it didn't appear as a criticism. I see this as an iterative process to make these as useful as possible. |
I was able to reproduce the regression. Link to the asm diff: https://www.diffchecker.com/daxuXSFS (full details below).
BenchmarkDotNet=v0.12.1.1405-nightly, OS=Windows 10.0.18363.959 (1909/November2019Update/19H2)
Intel Xeon CPU E5-1650 v4 3.60GHz, 1 CPU, 12 logical and 6 physical cores
.NET Core SDK=5.0.100-rc.1.20413.9
[Host] : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
Job-RHSWLL : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 3.1 Arguments=/p:DebugType=portable
Toolchain=netcoreapp3.1 IterationTime=250.0000 ms MaxIterationCount=20
MinIterationCount=15 WarmupCount=1
.NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT; System.Memory.Span`1[[System.Char, System.Private.CoreLib]].IndexOfValue()
sub rsp,28
mov rdx,[rcx+18]
test rdx,rdx
jne short M00_L00
xor r8d,r8d
xor eax,eax
jmp short M00_L01
M00_L00:
lea r8,[rdx+10]
mov eax,[rdx+8]
M00_L01:
movzx ecx,word ptr [rcx+2C]
mov [rsp+24],ecx
mov edx,[rsp+24]
mov rcx,r8
mov r8d,eax
call System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
nop
add rsp,28
ret
; Total bytes of code 56 ; System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
push rsi
vzeroupper
xor r9d,r9d
movsxd r10,r8d
mov rax,r10
mov r11,rcx
test r11b,1
jne short M01_L01
cmp r8d,10
jl short M01_L01
mov rax,rcx
neg eax
mov r8d,eax
shr r8d,1F
add eax,r8d
sar eax,1
mov eax,eax
and rax,7
cmp rax,4
jl short M01_L02
M01_L00:
lea r8,[rcx+r9*2]
movzx r11d,dx
movzx esi,word ptr [r8]
cmp r11d,esi
je near ptr M01_L16
movzx esi,word ptr [r8+2]
cmp r11d,esi
je near ptr M01_L15
movzx esi,word ptr [r8+4]
cmp r11d,esi
je near ptr M01_L14
movzx r8d,word ptr [r8+6]
cmp r11d,r8d
je near ptr M01_L13
add r9,4
add rax,0FFFFFFFFFFFFFFFC
M01_L01:
cmp rax,4
jge short M01_L00
M01_L02:
test rax,rax
jle short M01_L04
movzx r11d,dx
M01_L03:
movzx r8d,word ptr [rcx+r9*2]
cmp r8d,r11d
je near ptr M01_L16
inc r9
dec rax
test rax,rax
jg short M01_L03
M01_L04:
cmp r9,r10
jge near ptr M01_L12
lea rax,[rcx+r9*2]
mov r11,rax
test r11b,1F
je short M01_L06
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw xmm0,xmm0
vmovupd xmm1,[rax]
vpcmpeqw xmm0,xmm0,xmm1
vpmovmskb r11d,xmm0
test r11d,r11d
jne short M01_L05
add r9,8
jmp short M01_L06
M01_L05:
xor eax,eax
tzcnt eax,r11d
mov r10d,eax
shr r10d,1F
add eax,r10d
sar eax,1
add eax,r9d
jmp near ptr M01_L17
M01_L06:
mov rax,r10
sub rax,r9
and rax,0FFFFFFFFFFFFFFF0
test rax,rax
jle short M01_L08
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw ymm0,xmm0
M01_L07:
vmovupd ymm1,[rcx+r9*2]
vpcmpeqw ymm1,ymm0,ymm1
vpmovmskb r8d,ymm1
test r8d,r8d
jne short M01_L09
add r9,10
add rax,0FFFFFFFFFFFFFFF0
test rax,rax
jg short M01_L07
M01_L08:
mov rax,r10
sub rax,r9
and rax,0FFFFFFFFFFFFFFF8
test rax,rax
jle short M01_L11
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw xmm0,xmm0
vmovupd xmm1,[rcx+r9*2]
vpcmpeqw xmm0,xmm0,xmm1
vpmovmskb eax,xmm0
test eax,eax
jne short M01_L10
add r9,8
jmp short M01_L11
M01_L09:
xor eax,eax
tzcnt eax,r8d
mov r10d,eax
shr r10d,1F
add eax,r10d
sar eax,1
add eax,r9d
jmp short M01_L17
M01_L10:
tzcnt eax,eax
mov edx,eax
shr edx,1F
add eax,edx
sar eax,1
add eax,r9d
jmp short M01_L17
M01_L11:
cmp r9,r10
jge short M01_L12
mov rax,r10
sub rax,r9
jmp near ptr M01_L01
M01_L12:
mov eax,0FFFFFFFF
vzeroupper
pop rsi
ret
M01_L13:
lea eax,[r9+3]
jmp short M01_L17
M01_L14:
lea eax,[r9+2]
jmp short M01_L17
M01_L15:
lea eax,[r9+1]
jmp short M01_L17
M01_L16:
mov eax,r9d
M01_L17:
vzeroupper
pop rsi
ret
; Total bytes of code 462 BenchmarkDotNet=v0.12.1.1405-nightly, OS=Windows 10.0.18363.959 (1909/November2019Update/19H2)
Intel Xeon CPU E5-1650 v4 3.60GHz, 1 CPU, 12 logical and 6 physical cores
.NET Core SDK=5.0.100-rc.1.20413.9
[Host] : .NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT
Job-GTRGSX : .NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT
PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 5.0 Arguments=/p:DebugType=portable
Toolchain=netcoreapp5.0 IterationTime=250.0000 ms MaxIterationCount=20
MinIterationCount=15 WarmupCount=1
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Char, System.Private.CoreLib]].IndexOfValue()
sub rsp,28
mov rdx,[rcx+18]
test rdx,rdx
jne short M00_L00
xor r8d,r8d
xor eax,eax
jmp short M00_L01
M00_L00:
lea r8,[rdx+10]
mov eax,[rdx+8]
M00_L01:
movzx edx,word ptr [rcx+2C]
mov rcx,r8
mov r8d,eax
call System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
nop
add rsp,28
ret
; Total bytes of code 48 ; System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
push rsi
vzeroupper
xor r9d,r9d
movsxd r10,r8d
mov rax,r10
mov r11,rcx
test r11b,1
jne short M01_L01
cmp r8d,10
jl short M01_L01
mov rax,rcx
neg eax
mov r8d,eax
shr r8d,1F
add eax,r8d
sar eax,1
mov eax,eax
and rax,7
cmp rax,4
jl short M01_L02
M01_L00:
lea r8,[rcx+r9*2]
movzx r11d,dx
movzx esi,word ptr [r8]
cmp r11d,esi
je near ptr M01_L16
movzx esi,word ptr [r8+2]
cmp r11d,esi
je near ptr M01_L15
movzx esi,word ptr [r8+4]
cmp r11d,esi
je near ptr M01_L14
movzx r8d,word ptr [r8+6]
cmp r11d,r8d
je near ptr M01_L13
add r9,4
add rax,0FFFFFFFFFFFFFFFC
M01_L01:
cmp rax,4
jge short M01_L00
M01_L02:
test rax,rax
jle short M01_L04
movzx r11d,dx
M01_L03:
movzx r8d,word ptr [rcx+r9*2]
cmp r8d,r11d
je near ptr M01_L16
inc r9
dec rax
test rax,rax
jg short M01_L03
M01_L04:
cmp r9,r10
jge near ptr M01_L12
lea rax,[rcx+r9*2]
mov r11,rax
mov r8d,r11d
test r8b,1F
je short M01_L06
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw xmm0,xmm0
vmovupd xmm1,[rax]
vpcmpeqw xmm0,xmm0,xmm1
vpmovmskb r11d,xmm0
test r11d,r11d
jne short M01_L05
add r9,8
jmp short M01_L06
M01_L05:
xor r10d,r10d
tzcnt r10d,r11d
shr r10d,1
mov edx,r10d
mov eax,edx
add eax,r9d
jmp near ptr M01_L17
M01_L06:
mov rax,r10
sub rax,r9
and rax,0FFFFFFFFFFFFFFF0
jle short M01_L08
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw ymm0,xmm0
M01_L07:
vmovupd ymm1,[rcx+r9*2]
vpcmpeqw ymm1,ymm0,ymm1
vpmovmskb r8d,ymm1
test r8d,r8d
jne short M01_L09
add r9,10
add rax,0FFFFFFFFFFFFFFF0
test rax,rax
jg short M01_L07
M01_L08:
mov rax,r10
sub rax,r9
and rax,0FFFFFFFFFFFFFFF8
jle short M01_L11
movzx r11d,dx
vmovd xmm0,r11d
vpbroadcastw xmm0,xmm0
vmovupd xmm1,[rcx+r9*2]
vpcmpeqw xmm0,xmm0,xmm1
vpmovmskb eax,xmm0
test eax,eax
jne short M01_L10
add r9,8
jmp short M01_L11
M01_L09:
xor eax,eax
tzcnt eax,r8d
shr eax,1
mov r10d,eax
mov eax,r10d
add eax,r9d
jmp short M01_L17
M01_L10:
xor edx,edx
tzcnt edx,eax
shr edx,1
mov ecx,edx
mov eax,ecx
add eax,r9d
jmp short M01_L17
M01_L11:
cmp r9,r10
jge short M01_L12
mov rax,r10
sub rax,r9
jmp near ptr M01_L01
M01_L12:
mov eax,0FFFFFFFF
vzeroupper
pop rsi
ret
M01_L13:
lea eax,[r9+3]
jmp short M01_L17
M01_L14:
lea eax,[r9+2]
jmp short M01_L17
M01_L15:
lea eax,[r9+1]
jmp short M01_L17
M01_L16:
mov eax,r9d
M01_L17:
vzeroupper
pop rsi
ret
; Total bytes of code 451 |
Thanks @adamsitnik for collecting the numbers. A few things:
Update: Looking at the jitdiff, I don't see anything suspicious. The code size was reduced from 462 to 451 bytes. |
You can find the full test history here, https://pvscmdupload.blob.core.windows.net/reports/08_13_2020/allTestHistory/System.Memory.Span(Char).IndexOfValue(Size%3A%20512).html |
Same as #39721, this regression is caused by code alignment:
.AddDiagnoser(new DisassemblyDiagnoser(new DisassemblyDiagnoserConfig(printInstructionAddresses: true)))
py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Memory.Span<Char>.IndexOfValue' --bdn-arguments "--envVars COMPlus_JitAlignLoops:0"
EnvironmentVariables=COMPlus_JitAlignLoops=0 PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 5.0
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Char, System.Private.CoreLib]].IndexOfValue()
7FFB81759C60 sub rsp,28
7FFB81759C64 mov rdx,[rcx+18]
7FFB81759C68 test rdx,rdx
7FFB81759C6B jne short M00_L00
7FFB81759C6D xor r8d,r8d
7FFB81759C70 xor eax,eax
7FFB81759C72 jmp short M00_L01
M00_L00:
7FFB81759C74 lea r8,[rdx+10]
7FFB81759C78 mov eax,[rdx+8]
M00_L01:
7FFB81759C7B movzx edx,word ptr [rcx+2C]
7FFB81759C7F mov rcx,r8
7FFB81759C82 mov r8d,eax
7FFB81759C85 call System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
7FFB81759C8A nop
7FFB81759C8B add rsp,28
7FFB81759C8F ret
; Total bytes of code 48 ; System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
7FFB814CB900 push rsi
7FFB814CB901 vzeroupper
7FFB814CB904 xor r9d,r9d
7FFB814CB907 movsxd r10,r8d
7FFB814CB90A mov rax,r10
7FFB814CB90D mov r11,rcx
7FFB814CB910 test r11b,1
7FFB814CB914 jne short M01_L01
7FFB814CB916 cmp r8d,10
7FFB814CB91A jl short M01_L01
7FFB814CB91C mov rax,rcx
7FFB814CB91F neg eax
7FFB814CB921 mov r8d,eax
7FFB814CB924 shr r8d,1F
7FFB814CB928 add eax,r8d
7FFB814CB92B sar eax,1
7FFB814CB92D mov eax,eax
7FFB814CB92F and rax,7
7FFB814CB933 cmp rax,4
7FFB814CB937 jl short M01_L02
M01_L00:
7FFB814CB939 lea r8,[rcx+r9*2]
7FFB814CB93D movzx r11d,dx
7FFB814CB941 movzx esi,word ptr [r8]
7FFB814CB945 cmp r11d,esi
7FFB814CB948 je near ptr M01_L16
7FFB814CB94E movzx esi,word ptr [r8+2]
7FFB814CB953 cmp r11d,esi
7FFB814CB956 je near ptr M01_L15
7FFB814CB95C movzx esi,word ptr [r8+4]
7FFB814CB961 cmp r11d,esi
7FFB814CB964 je near ptr M01_L14
7FFB814CB96A movzx r8d,word ptr [r8+6]
7FFB814CB96F cmp r11d,r8d
7FFB814CB972 je near ptr M01_L13
7FFB814CB978 add r9,4
7FFB814CB97C add rax,0FFFFFFFFFFFFFFFC
M01_L01:
7FFB814CB980 cmp rax,4
7FFB814CB984 jge short M01_L00
M01_L02:
7FFB814CB986 test rax,rax
7FFB814CB989 jle short M01_L04
7FFB814CB98B movzx r11d,dx
M01_L03:
7FFB814CB98F movzx r8d,word ptr [rcx+r9*2]
7FFB814CB994 cmp r8d,r11d
7FFB814CB997 je near ptr M01_L16
7FFB814CB99D inc r9
7FFB814CB9A0 dec rax
7FFB814CB9A3 test rax,rax
7FFB814CB9A6 jg short M01_L03
M01_L04:
7FFB814CB9A8 cmp r9,r10
7FFB814CB9AB jge near ptr M01_L12
7FFB814CB9B1 lea rax,[rcx+r9*2]
7FFB814CB9B5 mov r11,rax
7FFB814CB9B8 mov r8d,r11d
7FFB814CB9BB test r8b,1F
7FFB814CB9BF je short M01_L06
7FFB814CB9C1 movzx r11d,dx
7FFB814CB9C5 vmovd xmm0,r11d
7FFB814CB9CA vpbroadcastw xmm0,xmm0
7FFB814CB9CF vmovupd xmm1,[rax]
7FFB814CB9D3 vpcmpeqw xmm0,xmm0,xmm1
7FFB814CB9D7 vpmovmskb r11d,xmm0
7FFB814CB9DB test r11d,r11d
7FFB814CB9DE jne short M01_L05
7FFB814CB9E0 add r9,8
7FFB814CB9E4 jmp short M01_L06
M01_L05:
7FFB814CB9E6 xor r10d,r10d
7FFB814CB9E9 tzcnt r10d,r11d
7FFB814CB9EE shr r10d,1
7FFB814CB9F1 mov edx,r10d
7FFB814CB9F4 mov eax,edx
7FFB814CB9F6 add eax,r9d
7FFB814CB9F9 jmp near ptr M01_L17
M01_L06:
7FFB814CB9FE mov rax,r10
7FFB814CBA01 sub rax,r9
7FFB814CBA04 and rax,0FFFFFFFFFFFFFFF0
7FFB814CBA08 jle short M01_L08
7FFB814CBA0A movzx r11d,dx
7FFB814CBA0E vmovd xmm0,r11d
7FFB814CBA13 vpbroadcastw ymm0,xmm0
M01_L07:
7FFB814CBA18 vmovupd ymm1,[rcx+r9*2]
7FFB814CBA1E vpcmpeqw ymm1,ymm0,ymm1
7FFB814CBA22 vpmovmskb r8d,ymm1
7FFB814CBA26 test r8d,r8d
7FFB814CBA29 jne short M01_L09
7FFB814CBA2B add r9,10
7FFB814CBA2F add rax,0FFFFFFFFFFFFFFF0
7FFB814CBA33 test rax,rax
7FFB814CBA36 jg short M01_L07
M01_L08:
7FFB814CBA38 mov rax,r10
7FFB814CBA3B sub rax,r9
7FFB814CBA3E and rax,0FFFFFFFFFFFFFFF8
7FFB814CBA42 jle short M01_L11
7FFB814CBA44 movzx r11d,dx
7FFB814CBA48 vmovd xmm0,r11d
7FFB814CBA4D vpbroadcastw xmm0,xmm0
7FFB814CBA52 vmovupd xmm1,[rcx+r9*2]
7FFB814CBA58 vpcmpeqw xmm0,xmm0,xmm1
7FFB814CBA5C vpmovmskb eax,xmm0
7FFB814CBA60 test eax,eax
7FFB814CBA62 jne short M01_L10
7FFB814CBA64 add r9,8
7FFB814CBA68 jmp short M01_L11
M01_L09:
7FFB814CBA6A xor eax,eax
7FFB814CBA6C tzcnt eax,r8d
7FFB814CBA71 shr eax,1
7FFB814CBA73 mov r10d,eax
7FFB814CBA76 mov eax,r10d
7FFB814CBA79 add eax,r9d
7FFB814CBA7C jmp short M01_L17
M01_L10:
7FFB814CBA7E xor edx,edx
7FFB814CBA80 tzcnt edx,eax
7FFB814CBA84 shr edx,1
7FFB814CBA86 mov ecx,edx
7FFB814CBA88 mov eax,ecx
7FFB814CBA8A add eax,r9d
7FFB814CBA8D jmp short M01_L17
M01_L11:
7FFB814CBA8F cmp r9,r10
7FFB814CBA92 jge short M01_L12
7FFB814CBA94 mov rax,r10
7FFB814CBA97 sub rax,r9
7FFB814CBA9A jmp near ptr M01_L01
M01_L12:
7FFB814CBA9F mov eax,0FFFFFFFF
7FFB814CBAA4 vzeroupper
7FFB814CBAA7 pop rsi
7FFB814CBAA8 ret
M01_L13:
7FFB814CBAA9 lea eax,[r9+3]
7FFB814CBAAD jmp short M01_L17
M01_L14:
7FFB814CBAAF lea eax,[r9+2]
7FFB814CBAB3 jmp short M01_L17
M01_L15:
7FFB814CBAB5 lea eax,[r9+1]
7FFB814CBAB9 jmp short M01_L17
M01_L16:
7FFB814CBABB mov eax,r9d
M01_L17:
7FFB814CBABE vzeroupper
7FFB814CBAC1 pop rsi
7FFB814CBAC2 ret
; Total bytes of code 451 py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Memory.Span<Char>.IndexOfValue' --bdn-arguments "--envVars COMPlus_JitAlignLoops:1" EnvironmentVariables=COMPlus_JitAlignLoops=1 PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 5.0
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Char, System.Private.CoreLib]].IndexOfValue()
7FFB8A839ED0 sub rsp,28
7FFB8A839ED4 mov rdx,[rcx+18]
7FFB8A839ED8 test rdx,rdx
7FFB8A839EDB jne short M00_L00
7FFB8A839EDD xor r8d,r8d
7FFB8A839EE0 xor eax,eax
7FFB8A839EE2 jmp short M00_L01
M00_L00:
7FFB8A839EE4 lea r8,[rdx+10]
7FFB8A839EE8 mov eax,[rdx+8]
M00_L01:
7FFB8A839EEB movzx edx,word ptr [rcx+2C]
7FFB8A839EEF mov rcx,r8
7FFB8A839EF2 mov r8d,eax
7FFB8A839EF5 call System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
7FFB8A839EFA nop
7FFB8A839EFB add rsp,28
7FFB8A839EFF ret
; Total bytes of code 48 ; System.SpanHelpers.IndexOf(Char ByRef, Char, Int32)
7FFB8A5AB900 push rsi
7FFB8A5AB901 vzeroupper
7FFB8A5AB904 xor r9d,r9d
7FFB8A5AB907 movsxd r10,r8d
7FFB8A5AB90A mov rax,r10
7FFB8A5AB90D mov r11,rcx
7FFB8A5AB910 test r11b,1
7FFB8A5AB914 jne near ptr M01_L01
7FFB8A5AB91A cmp r8d,10
7FFB8A5AB91E jl near ptr M01_L01
7FFB8A5AB924 mov rax,rcx
7FFB8A5AB927 neg eax
7FFB8A5AB929 mov r8d,eax
7FFB8A5AB92C shr r8d,1F
7FFB8A5AB930 add eax,r8d
7FFB8A5AB933 sar eax,1
7FFB8A5AB935 mov eax,eax
7FFB8A5AB937 and rax,7
7FFB8A5AB93B cmp rax,4
7FFB8A5AB93F jl short M01_L02
7FFB8A5AB941 nop dword ptr [rax]
7FFB8A5AB948 nop dword ptr [rax+rax]
M01_L00:
7FFB8A5AB950 lea r8,[rcx+r9*2]
7FFB8A5AB954 movzx r11d,dx
7FFB8A5AB958 movzx esi,word ptr [r8]
7FFB8A5AB95C cmp r11d,esi
7FFB8A5AB95F je near ptr M01_L16
7FFB8A5AB965 movzx esi,word ptr [r8+2]
7FFB8A5AB96A cmp r11d,esi
7FFB8A5AB96D je near ptr M01_L15
7FFB8A5AB973 movzx esi,word ptr [r8+4]
7FFB8A5AB978 cmp r11d,esi
7FFB8A5AB97B je near ptr M01_L14
7FFB8A5AB981 movzx r8d,word ptr [r8+6]
7FFB8A5AB986 cmp r11d,r8d
7FFB8A5AB989 je near ptr M01_L13
7FFB8A5AB98F add r9,4
7FFB8A5AB993 add rax,0FFFFFFFFFFFFFFFC
7FFB8A5AB997 nop word ptr [rax+rax]
M01_L01:
7FFB8A5AB9A0 cmp rax,4
7FFB8A5AB9A4 jge short M01_L00
M01_L02:
7FFB8A5AB9A6 test rax,rax
7FFB8A5AB9A9 jle short M01_L04
7FFB8A5AB9AB movzx r11d,dx
7FFB8A5AB9AF nop
M01_L03:
7FFB8A5AB9B0 movzx r8d,word ptr [rcx+r9*2]
7FFB8A5AB9B5 cmp r8d,r11d
7FFB8A5AB9B8 je near ptr M01_L16
7FFB8A5AB9BE inc r9
7FFB8A5AB9C1 dec rax
7FFB8A5AB9C4 test rax,rax
7FFB8A5AB9C7 jg short M01_L03
M01_L04:
7FFB8A5AB9C9 cmp r9,r10
7FFB8A5AB9CC jge near ptr M01_L12
7FFB8A5AB9D2 lea rax,[rcx+r9*2]
7FFB8A5AB9D6 mov r11,rax
7FFB8A5AB9D9 mov r8d,r11d
7FFB8A5AB9DC test r8b,1F
7FFB8A5AB9E0 je short M01_L06
7FFB8A5AB9E2 movzx r11d,dx
7FFB8A5AB9E6 vmovd xmm0,r11d
7FFB8A5AB9EB vpbroadcastw xmm0,xmm0
7FFB8A5AB9F0 vmovupd xmm1,[rax]
7FFB8A5AB9F4 vpcmpeqw xmm0,xmm0,xmm1
7FFB8A5AB9F8 vpmovmskb r11d,xmm0
7FFB8A5AB9FC test r11d,r11d
7FFB8A5AB9FF jne short M01_L05
7FFB8A5ABA01 add r9,8
7FFB8A5ABA05 jmp short M01_L06
M01_L05:
7FFB8A5ABA07 xor r10d,r10d
7FFB8A5ABA0A tzcnt r10d,r11d
7FFB8A5ABA0F shr r10d,1
7FFB8A5ABA12 mov edx,r10d
7FFB8A5ABA15 mov eax,edx
7FFB8A5ABA17 add eax,r9d
7FFB8A5ABA1A jmp near ptr M01_L17
M01_L06:
7FFB8A5ABA1F mov rax,r10
7FFB8A5ABA22 sub rax,r9
7FFB8A5ABA25 and rax,0FFFFFFFFFFFFFFF0
7FFB8A5ABA29 jle short M01_L08
7FFB8A5ABA2B movzx r11d,dx
7FFB8A5ABA2F vmovd xmm0,r11d
7FFB8A5ABA34 vpbroadcastw ymm0,xmm0
7FFB8A5ABA39 nop dword ptr [rax]
M01_L07:
7FFB8A5ABA40 vmovupd ymm1,[rcx+r9*2]
7FFB8A5ABA46 vpcmpeqw ymm1,ymm0,ymm1
7FFB8A5ABA4A vpmovmskb r8d,ymm1
7FFB8A5ABA4E test r8d,r8d
7FFB8A5ABA51 jne short M01_L09
7FFB8A5ABA53 add r9,10
7FFB8A5ABA57 add rax,0FFFFFFFFFFFFFFF0
7FFB8A5ABA5B test rax,rax
7FFB8A5ABA5E jg short M01_L07
M01_L08:
7FFB8A5ABA60 mov rax,r10
7FFB8A5ABA63 sub rax,r9
7FFB8A5ABA66 and rax,0FFFFFFFFFFFFFFF8
7FFB8A5ABA6A jle short M01_L11
7FFB8A5ABA6C movzx r11d,dx
7FFB8A5ABA70 vmovd xmm0,r11d
7FFB8A5ABA75 vpbroadcastw xmm0,xmm0
7FFB8A5ABA7A vmovupd xmm1,[rcx+r9*2]
7FFB8A5ABA80 vpcmpeqw xmm0,xmm0,xmm1
7FFB8A5ABA84 vpmovmskb eax,xmm0
7FFB8A5ABA88 test eax,eax
7FFB8A5ABA8A jne short M01_L10
7FFB8A5ABA8C add r9,8
7FFB8A5ABA90 jmp short M01_L11
M01_L09:
7FFB8A5ABA92 xor eax,eax
7FFB8A5ABA94 tzcnt eax,r8d
7FFB8A5ABA99 shr eax,1
7FFB8A5ABA9B mov r10d,eax
7FFB8A5ABA9E mov eax,r10d
7FFB8A5ABAA1 add eax,r9d
7FFB8A5ABAA4 jmp short M01_L17
M01_L10:
7FFB8A5ABAA6 xor edx,edx
7FFB8A5ABAA8 tzcnt edx,eax
7FFB8A5ABAAC shr edx,1
7FFB8A5ABAAE mov ecx,edx
7FFB8A5ABAB0 mov eax,ecx
7FFB8A5ABAB2 add eax,r9d
7FFB8A5ABAB5 jmp short M01_L17
M01_L11:
7FFB8A5ABAB7 cmp r9,r10
7FFB8A5ABABA jge short M01_L12
7FFB8A5ABABC mov rax,r10
7FFB8A5ABABF sub rax,r9
7FFB8A5ABAC2 jmp near ptr M01_L01
M01_L12:
7FFB8A5ABAC7 mov eax,0FFFFFFFF
7FFB8A5ABACC vzeroupper
7FFB8A5ABACF pop rsi
7FFB8A5ABAD0 ret
M01_L13:
7FFB8A5ABAD1 lea eax,[r9+3]
7FFB8A5ABAD5 jmp short M01_L17
M01_L14:
7FFB8A5ABAD7 lea eax,[r9+2]
7FFB8A5ABADB jmp short M01_L17
M01_L15:
7FFB8A5ABADD lea eax,[r9+1]
7FFB8A5ABAE1 jmp short M01_L17
M01_L16:
7FFB8A5ABAE3 mov eax,r9d
M01_L17:
7FFB8A5ABAE6 vzeroupper
7FFB8A5ABAE9 pop rsi
7FFB8A5ABAEA ret
; Total bytes of code 491 |
Linking to #8108. |
Run Information
Regressions in System.Memory.Span
Related Issue on x64 Windows
[Perf -14%] System.Memory.Span.IndexOfValue
Related Issue on x86 Windows
[Perf -11%] System.Memory.Span (2)
Historical Data in Reporting System
Repro
Histogram
System.Memory.Span<Char>.IndexOfValue(Size: 512)
Docs
Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository
The text was updated successfully, but these errors were encountered: