Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
/ corefx Public archive

Improve deserialization perf with changes to property name lookup #40998

Merged
merged 3 commits into from
Sep 12, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 142 additions & 76 deletions src/System.Text.Json/src/System/Text/Json/Serialization/JsonClassInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text.Json.Serialization;
using System.Text.Json.Serialization.Converters;
Expand All @@ -16,7 +17,9 @@ namespace System.Text.Json
internal sealed partial class JsonClassInfo
{
// The length of the property name embedded in the key (in bytes).
private const int PropertyNameKeyLength = 6;
// The key is a ulong (8 bytes) containing the first 7 bytes of the property name
// followed by a byte representing the length.
private const int PropertyNameKeyLength = 7;
steveharter marked this conversation as resolved.
Show resolved Hide resolved

// The limit to how many property names from the JSON are cached in _propertyRefsSorted before using PropertyCache.
private const int PropertyNameCountCacheThreshold = 64;
Expand Down Expand Up @@ -257,85 +260,132 @@ private JsonPropertyInfo GetPropertyWithUniqueAttribute(Type attributeType, Dict
return property;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool TryIsPropertyRefEqual(in PropertyRef propertyRef, ReadOnlySpan<byte> propertyName, ulong key, ref JsonPropertyInfo info)
steveharter marked this conversation as resolved.
Show resolved Hide resolved
{
if (key == propertyRef.Key)
{
// We compare the whole name, although we could skip the first 7 bytes (but it's not any faster)
if (propertyName.Length <= PropertyNameKeyLength ||
propertyName.SequenceEqual(propertyRef.Info.Name))
{
info = propertyRef.Info;
return true;
}
}

return false;
}

// AggressiveInlining is used even though this is a large method, because it is only called from one location and is on a hot path.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public JsonPropertyInfo GetProperty(ReadOnlySpan<byte> propertyName, ref ReadStackFrame frame)
{
JsonPropertyInfo info = null;

// Keep a local copy of the cache in case it changes by another thread.
PropertyRef[] localPropertyRefsSorted = _propertyRefsSorted;

ulong key = GetKey(propertyName);

// If there is an existing cache, then use it.
if (localPropertyRefsSorted != null)
{
ulong key = GetKey(propertyName);

// Start with the current property index, and then go forwards\backwards.
int propertyIndex = frame.PropertyIndex;

int count = localPropertyRefsSorted.Length;
int iForward = Math.Min(propertyIndex, count);
int iBackward = iForward - 1;

while (iForward < count || iBackward >= 0)
for (;;)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have some evidence to suggest that this forward/backwards search will be beneficial in practice, for perf?

steveharter marked this conversation as resolved.
Show resolved Hide resolved
{
if (iForward < count)
{
if (TryIsPropertyRefEqual(localPropertyRefsSorted[iForward], propertyName, key, ref info))
PropertyRef propertyRef = localPropertyRefsSorted[iForward];
if (TryIsPropertyRefEqual(propertyRef, propertyName, key, ref info))
{
return info;
}

++iForward;
}

if (iBackward >= 0)
if (iBackward >= 0)
{
propertyRef = localPropertyRefsSorted[iBackward];
if (TryIsPropertyRefEqual(propertyRef, propertyName, key, ref info))
{
return info;
}

--iBackward;
}
}
else if (iBackward >= 0)
{
if (TryIsPropertyRefEqual(localPropertyRefsSorted[iBackward], propertyName, key, ref info))
PropertyRef propertyRef = localPropertyRefsSorted[iBackward];
if (TryIsPropertyRefEqual(propertyRef, propertyName, key, ref info))
{
return info;
}

--iBackward;
}
else
{
// Property was not found.
break;
}
}
}

// No cached item was found. Try the main list which has all of the properties.

string stringPropertyName = JsonHelpers.Utf8GetString(propertyName);
if (PropertyCache.TryGetValue(stringPropertyName, out info))
if (!PropertyCache.TryGetValue(stringPropertyName, out info))
{
// Check if we should add this to the cache.
// Only cache up to a threshold length and then just use the dictionary when an item is not found in the cache.
int count;
if (localPropertyRefsSorted != null)
{
count = localPropertyRefsSorted.Length;
}
else
info = JsonPropertyInfo.s_missingProperty;
}

Debug.Assert(info != null);

// Three code paths to get here:
// 1) info == s_missingProperty. Property not found.
// 2) key == info.PropertyNameKey. Exact match found.
// 3) key != info.PropertyNameKey. Match found due to case insensitivity.
steveharter marked this conversation as resolved.
Show resolved Hide resolved
Debug.Assert(info == JsonPropertyInfo.s_missingProperty || key == info.PropertyNameKey || Options.PropertyNameCaseInsensitive);

// Check if we should add this to the cache.
// Only cache up to a threshold length and then just use the dictionary when an item is not found in the cache.
int cacheCount;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

int cacheCount = 0;
if (localPropertyRefsSorted != null)
{
    cacheCount = localPropertyRefsSorted.Length;
}

if (localPropertyRefsSorted != null)
{
cacheCount = localPropertyRefsSorted.Length;
}
else
{
cacheCount = 0;
}

// Do a quick check for the stable (after warm-up) case.
if (cacheCount < PropertyNameCountCacheThreshold)
{
// Do a slower check for the warm-up case.
if (frame.PropertyRefCache != null)
{
count = 0;
cacheCount += frame.PropertyRefCache.Count;
}

// Do a quick check for the stable (after warm-up) case.
if (count < PropertyNameCountCacheThreshold)
// Check again to append the cache up to the threshold.
if (cacheCount < PropertyNameCountCacheThreshold)
{
// Do a slower check for the warm-up case.
if (frame.PropertyRefCache != null)
if (frame.PropertyRefCache == null)
{
count += frame.PropertyRefCache.Count;
frame.PropertyRefCache = new List<PropertyRef>();
}

// Check again to append the cache up to the threshold.
if (count < PropertyNameCountCacheThreshold)
{
if (frame.PropertyRefCache == null)
{
frame.PropertyRefCache = new List<PropertyRef>();
}

ulong key = info.PropertyNameKey;
steveharter marked this conversation as resolved.
Show resolved Hide resolved
PropertyRef propertyRef = new PropertyRef(key, info);
frame.PropertyRefCache.Add(propertyRef);
}
PropertyRef propertyRef = new PropertyRef(key, info);
frame.PropertyRefCache.Add(propertyRef);
}
}

Expand All @@ -360,74 +410,90 @@ private Dictionary<string, JsonPropertyInfo> CreatePropertyCache(int capacity)

public JsonPropertyInfo PolicyProperty { get; private set; }

/// <summary>
/// Checks whether a cached <see cref="PropertyRef"/> corresponds to the supplied property name.
/// </summary>
/// <param name="propertyRef">The cached entry to test.</param>
/// <param name="propertyName">The property name bytes being looked up.</param>
/// <param name="key">The key previously computed for <paramref name="propertyName"/>.</param>
/// <param name="info">Receives <c>propertyRef.Info</c> on a match; left untouched otherwise.</param>
/// <returns><c>true</c> when the entry matches; otherwise <c>false</c>.</returns>
private static bool TryIsPropertyRefEqual(in PropertyRef propertyRef, ReadOnlySpan<byte> propertyName, ulong key, ref JsonPropertyInfo info)
{
    // Entries with a different key can never match.
    if (key != propertyRef.Key)
    {
        return false;
    }

    // A name no longer than PropertyNameKeyLength is accepted on the key match alone.
    // Longer names are confirmed by comparing the whole name; the leading bytes already
    // covered by the key could be skipped, but that is likely not any faster.
    bool isSameName = propertyName.Length <= PropertyNameKeyLength
        || propertyName.SequenceEqual(propertyRef.Info.Name);

    if (!isSameName)
    {
        return false;
    }

    info = propertyRef.Info;
    return true;
}

/// <summary>
/// Determines whether two <see cref="PropertyRef"/> entries refer to the same property name.
/// </summary>
/// <param name="propertyRef">The first entry.</param>
/// <param name="other">The entry to compare against.</param>
/// <returns><c>true</c> when the entries are considered equal; otherwise <c>false</c>.</returns>
private static bool IsPropertyRefEqual(ref PropertyRef propertyRef, PropertyRef other)
{
    // Differing keys rule out equality immediately.
    if (propertyRef.Key != other.Key)
    {
        return false;
    }

    // With equal keys, a name no longer than PropertyNameKeyLength is accepted as-is;
    // a longer name is confirmed by comparing the full names.
    return propertyRef.Info.Name.Length <= PropertyNameKeyLength
        || propertyRef.Info.Name.AsSpan().SequenceEqual(other.Info.Name.AsSpan());
}

/// <summary>
/// Get a key from the property name.
/// The key consists of the first 7 bytes of the property name and then the length.
/// </summary>
public static ulong GetKey(ReadOnlySpan<byte> propertyName)
{
const int BitsInByte = 8;
ulong key;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we init key to 0 up front, can we lose the last else branch? Might be worth it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But that would cause an extra initialization for the other cases. (?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it's better to avoid branches, and assignment isn't very expensive.

Also, without the extra branch, we produce smaller assembly:
sharplab.io - with ints

sharplab.io - with ulong

sharplab.io - whole method

It's 1 fewer instruction if you init up front.

int length = propertyName.Length;

// Embed the propertyName in the first 7 bytes of the key.
if (length > 3)
if (length > 7)
{
key = MemoryMarshal.Read<ulong>(propertyName);

// Clear the high byte so it can hold the length.
key &= 0x00FFFFFFFFFFFFFF;

// Use a length of 8. Any length > 7 (PropertyNameKeyLength) would work since
// the comparison logic tests for equality against the full contents instead of just
// the key if the property name is 8 or more characters.
steveharter marked this conversation as resolved.
Show resolved Hide resolved
key |= (ulong) 8 << (7 * BitsInByte);
}
else if (length > 3)
{
key = MemoryMarshal.Read<uint>(propertyName);
if (length > 4)

if (length == 7)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides the other comment, maybe the code can be written as

key = MemoryMarshal.Read<uint>(propertyName);
ReadOnlySpan<byte> tmp = propertyName;

switch (length)
{
    case 7: key |= (ulong)tmp[6] << (6 * BitsInByte); goto case 6;
    case 6: key |= (ulong)tmp[5] << (5 * BitsInByte); goto case 5;
    case 5: key |= (ulong)tmp[4] << (4 * BitsInByte); goto default;
    default: key |= (ulong)length << (7 * BitsInByte); break;
}

as this avoids the repeated steps for the lower indices, the dasm looks quite good, but I haven't perf-tested this variant.

dasm
; Assembly listing for method Program:GetKey(struct):long
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; optimized code
; rbp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 arg0         [V00    ] ( 11,  6.25)  struct (16) [rbp-0x10]   do-not-enreg[XSFB] addr-exposed ld-addr-op
;  V01 loc0         [V01,T02] (  8,  5   )     int  ->  rdi
;  V02 loc1         [V02,T03] (  6,  3   )    long  ->  rax
;  V03 loc2         [V03,T01] ( 10,  5   )    long  ->  rsi
;* V04 loc3         [V04    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op
;  V05 loc4         [V05,T04] (  6,  3   )    long  ->  rax
;  V06 loc5         [V06,T18] (  2,  1   )    long  ->  rax
;# V07 OutArgs      [V07    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;* V08 tmp1         [V08    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;* V09 tmp2         [V09    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
;* V10 tmp3         [V10    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V11 tmp4         [V11,T09] (  2,  2   )   byref  ->  rax         "Inlining Arg"
;* V12 tmp5         [V12    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;* V13 tmp6         [V13    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
;* V14 tmp7         [V14    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V15 tmp8         [V15,T10] (  2,  2   )   byref  ->  rsi         "Inlining Arg"
;* V16 tmp9         [V16    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;* V17 tmp10        [V17    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
;* V18 tmp11        [V18    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V19 tmp12        [V19,T11] (  2,  2   )   byref  ->   r8         "Inlining Arg"
;  V20 tmp13        [V20,T00] (  6,  7   )    long  ->  rax         "Single return block return value"
;  V21 tmp14        [V21,T07] (  4,  2   )   byref  ->  rdx         V04._pointer(offs=0x00) P-INDEP "field V04._pointer (fldOffset=0x0)"
;  V22 tmp15        [V22,T08] (  4,  2   )     int  ->  rcx         V04._length(offs=0x08) P-INDEP "field V04._length (fldOffset=0x8)"
;  V23 tmp16        [V23,T19] (  2,  0.75)   byref  ->  rax         V08._pointer(offs=0x00) P-INDEP "field V08._pointer (fldOffset=0x0)"
;  V24 tmp17        [V24,T22] (  2,  0.50)     int  ->  rdi         V08._length(offs=0x08) P-INDEP "field V08._length (fldOffset=0x8)"
;  V25 tmp18        [V25,T15] (  2,  1   )   byref  ->  rax         V10._pointer(offs=0x00) P-INDEP "field V10._pointer (fldOffset=0x0)"
;* V26 tmp19        [V26    ] (  0,  0   )     int  ->  zero-ref    V10._length(offs=0x08) P-INDEP "field V10._length (fldOffset=0x8)"
;  V27 tmp20        [V27,T20] (  2,  0.75)   byref  ->  rsi         V12._pointer(offs=0x00) P-INDEP "field V12._pointer (fldOffset=0x0)"
;  V28 tmp21        [V28,T23] (  2,  0.50)     int  ->  rax         V12._length(offs=0x08) P-INDEP "field V12._length (fldOffset=0x8)"
;  V29 tmp22        [V29,T16] (  2,  1   )   byref  ->  rsi         V14._pointer(offs=0x00) P-INDEP "field V14._pointer (fldOffset=0x0)"
;* V30 tmp23        [V30    ] (  0,  0   )     int  ->  zero-ref    V14._length(offs=0x08) P-INDEP "field V14._length (fldOffset=0x8)"
;  V31 tmp24        [V31,T21] (  2,  0.75)   byref  ->   r8         V16._pointer(offs=0x00) P-INDEP "field V16._pointer (fldOffset=0x0)"
;  V32 tmp25        [V32,T24] (  2,  0.50)     int  ->  rax         V16._length(offs=0x08) P-INDEP "field V16._length (fldOffset=0x8)"
;  V33 tmp26        [V33,T17] (  2,  1   )   byref  ->   r8         V18._pointer(offs=0x00) P-INDEP "field V18._pointer (fldOffset=0x0)"
;* V34 tmp27        [V34    ] (  0,  0   )     int  ->  zero-ref    V18._length(offs=0x08) P-INDEP "field V18._length (fldOffset=0x8)"
;  V35 tmp28        [V35,T12] (  3,  1.50)   byref  ->  rdi         "BlockOp address local"
;  V36 tmp29        [V36,T13] (  3,  1.50)   byref  ->  rax         "BlockOp address local"
;  V37 tmp30        [V37,T05] (  3,  3   )   byref  ->  rax         "BlockOp address local"
;  V38 tmp31        [V38,T14] (  3,  1.50)   byref  ->  rax         "BlockOp address local"
;  V39 rat0         [V39,T06] (  3,  3   )     int  ->   r8         "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 16

G_M53623_IG01:
       55                   push     rbp
       4883EC10             sub      rsp, 16
       488D6C2410           lea      rbp, [rsp+10H]
       48897DF0             mov      bword ptr [rbp-10H], rdi
       488975F8             mov      qword ptr [rbp-08H], rsi

G_M53623_IG02:
       8B7DF8               mov      edi, dword ptr [rbp-08H]
       83FF07               cmp      edi, 7
       7E35                 jle      SHORT G_M53623_IG04
       488D7DF0             lea      rdi, bword ptr [rbp-10H]
       488B07               mov      rax, bword ptr [rdi]
       8B7F08               mov      edi, dword ptr [rdi+8]
       83FF08               cmp      edi, 8
       0F8C3B010000         jl       G_M53623_IG17

G_M53623_IG03:
       488B00               mov      rax, qword ptr [rax]
       48BFFFFFFFFFFFFFFF00 mov      rdi, 0xFFFFFFFFFFFFFF
       4823C7               and      rax, rdi
       48BF0000000000000008 mov      rdi, 0x800000000000000
       480BC7               or       rax, rdi
       E913010000           jmp      G_M53623_IG16

G_M53623_IG04:
       83FF03               cmp      edi, 3
       0F8E91000000         jle      G_M53623_IG10
       488D45F0             lea      rax, bword ptr [rbp-10H]
       488B30               mov      rsi, bword ptr [rax]
       8B4008               mov      eax, dword ptr [rax+8]
       83F804               cmp      eax, 4
       0F8C08010000         jl       G_M53623_IG18

G_M53623_IG05:
       8B06                 mov      eax, dword ptr [rsi]
       8BF0                 mov      esi, eax
       488D45F0             lea      rax, bword ptr [rbp-10H]
       488B10               mov      rdx, bword ptr [rax]
       8B4808               mov      ecx, dword ptr [rax+8]
       448D47FB             lea      r8d, [rdi-5]
       4183F802             cmp      r8d, 2
       7757                 ja       SHORT G_M53623_IG09
       418BC0               mov      eax, r8d
       4C8D0503010000       lea      r8, [reloc @RWD00]
       458B0480             mov      r8d, dword ptr [r8+4*rax]
       4C8D0D7AFFFFFF       lea      r9, G_M53623_IG02
       4D03C1               add      r8, r9
       41FFE0               jmp      r8

G_M53623_IG06:
       83F906               cmp      ecx, 6
       0F86E2000000         jbe      G_M53623_IG20
       0FB64206             movzx    rax, byte  ptr [rdx+6]
       48C1E030             shl      rax, 48
       480BF0               or       rsi, rax

G_M53623_IG07:
       83F905               cmp      ecx, 5
       0F86CE000000         jbe      G_M53623_IG20
       0FB64205             movzx    rax, byte  ptr [rdx+5]
       48C1E028             shl      rax, 40
       480BF0               or       rsi, rax

G_M53623_IG08:
       83F904               cmp      ecx, 4
       0F86BA000000         jbe      G_M53623_IG20
       0FB64204             movzx    rax, byte  ptr [rdx+4]
       48C1E020             shl      rax, 32
       480BF0               or       rsi, rax

G_M53623_IG09:
       4863FF               movsxd   rdi, edi
       48C1E738             shl      rdi, 56
       480BF7               or       rsi, rdi
       488BC6               mov      rax, rsi
       EB79                 jmp      SHORT G_M53623_IG16

G_M53623_IG10:
       83FF01               cmp      edi, 1
       7E51                 jle      SHORT G_M53623_IG14
       488D45F0             lea      rax, bword ptr [rbp-10H]
       4C8B00               mov      r8, bword ptr [rax]
       8B4008               mov      eax, dword ptr [rax+8]
       83F802               cmp      eax, 2
       0F8C7D000000         jl       G_M53623_IG19

G_M53623_IG11:
       410FB700             movzx    rax, word  ptr [r8]
       83FF03               cmp      edi, 3
       7526                 jne      SHORT G_M53623_IG12
       837DF802             cmp      dword ptr [rbp-08H], 2
       7679                 jbe      SHORT G_M53623_IG20
       488B7DF0             mov      rdi, bword ptr [rbp-10H]
       0FB64F02             movzx    rcx, byte  ptr [rdi+2]
       8BD1                 mov      edx, ecx
       48C1E210             shl      rdx, 16
       480BC2               or       rax, rdx
       48BE0000000000000003 mov      rsi, 0x300000000000000
       480BC6               or       rax, rsi
       EB0D                 jmp      SHORT G_M53623_IG13

G_M53623_IG12:
       48BF0000000000000002 mov      rdi, 0x200000000000000
       480BC7               or       rax, rdi

G_M53623_IG13:
       EB23                 jmp      SHORT G_M53623_IG16

G_M53623_IG14:
       83FF01               cmp      edi, 1
       751C                 jne      SHORT G_M53623_IG15
       837DF800             cmp      dword ptr [rbp-08H], 0
       763F                 jbe      SHORT G_M53623_IG20
       488B45F0             mov      rax, bword ptr [rbp-10H]
       0FB600               movzx    rax, byte  ptr [rax]
       48BF0000000000000001 mov      rdi, 0x100000000000000
       480BC7               or       rax, rdi
       EB02                 jmp      SHORT G_M53623_IG16

G_M53623_IG15:
       33C0                 xor      rax, rax

G_M53623_IG16:
       488D6500             lea      rsp, [rbp]
       5D                   pop      rbp
       C3                   ret

G_M53623_IG17:
       BF28000000           mov      edi, 40
       E8DEAEFFFF           call     ThrowHelper:ThrowArgumentOutOfRangeException(int)
       CC                   int3

G_M53623_IG18:
       BF28000000           mov      edi, 40
       E8D3AEFFFF           call     ThrowHelper:ThrowArgumentOutOfRangeException(int)
       CC                   int3

G_M53623_IG19:
       BF28000000           mov      edi, 40
       E8C8AEFFFF           call     ThrowHelper:ThrowArgumentOutOfRangeException(int)
       CC                   int3

G_M53623_IG20:
       E8526C3679           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3

RWD00  dd   000000B4h ; case G_M53623_IG08
       dd   000000A0h ; case G_M53623_IG07
       dd   0000008Ch ; case G_M53623_IG06

; Total bytes of code 399, prolog size 10 for method Program:GetKey(struct):long
; ============================================================

{
key |= (ulong) propertyName[4] << (4 * BitsInByte)
| (ulong) propertyName[5] << (5 * BitsInByte)
| (ulong) propertyName[6] << (6 * BitsInByte)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can access the highest index first, so that the JIT emits only one bounds check instead of one for every indexed access to the span.

So

key |= (ulong)propertyName[6] << (6 * BitsInByte)
    | (ulong)propertyName[5] << (5 * BitsInByte)
      // ...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, there is something strange happening, as this optimization won't kick in here.

It seems that MemoryMarshal.Read<uint>(propertyName) causes the JIT not to elide the subsequent bound checks after the access to index 6.
(at least when I remove the call to MemoryMarshal the optimization kicks in as expected).

asm
G_M53623_IG04:
       83F803               cmp      eax, 3
       0F8EFD000000         jle      G_M53623_IG10
       488D7DF0             lea      rdi, bword ptr [rbp-10H]
       488B37               mov      rsi, bword ptr [rdi]
       8B7F08               mov      edi, dword ptr [rdi+8]
       83FF04               cmp      edi, 4
       0F8C74010000         jl       G_M53623_IG18

G_M53623_IG05:
       8B3E                 mov      edi, dword ptr [rsi]
       83F807               cmp      eax, 7
       755D                 jne      SHORT G_M53623_IG06
       837DF806             cmp      dword ptr [rbp-08H], 6 ; bound check
       0F8679010000         jbe      G_M53623_IG20
       488B45F0             mov      rax, bword ptr [rbp-10H]
       0FB64006             movzx    rax, byte  ptr [rax+6]
       48C1E030             shl      rax, 48
       480BF8               or       rdi, rax
       837DF805             cmp      dword ptr [rbp-08H], 5 ; bound check
       0F8660010000         jbe      G_M53623_IG20
       488B45F0             mov      rax, bword ptr [rbp-10H]
       0FB64005             movzx    rax, byte  ptr [rax+5]
       48C1E028             shl      rax, 40
       480BF8               or       rdi, rax
       837DF804             cmp      dword ptr [rbp-08H], 4 ; bound check
       0F8647010000         jbe      G_M53623_IG20
       488B45F0             mov      rax, bword ptr [rbp-10H]
       0FB64004             movzx    rax, byte  ptr [rax+4]
       48C1E020             shl      rax, 32
       480BF8               or       rdi, rax
       48B80000000000000007 mov      rax, 0x700000000000000
       480BF8               or       rdi, rax
       E981000000           jmp      G_M53623_IG09

This can be circumvented with a local like

// ...
ulong key = MemoryMarshal.Read<uint>(propertyName);
ReadOnlySpan<byte> tmp = propertyName;

if (length == 7)
{
	key |= (ulong)tmp[6] << (6 * BitsInByte)
		| (ulong)tmp[5] << (5 * BitsInByte)
		| (ulong)tmp[4] << (4 * BitsInByte)
		| (ulong)7 << (7 * BitsInByte);
}
// ...
asm
G_M53623_IG04:
       83F803               cmp      eax, 3
       0F8ED1000000         jle      G_M53623_IG10
       488D7DF0             lea      rdi, bword ptr [rbp-10H]
       488B37               mov      rsi, bword ptr [rdi]
       8B7F08               mov      edi, dword ptr [rdi+8]
       83FF04               cmp      edi, 4
       0F8C48010000         jl       G_M53623_IG18

G_M53623_IG05:
       8B3E                 mov      edi, dword ptr [rsi]
       488D75F0             lea      rsi, bword ptr [rbp-10H]
       488B16               mov      rdx, bword ptr [rsi]
       8B7608               mov      esi, dword ptr [rsi+8]
       83F807               cmp      eax, 7
       753D                 jne      SHORT G_M53623_IG06
       83FE06               cmp      esi, 6
       0F8644010000         jbe      G_M53623_IG20
       0FB64206             movzx    rax, byte  ptr [rdx+6]
       8BF0                 mov      esi, eax
       48C1E630             shl      rsi, 48
       480BFE               or       rdi, rsi
       0FB64205             movzx    rax, byte  ptr [rdx+5]
       48C1E028             shl      rax, 40
       480BF8               or       rdi, rax
       0FB65204             movzx    rdx, byte  ptr [rdx+4]
       8BC2                 mov      eax, edx
       48C1E020             shl      rax, 32
       480BF8               or       rdi, rax
       48B80000000000000007 mov      rax, 0x700000000000000
       480BF8               or       rdi, rax
       EB6B                 jmp      SHORT G_M53623_IG09

@AndyAyersMS is this known?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No...

cc @dotnet/jit-contrib

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the end this code just needs to copy from a variable length byte array (up to length 7) to a ulong. I could write a native method using memcpy, but that is overkill since we don't have any native code for System.Text.Json...

Copy link
Member Author

@steveharter steveharter Sep 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, there is something strange happening, as this optimization won't kick in here.

I tried all variants

  • Original (ascending order).
  • In descending order (same assembly as original)
  • In descending order + temp variable (didn't see a difference in assembly w.r.t. boundary check)
  • Using switch\case (overhead for setting up switch)

The original and descending order performed best (by inspecting assembly and somewhat verifying benchmark -- close to margin of error). However, I'll use descending order based on possible bounds check optimization.

Copy link
Member

@erozenfeld erozenfeld Sep 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

x64 Windows code has no bounds checks in this example. x64 Linux has bounds checks because we are not "promoting" propertyName struct. We are running into this issue:

https://github.com/dotnet/coreclr/blob/9479f67577bbb02ea611777b00308f42252fb2bc/src/jit/lclvars.cpp#L1914-L1926

Incoming multi-reg structs with more than one field are not getting promoted.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we are not promoting the struct we are not tracking the _length field properly and can't eliminate the bounds checks.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to make sure I understand: when I use tmp, the span is a local, so the bounds checks for [5] and [4] can be elided because the check for [6] has already been done (as expected)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct. The bounds check for [6] is not needed either once we are inside if (length==7) but we don't realize that length and tmp._length have the same values.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

| (ulong) 7 << (7 * BitsInByte);
}
else if (length == 6)
{
key |= (ulong)propertyName[4] << 32;
key |= (ulong) propertyName[4] << (4 * BitsInByte)
| (ulong) propertyName[5] << (5 * BitsInByte)
| (ulong) 6 << (7 * BitsInByte);
}
if (length > 5)
else if (length == 5)
steveharter marked this conversation as resolved.
Show resolved Hide resolved
{
key |= (ulong) propertyName[4] << (4 * BitsInByte)
| (ulong) 5 << (7 * BitsInByte);
}
else
{
key |= (ulong)propertyName[5] << 40;
key |= (ulong) 4 << (7 * BitsInByte);
}
}
else if (length > 1)
{
key = MemoryMarshal.Read<ushort>(propertyName);
if (length > 2)

if (length == 3)
{
key |= (ulong)propertyName[2] << 16;
key |= (ulong) propertyName[2] << (2 * BitsInByte)
| (ulong) 3 << (7 * BitsInByte);
}
else
{
key |= (ulong) 2 << (7 * BitsInByte);
}
}
else if (length == 1)
{
key = propertyName[0];
key = propertyName[0]
| (ulong) 1 << (7 * BitsInByte);
}
else
{
// An empty name is valid.
key = 0;
}

// Embed the propertyName length in the last two bytes.
key |= (ulong)propertyName.Length << 48;
// Verify key contains the embedded bytes as expected.
Debug.Assert(
length < 1 || propertyName[0] == (key & ((ulong)0xFF << 8 * 0)) >> 8 * 0 &&
steveharter marked this conversation as resolved.
Show resolved Hide resolved
length < 2 || propertyName[1] == (key & ((ulong)0xFF << 8 * 1)) >> 8 * 1 &&
length < 3 || propertyName[2] == (key & ((ulong)0xFF << 8 * 2)) >> 8 * 2 &&
length < 4 || propertyName[3] == (key & ((ulong)0xFF << 8 * 3)) >> 8 * 3 &&
length < 5 || propertyName[4] == (key & ((ulong)0xFF << 8 * 4)) >> 8 * 4 &&
length < 6 || propertyName[5] == (key & ((ulong)0xFF << 8 * 5)) >> 8 * 5 &&
length < 7 || propertyName[6] == (key & ((ulong)0xFF << 8 * 6)) >> 8 * 6);

return key;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
using System.Buffers;
using System.Collections;
using System.Diagnostics;
using System.Runtime.CompilerServices;

namespace System.Text.Json
{
public static partial class JsonSerializer
{
// AggressiveInlining is used even though this is a large method, because it is only called from one location and is on a hot path.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stephentoub, @jkotas - Given our previous discussions on the use of this attribute, what are your thoughts on the use of AggressiveInlining in cases like these, where there are very few (1-2) callers of somewhat large methods, and benchmarks show improvements for adding them?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For cases like this, it may be OK if you know what you are doing. There are a lot of cases where it can backfire - regressing the rest of the calling method, hitting JIT complexity thresholds and disabling optimizations, etc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method and similar ones exist for readability / understandability. Otherwise the method would not exist and the code would just be inline, so I suppose that is another option -- to just push the implementation down to the caller.

private static void HandlePropertyName(
JsonSerializerOptions options,
ref Utf8JsonReader reader,
Expand Down Expand Up @@ -52,7 +55,7 @@ private static void HandlePropertyName(
}

JsonPropertyInfo jsonPropertyInfo = state.Current.JsonClassInfo.GetProperty(propertyName, ref state.Current);
if (jsonPropertyInfo == null)
if (jsonPropertyInfo == JsonPropertyInfo.s_missingProperty)
steveharter marked this conversation as resolved.
Show resolved Hide resolved
{
JsonPropertyInfo dataExtProperty = state.Current.JsonClassInfo.DataExtensionProperty;
if (dataExtProperty == null)
Expand Down Expand Up @@ -94,9 +97,10 @@ private static void HandlePropertyName(
state.Current.JsonPropertyInfo.JsonPropertyName = propertyNameArray;
}
}

state.Current.PropertyIndex++;
steveharter marked this conversation as resolved.
Show resolved Hide resolved
}

// Increment the PropertyIndex so JsonClassInfo.GetProperty() starts with the next property.
state.Current.PropertyIndex++;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.CompilerServices;

namespace System.Text.Json
{
public static partial class JsonSerializer
{
private static bool HandleValue(JsonTokenType tokenType, JsonSerializerOptions options, ref Utf8JsonReader reader, ref ReadStack state)
// AggressiveInlining is used even though this is a large method, because it is only called from two locations and is on a hot path.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void HandleValue(JsonTokenType tokenType, JsonSerializerOptions options, ref Utf8JsonReader reader, ref ReadStack state)
{
if (state.Current.SkipProperty)
{
return false;
return;
}

JsonPropertyInfo jsonPropertyInfo = state.Current.JsonPropertyInfo;
Expand All @@ -23,10 +27,7 @@ private static bool HandleValue(JsonTokenType tokenType, JsonSerializerOptions o
jsonPropertyInfo = state.Current.JsonClassInfo.CreatePolymorphicProperty(jsonPropertyInfo, typeof(object), options);
}

bool lastCall = (!state.Current.IsProcessingEnumerableOrDictionary && state.Current.ReturnValue == null);

jsonPropertyInfo.Read(tokenType, ref state, ref reader);
return lastCall;
}
}
}
Loading