@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
2038
2038
2039
2039
if ( BitConverter . IsLittleEndian && Vector128 . IsHardwareAccelerated && elementCount >= ( uint ) Vector128 < byte > . Count )
2040
2040
{
2041
- ushort * pCurrentWriteAddress = ( ushort * ) pUtf16Buffer;
2042
-
2043
- if ( Vector512 . IsHardwareAccelerated && elementCount >= ( uint ) Vector512 < byte > . Count )
2041
+ if ( Vector512 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector512 < byte > . Count )
2044
2042
{
2045
- // Calculating the destination address outside the loop results in significant
2046
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2047
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2048
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector512 < byte > . Count ;
2049
-
2050
- do
2051
- {
2052
- Vector512 < byte > asciiVector = Vector512 . Load ( pAsciiBuffer + currentOffset ) ;
2053
-
2054
- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2055
- {
2056
- break ;
2057
- }
2058
-
2059
- ( Vector512 < ushort > utf16LowVector , Vector512 < ushort > utf16HighVector ) = Vector512 . Widen ( asciiVector ) ;
2060
- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2061
- utf16HighVector . Store ( pCurrentWriteAddress + Vector512 < ushort > . Count ) ;
2062
-
2063
- currentOffset += ( nuint ) Vector512 < byte > . Count ;
2064
- pCurrentWriteAddress += ( nuint ) Vector512 < byte > . Count ;
2065
- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2043
+ WidenAsciiToUtf1_Vector < Vector512 < byte > , Vector512 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2066
2044
}
2067
- else if ( Vector256 . IsHardwareAccelerated && elementCount >= ( uint ) Vector256 < byte > . Count )
2045
+ if ( Vector256 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector256 < byte > . Count )
2068
2046
{
2069
- // Calculating the destination address outside the loop results in significant
2070
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2071
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2072
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector256 < byte > . Count ;
2073
-
2074
- do
2075
- {
2076
- Vector256 < byte > asciiVector = Vector256 . Load ( pAsciiBuffer + currentOffset ) ;
2077
-
2078
- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2079
- {
2080
- break ;
2081
- }
2082
-
2083
- ( Vector256 < ushort > utf16LowVector , Vector256 < ushort > utf16HighVector ) = Vector256 . Widen ( asciiVector ) ;
2084
- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2085
- utf16HighVector . Store ( pCurrentWriteAddress + Vector256 < ushort > . Count ) ;
2086
-
2087
- currentOffset += ( nuint ) Vector256 < byte > . Count ;
2088
- pCurrentWriteAddress += ( nuint ) Vector256 < byte > . Count ;
2089
- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2047
+ WidenAsciiToUtf1_Vector < Vector256 < byte > , Vector256 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2090
2048
}
2091
- else
2049
+ if ( Vector128 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector128 < byte > . Count )
2092
2050
{
2093
- // Calculating the destination address outside the loop results in significant
2094
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2095
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2096
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector128 < byte > . Count ;
2097
-
2098
- do
2099
- {
2100
- Vector128 < byte > asciiVector = Vector128. Load( pAsciiBuffer + currentOffset) ;
2101
-
2102
- if ( VectorContainsNonAsciiChar( asciiVector) )
2103
- {
2104
- break ;
2105
- }
2106
-
2107
- ( Vector128< ushort > utf16LowVector , Vector128 < ushort > utf16HighVector ) = Vector128 . Widen ( asciiVector ) ;
2108
- utf16LowVector. Store( pCurrentWriteAddress) ;
2109
- utf16HighVector. Store( pCurrentWriteAddress + Vector128< ushort > . Count) ;
2110
-
2111
- currentOffset += ( nuint ) Vector128< byte > . Count;
2112
- pCurrentWriteAddress += ( nuint ) Vector128< byte > . Count;
2113
- } while ( currentOffset <= finalOffsetWhereCanRunLoop) ;
2051
+ WidenAsciiToUtf1_Vector < Vector128 < byte > , Vector128 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2114
2052
}
2115
2053
}
2116
2054
@@ -2212,6 +2150,84 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
2212
2150
goto Finish ;
2213
2151
}
2214
2152
2153
/// <summary>
/// Widens ASCII bytes from <paramref name="pAsciiBuffer"/> into UTF-16 code units in
/// <paramref name="pUtf16Buffer"/>, one <typeparamref name="TVectorByte"/> at a time, beginning
/// at <paramref name="currentOffset"/> and stopping at the first vector flagged by HasMatch.
/// </summary>
/// <typeparam name="TVectorByte">Vector type used to load the source bytes.</typeparam>
/// <typeparam name="TVectorUShort">Vector type used to store the widened elements.</typeparam>
/// <param name="pAsciiBuffer">Start of the source byte buffer.</param>
/// <param name="pUtf16Buffer">Start of the destination buffer.</param>
/// <param name="currentOffset">
/// On entry, the element index at which to begin. On exit, the element index at which processing
/// stopped; it is left untouched when the very first vector is flagged by HasMatch.
/// </param>
/// <param name="elementCount">Total number of elements in the source buffer.</param>
/// <remarks>
/// The caller must ensure that at least one full <typeparamref name="TVectorByte"/> is available
/// at <paramref name="currentOffset"/>; the first load and the subtraction below rely on it.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUShort>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUShort : unmanaged, ISimdVector<TVectorUShort, ushort>
{
    // The destination must track the source position. This method can be entered with a
    // non-zero currentOffset (for example after a wider vector path already consumed a prefix),
    // so the write address is derived from currentOffset rather than from the buffer start.
    ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer + currentOffset;

    // Calculating the destination address outside the loop results in significant
    // perf wins vs. relying on the JIT to fold memory addressing logic into the
    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
    nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;

    TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
    if (HasMatch<TVectorByte>(asciiVector))
    {
        // Nothing was written; leave currentOffset as it was.
        return;
    }

    // The first vector is written unaligned so that the write address can be re-aligned afterwards.
    (TVectorUShort utf16LowVector, TVectorUShort utf16HighVector) = Widen<TVectorByte, TVectorUShort>(asciiVector);
    utf16LowVector.Store(pCurrentWriteAddress);
    utf16HighVector.Store(pCurrentWriteAddress + TVectorUShort.Count);
    pCurrentWriteAddress += (nuint)(TVectorUShort.Count * 2);

    if (((int)pCurrentWriteAddress & 1) == 0)
    {
        // Round the write address down to a vector boundary. The elements between the rounded
        // address and the end of the first write are simply widened again by the loop below.
        pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUShort.Alignment - 1));

        // The distance is measured from the start of the destination buffer, so it is the new
        // absolute element offset and must be assigned, not added to the entry offset.
        nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer;
        currentOffset = numBytesWritten / 2;
    }
    else
    {
        // If the destination isn't char aligned, we won't be able to align it to a Vector
        currentOffset += (nuint)TVectorByte.Count;
    }

    while (currentOffset <= finalOffsetWhereCanRunLoop)
    {
        asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
        if (HasMatch<TVectorByte>(asciiVector))
        {
            break;
        }

        (utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUShort>(asciiVector);

        // Plain stores are used on purpose: in the not-char-aligned case above the write address
        // can never be vector aligned, and an aligned store to such an address may fault.
        utf16LowVector.Store(pCurrentWriteAddress);
        utf16HighVector.Store(pCurrentWriteAddress + TVectorUShort.Count);

        currentOffset += (nuint)TVectorByte.Count;
        pCurrentWriteAddress += (nuint)(TVectorUShort.Count * 2);
    }
}
2198
+
2199
/// <summary>
/// Tests <paramref name="vector"/> with the check used by the vectorized widening loop, which
/// stops widening as soon as this method returns <see langword="true"/>.
/// </summary>
/// <typeparam name="TVectorByte">Byte vector type being tested.</typeparam>
/// <param name="vector">The vector of source bytes to inspect.</param>
/// <returns>
/// The result of VectorContainsNonAsciiChar when AdvSimd is supported and
/// <typeparamref name="TVectorByte"/> is <see cref="Vector128{T}"/> of byte; otherwise the result
/// of <c>TVectorByte.AnyMatches</c>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
{
    // Generic path: everything except 128-bit vectors on AdvSimd hardware.
    if (!AdvSimd.IsSupported || typeof(TVectorByte) != typeof(Vector128<byte>))
    {
        return TVectorByte.AnyMatches(vector);
    }

    // 128-bit vectors on AdvSimd hardware go through the dedicated helper.
    Vector128<byte> vector128 = (Vector128<byte>)(object)vector;
    return VectorContainsNonAsciiChar(vector128);
}
2209
+
2210
+
2211
/// <summary>
/// Widens a byte vector into two ushort vectors by forwarding to the <c>Widen</c> method of the
/// concrete vector width that <typeparamref name="TVectorByte"/> represents.
/// </summary>
/// <typeparam name="TVectorByte">Source byte vector type.</typeparam>
/// <typeparam name="TVectorUShort">Result ushort vector type; expected to have the same width as the source.</typeparam>
/// <param name="vector">The byte vector to widen.</param>
/// <returns>The lower and upper widened halves, in that order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe (TVectorUShort Lower, TVectorUShort Upper) Widen<TVectorByte, TVectorUShort>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUShort : unmanaged, ISimdVector<TVectorUShort, ushort>
{
    if (typeof(TVectorByte) == typeof(Vector512<byte>))
    {
        (Vector512<ushort> low, Vector512<ushort> high) = Vector512.Widen((Vector512<byte>)(object)vector);
        return ((TVectorUShort)(object)low, (TVectorUShort)(object)high);
    }

    if (typeof(TVectorByte) == typeof(Vector256<byte>))
    {
        (Vector256<ushort> low, Vector256<ushort> high) = Vector256.Widen((Vector256<byte>)(object)vector);
        return ((TVectorUShort)(object)low, (TVectorUShort)(object)high);
    }

    // Every other TVectorByte is handled as a 128-bit vector.
    {
        (Vector128<ushort> low, Vector128<ushort> high) = Vector128.Widen((Vector128<byte>)(object)vector);
        return ((TVectorUShort)(object)low, (TVectorUShort)(object)high);
    }
}
2229
+
2230
+
2215
2231
/// <summary>
2216
2232
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
2217
2233
/// writes them to the output buffer with machine endianness.
0 commit comments