Skip to content

Commit 75b550d

Browse files
ificator, PranavSenthilnathan, and eiriktsarpalis
authored
Implement WriteStringValueSegment defined in Issue 67337 (#101356)
* Implement WriteStringValueSegment defined in Issue 67337 * Fix some review comments * Handle split surrogate pair * Commit old changes responding to comments * utf8 and utf16 * fix build error * Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs Co-authored-by: Eirik Tsarpalis <eirik.tsarpalis@gmail.com> * Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs Co-authored-by: Eirik Tsarpalis <eirik.tsarpalis@gmail.com> * PR comments * add encoding flags * add test for switching encoding * use CoreLib Rune for polyfill instead of having a separate copy * move warning disabling to top and fix up tests * add fuzzer * Fix some tests I missed * clean up and add another test to fuzzer * comment typo Co-authored-by: Eirik Tsarpalis <eirik.tsarpalis@gmail.com> * pr comments * throw when encodings are mixed * update fuzzer to assert that mixing encodings always throws * pr comments --------- Co-authored-by: Pranav Senthilnathan <pranas@microsoft.com> Co-authored-by: Pranav Senthilnathan <pranav.senthilnathan@live.com> Co-authored-by: Eirik Tsarpalis <eirik.tsarpalis@gmail.com>
1 parent e5878e9 commit 75b550d

29 files changed

+2800
-844
lines changed

eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml

+8
Original file line numberDiff line numberDiff line change
@@ -153,4 +153,12 @@ extends:
153153
onefuzzDropDirectory: $(fuzzerProject)/deployment/UTF8Fuzzer
154154
SYSTEM_ACCESSTOKEN: $(System.AccessToken)
155155
displayName: Send UTF8Fuzzer to OneFuzz
156+
157+
- task: onefuzz-task@0
158+
inputs:
159+
onefuzzOSes: 'Windows'
160+
env:
161+
onefuzzDropDirectory: $(fuzzerProject)/deployment/Utf8JsonWriterFuzzer
162+
SYSTEM_ACCESSTOKEN: $(System.AccessToken)
163+
displayName: Send Utf8JsonWriterFuzzer to OneFuzz
156164
# ONEFUZZ_TASK_WORKAROUND_END

src/libraries/Fuzzing/DotnetFuzzing/Assert.cs

+20
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,24 @@ static void Throw(ReadOnlySpan<T> expected, ReadOnlySpan<T> actual)
4545
throw new Exception($"Expected={expected[diffIndex]} Actual={actual[diffIndex]} at index {diffIndex}");
4646
}
4747
}
48+
49+
/// <summary>
/// Invokes <paramref name="action"/> with <paramref name="state"/> and verifies that it throws
/// an exception that can be caught as <typeparamref name="TException"/>.
/// </summary>
/// <typeparam name="TException">The exception type the action is expected to throw.</typeparam>
/// <typeparam name="TState">The type of the state handed to the action; may be a ref struct such as a span.</typeparam>
/// <param name="action">The delegate expected to throw.</param>
/// <param name="state">The value passed to <paramref name="action"/>. Passing it explicitly lets callers avoid capturing ref structs in the lambda.</param>
/// <returns>The caught <typeparamref name="TException"/> instance.</returns>
/// <exception cref="Exception">
/// The action threw an exception that is not a <typeparamref name="TException"/>, or it did not throw at all.
/// </exception>
public static TException Throws<TException, TState>(Action<TState> action, TState state)
    where TException : Exception
    where TState : allows ref struct
{
    try
    {
        action(state);
    }
    catch (TException ex)
    {
        return ex;
    }
    catch (Exception ex)
    {
        // Keep the unexpected exception as InnerException so its message and stack trace
        // are still available when diagnosing the failure.
        throw new Exception($"Expected exception of type {typeof(TException).Name} but got {ex.GetType().Name}", ex);
    }

    // Reached only when the action completed without throwing.
    throw new Exception($"Expected exception of type {typeof(TException).Name} but no exception was thrown");
}
4868
}

src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
@@ -29,6 +29,7 @@
2929
<Compile Include="Fuzzers\TextEncodingFuzzer.cs" />
3030
<Compile Include="Fuzzers\TypeNameFuzzer.cs" />
3131
<Compile Include="Fuzzers\UTF8Fuzzer.cs" />
32+
<Compile Include="Fuzzers\Utf8JsonWriterFuzzer.cs" />
3233
<Compile Include="IFuzzer.cs" />
3334
<Compile Include="PooledBoundedMemory.cs" />
3435
<Compile Include="Program.cs" />
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Buffers;
6+
using System.Collections;
7+
using System.Collections.Generic;
8+
using System.Diagnostics;
9+
using System.IO;
10+
using System.Runtime.CompilerServices;
11+
using System.Runtime.InteropServices;
12+
using System.Text;
13+
using System.Text.Encodings.Web;
14+
using System.Text.Json;
15+
using System.Text.Unicode;
16+
using SharpFuzz;
17+
18+
namespace DotnetFuzzing.Fuzzers;
19+
20+
/// <summary>
/// Fuzzes <c>Utf8JsonWriter.WriteStringValueSegment</c>: the same payload is written as one, two and
/// three segments and every result is compared with the output of running the configured
/// <see cref="JavaScriptEncoder"/> over the whole payload. It also checks that mixing UTF-8 and
/// UTF-16 segments within one string value throws <see cref="InvalidOperationException"/>.
/// </summary>
internal sealed class Utf8JsonWriterFuzzer : IFuzzer
{
    public string[] TargetAssemblies { get; } = ["System.Text.Json"];

    public string[] TargetCoreLibPrefixes => [];

    // One of the bytes in the input is used to set various test options.
    // Each bit in that byte represents a different option as indicated here.

    // Options for JsonWriterOptions
    private const byte IndentFlag = 1;
    private const byte EncoderFlag = 1 << 1;
    private const byte MaxDepthFlag = 1 << 2;
    private const byte NewLineFlag = 1 << 3;
    private const byte SkipValidationFlag = 1 << 4;

    // Options for choosing between UTF-8 and UTF-16 encoding
    private const byte EncodingFlag = 1 << 5;

    /// <summary>
    /// Runs one fuzz iteration.
    /// </summary>
    /// <param name="bytes">
    /// Raw fuzzer input. Bytes 0-3 and 4-7 are read as two ints used as split points, byte 8 holds
    /// the option flags, byte 9 is padding, and everything after that is the string payload
    /// (interpreted as UTF-8 bytes or as UTF-16 chars depending on <see cref="EncodingFlag"/>).
    /// Inputs that are too short or whose split points are out of range are ignored.
    /// </param>
    public void FuzzTarget(ReadOnlySpan<byte> bytes)
    {
        const int minLength = 10; // 2 ints, 1 byte, and 1 padding to align chars
        if (bytes.Length < minLength)
        {
            return;
        }

        // First 2 ints are used as indices to slice the input and the following byte is used for options
        ReadOnlySpan<int> ints = MemoryMarshal.Cast<byte, int>(bytes);
        int slice1 = ints[0];
        int slice2 = ints[1];
        byte optionsByte = bytes[8];
        bytes = bytes.Slice(minLength);
        ReadOnlySpan<char> chars = MemoryMarshal.Cast<byte, char>(bytes);

        // Validate that the indices are within bounds of the input
        bool utf8 = (optionsByte & EncodingFlag) == 0;
        int payloadLength = utf8 ? bytes.Length : chars.Length;
        if (!(0 <= slice1 && slice1 <= slice2 && slice2 <= payloadLength))
        {
            return;
        }

        // Set up the writer options from the individual bits of the options byte.
        // The encoder is chosen by EncoderFlag so that it varies independently of the
        // UTF-8/UTF-16 choice made by EncodingFlag.
        JsonWriterOptions options = new()
        {
            Encoder = (optionsByte & EncoderFlag) == 0 ? JavaScriptEncoder.Default : JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
            Indented = (optionsByte & IndentFlag) == 0,
            MaxDepth = (optionsByte & MaxDepthFlag) == 0 ? 1 : 0,
            NewLine = (optionsByte & NewLineFlag) == 0 ? "\n" : "\r\n",
            SkipValidation = (optionsByte & SkipValidationFlag) == 0,
        };

        // Compute the expected result by using the encoder directly and the input
        int maxExpandedSizeBytes = 6 * bytes.Length + 2;
        byte[] expectedBuffer = ArrayPool<byte>.Shared.Rent(maxExpandedSizeBytes);
        try
        {
            Span<byte> expected =
                expectedBuffer.AsSpan(0, utf8
                    ? EncodeToUtf8(bytes, expectedBuffer, options.Encoder)
                    : EncodeToUtf8(chars, expectedBuffer, options.Encoder));

            // Compute the actual result by using Utf8JsonWriter. Each iteration is a different slice of the input, but the result should be the same.
            byte[] actualBuffer = new byte[expected.Length];
            foreach (ReadOnlySpan<Range> ranges in new[]
            {
                new[] { 0.. },
                new[] { 0..slice1, slice1.. },
                new[] { 0..slice1, slice1..slice2, slice2.. },
            })
            {
                using MemoryStream stream = new(actualBuffer);
                using Utf8JsonWriter writer = new(stream, options);

                if (utf8)
                {
                    WriteStringValueSegments(writer, bytes, ranges);
                }
                else
                {
                    WriteStringValueSegments(writer, chars, ranges);
                }

                writer.Flush();

                // Compare the expected and actual results
                Assert.SequenceEqual(expected, actualBuffer);
                Assert.Equal(expected.Length, writer.BytesCommitted);
                Assert.Equal(0, writer.BytesPending);

                // Reset the shared output buffer before the next split is written into it.
                Array.Clear(actualBuffer);
            }

            // Additional test for mixing UTF-8 and UTF-16 encoding. The alignment math is easier in UTF-16 mode so just run it for that.
            if (!utf8)
            {
                Array.Clear(expectedBuffer);

                // UTF-16 segment first, then a UTF-8 segment: the second write must throw.
                {
                    ReadOnlySpan<char> firstSegment = chars[slice1..];
                    ReadOnlySpan<byte> secondSegment = bytes[0..(2 * slice1)];

                    // The encoded size is only used to size the backing stream.
                    expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, expectedBuffer, options.Encoder));

                    actualBuffer = new byte[expected.Length];
                    using MemoryStream stream = new(actualBuffer);
                    using Utf8JsonWriter writer = new(stream, options);

                    writer.WriteStringValueSegment(firstSegment, false);

                    Assert.Throws<InvalidOperationException, ReadOnlySpan<byte>>(state => writer.WriteStringValueSegment(state, true), secondSegment);
                }

                Array.Clear(expectedBuffer);

                // UTF-8 segment first, then a UTF-16 segment: the second write must throw.
                {
                    ReadOnlySpan<byte> firstSegment = bytes[0..(2 * slice1)];
                    ReadOnlySpan<char> secondSegment = chars[slice1..];

                    // The encoded size is only used to size the backing stream.
                    expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder));

                    actualBuffer = new byte[expected.Length];
                    using MemoryStream stream = new(actualBuffer);
                    using Utf8JsonWriter writer = new(stream, options);

                    writer.WriteStringValueSegment(firstSegment, false);
                    Assert.Throws<InvalidOperationException, ReadOnlySpan<char>>(state => writer.WriteStringValueSegment(state, true), secondSegment);
                }
            }
        }
        finally
        {
            // Return the rented buffer even when an assertion or encoder call throws.
            ArrayPool<byte>.Shared.Return(expectedBuffer);
        }
    }

    /// <summary>
    /// Writes each range of <paramref name="bytes"/> as a UTF-8 string value segment, marking only the last one as final.
    /// </summary>
    private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan<byte> bytes, ReadOnlySpan<Range> ranges)
    {
        for (int i = 0; i < ranges.Length; i++)
        {
            bool isFinalSegment = i == ranges.Length - 1;
            writer.WriteStringValueSegment(bytes[ranges[i]], isFinalSegment);
        }
    }

    /// <summary>
    /// Writes each range of <paramref name="chars"/> as a UTF-16 string value segment, marking only the last one as final.
    /// </summary>
    private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan<char> chars, ReadOnlySpan<Range> ranges)
    {
        for (int i = 0; i < ranges.Length; i++)
        {
            bool isFinalSegment = i == ranges.Length - 1;
            writer.WriteStringValueSegment(chars[ranges[i]], isFinalSegment);
        }
    }

    /// <summary>
    /// Writes <paramref name="bytes"/>, encoded by <paramref name="encoder"/> and wrapped in double quotes, to <paramref name="destBuffer"/>.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<byte> bytes, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        destBuffer[0] = (byte)'"';
        encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out int contentLength, isFinalBlock: true);

        int closingQuoteIndex = 1 + contentLength;
        destBuffer[closingQuoteIndex] = (byte)'"';
        return closingQuoteIndex + 1;
    }

    /// <summary>
    /// Writes <paramref name="chars"/>, encoded by <paramref name="encoder"/>, transcoded to UTF-8 and wrapped in double quotes, to <paramref name="destBuffer"/>.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        destBuffer[0] = (byte)'"';
        int contentLength = EncodeTranscode(chars, destBuffer[1..], encoder);

        int closingQuoteIndex = 1 + contentLength;
        destBuffer[closingQuoteIndex] = (byte)'"';
        return closingQuoteIndex + 1;
    }

    /// <summary>
    /// Writes a quoted string made of the encoded <paramref name="bytes"/> followed by the encoded and transcoded <paramref name="chars"/> to <paramref name="destBuffer"/>.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<byte> bytes, ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        destBuffer[0] = (byte)'"';
        int written = 1;

        encoder.EncodeUtf8(bytes, destBuffer[written..], out _, out int utf8Length, isFinalBlock: true);
        written += utf8Length;

        written += EncodeTranscode(chars, destBuffer[written..], encoder, isFinalBlock: true);

        destBuffer[written] = (byte)'"';
        return written + 1;
    }

    /// <summary>
    /// Writes a quoted string made of the encoded and transcoded <paramref name="chars"/> followed by the encoded <paramref name="bytes"/> to <paramref name="destBuffer"/>.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<char> chars, ReadOnlySpan<byte> bytes, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        destBuffer[0] = (byte)'"';
        int written = 1;

        written += EncodeTranscode(chars, destBuffer[written..], encoder, isFinalBlock: true);

        encoder.EncodeUtf8(bytes, destBuffer[written..], out _, out int utf8Length, isFinalBlock: true);
        written += utf8Length;

        destBuffer[written] = (byte)'"';
        return written + 1;
    }

    /// <summary>
    /// Encodes <paramref name="chars"/> with <paramref name="encoder"/> into a temporary UTF-16 buffer and transcodes the result to UTF-8 in <paramref name="destBuffer"/>.
    /// </summary>
    /// <param name="isFinalBlock">Forwarded to the UTF-16 to UTF-8 transcoding step; the encoder itself always treats the input as the final block.</param>
    /// <returns>The number of UTF-8 bytes written to <paramref name="destBuffer"/>.</returns>
    private static int EncodeTranscode(ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder, bool isFinalBlock = true)
    {
        char[] utf16buffer = ArrayPool<char>.Shared.Rent(6 * chars.Length);
        try
        {
            encoder.Encode(chars, utf16buffer, out _, out int charsWritten, isFinalBlock: true);

            // isFinalBlock must be passed by name: the fifth positional parameter of
            // Utf8.FromUtf16 is replaceInvalidSequences, not isFinalBlock.
            Utf8.FromUtf16(utf16buffer.AsSpan(0, charsWritten), destBuffer, out _, out int bytesWritten, isFinalBlock: isFinalBlock);
            return bytesWritten;
        }
        finally
        {
            ArrayPool<char>.Shared.Return(utf16buffer);
        }
    }
}

src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs

+22-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
using System.Runtime.CompilerServices;
99
using System.Text.Unicode;
1010

11+
#if !SYSTEM_PRIVATE_CORELIB
12+
#pragma warning disable CS3019 // CLS compliance checking will not be performed because it is not visible from outside this assembly
13+
#endif
14+
1115
namespace System.Text
1216
{
1317
/// <summary>
@@ -18,7 +22,12 @@ namespace System.Text
1822
/// assuming that the underlying <see cref="Rune"/> instance is well-formed.
1923
/// </remarks>
2024
[DebuggerDisplay("{DebuggerDisplay,nq}")]
21-
public readonly struct Rune : IComparable, IComparable<Rune>, IEquatable<Rune>
25+
#if SYSTEM_PRIVATE_CORELIB
26+
public
27+
#else
28+
internal
29+
#endif
30+
readonly struct Rune : IComparable, IComparable<Rune>, IEquatable<Rune>
2231
#if SYSTEM_PRIVATE_CORELIB
2332
#pragma warning disable SA1001 // Commas should be spaced correctly
2433
, ISpanFormattable
@@ -141,7 +150,14 @@ private Rune(uint scalarValue, bool _)
141150
public static explicit operator Rune(int value) => new Rune(value);
142151

143152
// Displayed as "'<char>' (U+XXXX)"; e.g., "'e' (U+0065)"
144-
private string DebuggerDisplay => string.Create(CultureInfo.InvariantCulture, $"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
153+
private string DebuggerDisplay =>
154+
#if SYSTEM_PRIVATE_CORELIB
155+
string.Create(
156+
CultureInfo.InvariantCulture,
157+
#else
158+
FormattableString.Invariant(
159+
#endif
160+
$"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
145161

146162
/// <summary>
147163
/// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
@@ -242,7 +258,6 @@ private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool to
242258
#else
243259
private static Rune ChangeCaseCultureAware(Rune rune, CultureInfo culture, bool toUpper)
244260
{
245-
Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
246261
Debug.Assert(culture != null, "This should've been checked by the caller.");
247262

248263
Span<char> original = stackalloc char[MaxUtf16CharsPerRune]; // worst case scenario = 2 code units (for a surrogate pair)
@@ -1375,12 +1390,12 @@ public static Rune ToLower(Rune value, CultureInfo culture)
13751390
// ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
13761391
// we'll just jump straight to the globalization tables if they're available.
13771392

1393+
#if SYSTEM_PRIVATE_CORELIB
13781394
if (GlobalizationMode.Invariant)
13791395
{
13801396
return ToLowerInvariant(value);
13811397
}
13821398

1383-
#if SYSTEM_PRIVATE_CORELIB
13841399
return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: false);
13851400
#else
13861401
return ChangeCaseCultureAware(value, culture, toUpper: false);
@@ -1399,14 +1414,14 @@ public static Rune ToLowerInvariant(Rune value)
13991414
return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
14001415
}
14011416

1417+
#if SYSTEM_PRIVATE_CORELIB
14021418
if (GlobalizationMode.Invariant)
14031419
{
14041420
return UnsafeCreate(CharUnicodeInfo.ToLower(value._value));
14051421
}
14061422

14071423
// Non-ASCII data requires going through the case folding tables.
14081424

1409-
#if SYSTEM_PRIVATE_CORELIB
14101425
return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
14111426
#else
14121427
return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: false);
@@ -1424,12 +1439,12 @@ public static Rune ToUpper(Rune value, CultureInfo culture)
14241439
// ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
14251440
// we'll just jump straight to the globalization tables if they're available.
14261441

1442+
#if SYSTEM_PRIVATE_CORELIB
14271443
if (GlobalizationMode.Invariant)
14281444
{
14291445
return ToUpperInvariant(value);
14301446
}
14311447

1432-
#if SYSTEM_PRIVATE_CORELIB
14331448
return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: true);
14341449
#else
14351450
return ChangeCaseCultureAware(value, culture, toUpper: true);
@@ -1448,14 +1463,14 @@ public static Rune ToUpperInvariant(Rune value)
14481463
return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
14491464
}
14501465

1466+
#if SYSTEM_PRIVATE_CORELIB
14511467
if (GlobalizationMode.Invariant)
14521468
{
14531469
return UnsafeCreate(CharUnicodeInfo.ToUpper(value._value));
14541470
}
14551471

14561472
// Non-ASCII data requires going through the case folding tables.
14571473

1458-
#if SYSTEM_PRIVATE_CORELIB
14591474
return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
14601475
#else
14611476
return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: true);

0 commit comments

Comments
 (0)