From 9122ef8aa91a56b7461a8b55a75ccb89c8c33535 Mon Sep 17 00:00:00 2001 From: ificator Date: Sun, 21 Apr 2024 13:28:31 -0700 Subject: [PATCH 01/21] Implement WriteStringValueSegment defined in Issue 67337 --- .../System.Text.Json/ref/System.Text.Json.cs | 5 + .../src/System/Text/Json/JsonTokenType.cs | 9 + .../Utf8JsonWriter.WriteProperties.Helpers.cs | 4 +- .../Utf8JsonWriter.WriteValues.String.cs | 259 ++++++++++++++---- .../System/Text/Json/Writer/Utf8JsonWriter.cs | 2 +- .../Utf8JsonWriterTests.cs | 47 ++++ 6 files changed, 265 insertions(+), 61 deletions(-) diff --git a/src/libraries/System.Text.Json/ref/System.Text.Json.cs b/src/libraries/System.Text.Json/ref/System.Text.Json.cs index 0ff5bdc90a66ee..5e6978b4fe1ce8 100644 --- a/src/libraries/System.Text.Json/ref/System.Text.Json.cs +++ b/src/libraries/System.Text.Json/ref/System.Text.Json.cs @@ -426,6 +426,7 @@ public enum JsonTokenType : byte True = (byte)9, False = (byte)10, Null = (byte)11, + StringSegment = (byte)12, } public enum JsonValueKind : byte { @@ -652,6 +653,10 @@ public void WriteStringValue(System.ReadOnlySpan utf8Value) { } public void WriteStringValue(System.ReadOnlySpan value) { } public void WriteStringValue(string? value) { } public void WriteStringValue(System.Text.Json.JsonEncodedText value) { } + public void WriteStringValueSegment(System.ReadOnlySpan utf8Value, bool isFinalSegment) { } + public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) { } + public void WriteStringValueSegment(string? 
value, bool isFinalSegment) { } + public void WriteStringValueSegment(System.Text.Json.JsonEncodedText value, bool isFinalSegment) { } } } namespace System.Text.Json.Nodes diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs b/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs index 049da2220a22f8..e86ebdef0c31f1 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs @@ -77,5 +77,14 @@ public enum JsonTokenType : byte /// Indicates that the token type is the JSON literal null. /// Null, + + /// + /// Indicates that the token type is a segment of a JSON string. + /// + /// + /// This does not represent a token defined in the JSON specification, but rather provides a means to track + /// that a string is being written in segments. This value will never be seen during deserialization. + /// + StringSegment, } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs index 95a0a67451641c..11b0b3885ac34f 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs @@ -36,7 +36,7 @@ private void ValidateWritingProperty() { if (!_options.SkipValidation) { - if (!_inObject || _tokenType == JsonTokenType.PropertyName) + if (!_inObject || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) { Debug.Assert(_tokenType != JsonTokenType.StartObject); ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWritePropertyWithinArray, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); @@ -49,7 +49,7 @@ private void ValidateWritingProperty(byte 
token) { if (!_options.SkipValidation) { - if (!_inObject || _tokenType == JsonTokenType.PropertyName) + if (!_inObject || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) { Debug.Assert(_tokenType != JsonTokenType.StartObject); ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWritePropertyWithinArray, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs index 5ff4064d2b59bd..3c6040e51d329b 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs @@ -20,12 +20,36 @@ public void WriteStringValue(JsonEncodedText value) ReadOnlySpan utf8Value = value.EncodedUtf8Bytes; Debug.Assert(utf8Value.Length <= JsonConstants.MaxUnescapedTokenSize); - WriteStringByOptions(utf8Value); + WriteStringByOptions(utf8Value, JsonTokenType.String); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } + /// + /// Writes the pre-encoded text value segment as a partial JSON string. + /// + /// The JSON-encoded value to write. + /// Indicates that this is the final segment of the string. + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + public void WriteStringValueSegment(JsonEncodedText value, bool isFinalSegment) + { + ReadOnlySpan utf8Value = value.EncodedUtf8Bytes; + Debug.Assert(utf8Value.Length <= JsonConstants.MaxUnescapedTokenSize); + + JsonTokenType nextTokenType = isFinalSegment ? 
JsonTokenType.String : JsonTokenType.StringSegment; + WriteStringByOptions(utf8Value, nextTokenType); + + if (isFinalSegment) + { + SetFlagToAddListSeparatorBeforeNextItem(); + } + + _tokenType = nextTokenType; + } + /// /// Writes the string text value (as a JSON string) as an element of a JSON array. /// @@ -56,6 +80,38 @@ public void WriteStringValue(string? value) } } + + /// + /// Writes the string text value segment as a partial JSON string. + /// + /// The value to write. + /// Indicates that this is the final segment of the string. + /// + /// Thrown when the specified value is too large. + /// + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + /// + /// + /// The value is escaped before writing. + /// + /// If is the JSON null value is written, + /// as if was called. + /// + /// + public void WriteStringValueSegment(string? value, bool isFinalSegment) + { + if (value == null) + { + WriteNullValue(); + } + else + { + WriteStringValueSegment(value.AsSpan(), isFinalSegment); + } + } + /// /// Writes the text value (as a JSON string) as an element of a JSON array. /// @@ -73,13 +129,43 @@ public void WriteStringValue(ReadOnlySpan value) { JsonWriterHelper.ValidateValue(value); - WriteStringEscape(value); + WriteStringEscape(value, JsonTokenType.String); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } - private void WriteStringEscape(ReadOnlySpan value) + /// + /// Writes the text value segment as a partial JSON string. + /// + /// The value to write. + /// Indicates that this is the final segment of the string. + /// + /// Thrown when the specified value is too large. + /// + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + /// + /// The value is escaped before writing. 
+ /// + public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) + { + JsonWriterHelper.ValidateValue(value); + + JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : JsonTokenType.StringSegment; + + WriteStringEscape(value, nextTokenType); + + if (isFinalSegment) + { + SetFlagToAddListSeparatorBeforeNextItem(); + } + + _tokenType = nextTokenType; + } + + private void WriteStringEscape(ReadOnlySpan value, JsonTokenType stringTokenType) { int valueIdx = JsonWriterHelper.NeedsEscaping(value, _options.Encoder); @@ -87,33 +173,33 @@ private void WriteStringEscape(ReadOnlySpan value) if (valueIdx != -1) { - WriteStringEscapeValue(value, valueIdx); + WriteStringEscapeValue(value, valueIdx, stringTokenType); } else { - WriteStringByOptions(value); + WriteStringByOptions(value, stringTokenType); } } - private void WriteStringByOptions(ReadOnlySpan value) + private void WriteStringByOptions(ReadOnlySpan value, JsonTokenType stringTokenType) { - if (!_options.SkipValidation) + if (!_options.SkipValidation && _tokenType != JsonTokenType.StringSegment) { ValidateWritingValue(); } if (_options.Indented) { - WriteStringIndented(value); + WriteStringIndented(value, stringTokenType); } else { - WriteStringMinimized(value); + WriteStringMinimized(value, stringTokenType); } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringMinimized(ReadOnlySpan escapedValue) + private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) { Debug.Assert(escapedValue.Length < (int.MaxValue / JsonConstants.MaxExpansionFactorWhileTranscoding) - 3); @@ -128,19 +214,26 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue) Span output = _memory.Span; - if (_currentDepth < 0) + if (_tokenType != JsonTokenType.StringSegment) { - output[BytesPending++] = JsonConstants.ListSeparator; + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } + + output[BytesPending++] = 
JsonConstants.Quote; } - output[BytesPending++] = JsonConstants.Quote; TranscodeAndWrite(escapedValue, output); - output[BytesPending++] = JsonConstants.Quote; + if (stringTokenType != JsonTokenType.StringSegment) + { + output[BytesPending++] = JsonConstants.Quote; + } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringIndented(ReadOnlySpan escapedValue) + private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) { int indent = Indentation; Debug.Assert(indent <= _indentLength * _options.MaxDepth); @@ -158,29 +251,38 @@ private void WriteStringIndented(ReadOnlySpan escapedValue) Span output = _memory.Span; - if (_currentDepth < 0) + if (_tokenType != JsonTokenType.StringSegment) { - output[BytesPending++] = JsonConstants.ListSeparator; - } + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } - if (_tokenType != JsonTokenType.PropertyName) - { - if (_tokenType != JsonTokenType.None) + if (_tokenType != JsonTokenType.PropertyName && _tokenType != JsonTokenType.StringSegment) { - WriteNewLine(output); + if (_tokenType != JsonTokenType.None) + { + WriteNewLine(output); + } + WriteIndentation(output.Slice(BytesPending), indent); + BytesPending += indent; } - WriteIndentation(output.Slice(BytesPending), indent); - BytesPending += indent; - } - output[BytesPending++] = JsonConstants.Quote; + output[BytesPending++] = JsonConstants.Quote; + } TranscodeAndWrite(escapedValue, output); - output[BytesPending++] = JsonConstants.Quote; + if (stringTokenType != JsonTokenType.StringSegment) + { + output[BytesPending++] = JsonConstants.Quote; + } } - private void WriteStringEscapeValue(ReadOnlySpan value, int firstEscapeIndexVal) + private void WriteStringEscapeValue( + ReadOnlySpan value, + int firstEscapeIndexVal, + JsonTokenType stringTokenType) { Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= value.Length); Debug.Assert(firstEscapeIndexVal >= 0 && 
firstEscapeIndexVal < value.Length); @@ -195,7 +297,7 @@ private void WriteStringEscapeValue(ReadOnlySpan value, int firstEscapeInd JsonWriterHelper.EscapeString(value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int written); - WriteStringByOptions(escapedValue.Slice(0, written)); + WriteStringByOptions(escapedValue.Slice(0, written), stringTokenType); if (valueArray != null) { @@ -220,13 +322,42 @@ public void WriteStringValue(ReadOnlySpan utf8Value) { JsonWriterHelper.ValidateValue(utf8Value); - WriteStringEscape(utf8Value); + WriteStringEscape(utf8Value, JsonTokenType.String); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } - private void WriteStringEscape(ReadOnlySpan utf8Value) + /// + /// Writes the UTF-8 text value segment as a partial JSON string. + /// + /// The UTF-8 encoded value to be written as a JSON string element of a JSON array. + /// Indicates that this is the final segment of the string. + /// + /// Thrown when the specified value is too large. + /// + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + /// + /// The value is escaped before writing. + /// + public void WriteStringValueSegment(ReadOnlySpan utf8Value, bool isFinalSegment) + { + JsonWriterHelper.ValidateValue(utf8Value); + + JsonTokenType nextTokenType = isFinalSegment ? 
JsonTokenType.String : JsonTokenType.StringSegment; + WriteStringEscape(utf8Value, nextTokenType); + + if (isFinalSegment) + { + SetFlagToAddListSeparatorBeforeNextItem(); + } + + _tokenType = nextTokenType; + } + + private void WriteStringEscape(ReadOnlySpan utf8Value, JsonTokenType stringTokenType) { int valueIdx = JsonWriterHelper.NeedsEscaping(utf8Value, _options.Encoder); @@ -234,33 +365,33 @@ private void WriteStringEscape(ReadOnlySpan utf8Value) if (valueIdx != -1) { - WriteStringEscapeValue(utf8Value, valueIdx); + WriteStringEscapeValue(utf8Value, valueIdx, stringTokenType); } else { - WriteStringByOptions(utf8Value); + WriteStringByOptions(utf8Value, stringTokenType); } } - private void WriteStringByOptions(ReadOnlySpan utf8Value) + private void WriteStringByOptions(ReadOnlySpan utf8Value, JsonTokenType stringTokenType) { - if (!_options.SkipValidation) + if (!_options.SkipValidation && _tokenType != JsonTokenType.StringSegment) { ValidateWritingValue(); } if (_options.Indented) { - WriteStringIndented(utf8Value); + WriteStringIndented(utf8Value, stringTokenType); } else { - WriteStringMinimized(utf8Value); + WriteStringMinimized(utf8Value, stringTokenType); } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringMinimized(ReadOnlySpan escapedValue) + private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) { Debug.Assert(escapedValue.Length < int.MaxValue - 3); @@ -274,20 +405,26 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue) Span output = _memory.Span; - if (_currentDepth < 0) + if (_tokenType != JsonTokenType.StringSegment) { - output[BytesPending++] = JsonConstants.ListSeparator; + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } + output[BytesPending++] = JsonConstants.Quote; } - output[BytesPending++] = JsonConstants.Quote; escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - output[BytesPending++] 
= JsonConstants.Quote; + if (stringTokenType != JsonTokenType.StringSegment) + { + output[BytesPending++] = JsonConstants.Quote; + } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringIndented(ReadOnlySpan escapedValue) + private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) { int indent = Indentation; Debug.Assert(indent <= _indentLength * _options.MaxDepth); @@ -304,30 +441,36 @@ private void WriteStringIndented(ReadOnlySpan escapedValue) Span output = _memory.Span; - if (_currentDepth < 0) + if (_tokenType != JsonTokenType.StringSegment) { - output[BytesPending++] = JsonConstants.ListSeparator; - } + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } - if (_tokenType != JsonTokenType.PropertyName) - { - if (_tokenType != JsonTokenType.None) + if (_tokenType != JsonTokenType.PropertyName) { - WriteNewLine(output); + if (_tokenType != JsonTokenType.None) + { + WriteNewLine(output); + } + WriteIndentation(output.Slice(BytesPending), indent); + BytesPending += indent; } - WriteIndentation(output.Slice(BytesPending), indent); - BytesPending += indent; - } - output[BytesPending++] = JsonConstants.Quote; + output[BytesPending++] = JsonConstants.Quote; + } escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - output[BytesPending++] = JsonConstants.Quote; + if (stringTokenType != JsonTokenType.StringSegment) + { + output[BytesPending++] = JsonConstants.Quote; + } } - private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int firstEscapeIndexVal) + private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int firstEscapeIndexVal, JsonTokenType stringTokenType) { Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= utf8Value.Length); Debug.Assert(firstEscapeIndexVal >= 0 && firstEscapeIndexVal < utf8Value.Length); @@ -342,7 +485,7 @@ private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int 
firstEscap JsonWriterHelper.EscapeString(utf8Value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int written); - WriteStringByOptions(escapedValue.Slice(0, written)); + WriteStringByOptions(escapedValue.Slice(0, written), stringTokenType); if (valueArray != null) { @@ -358,7 +501,7 @@ internal void WriteNumberValueAsStringUnescaped(ReadOnlySpan utf8Value) { // The value has been validated prior to calling this method. - WriteStringByOptions(utf8Value); + WriteStringByOptions(utf8Value, JsonTokenType.String); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index fcf3ab2d4c2547..483d0c6cd263b0 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -959,7 +959,7 @@ private void WriteEndSlow(byte token) private void ValidateEnd(byte token) { - if (_bitStack.CurrentDepth <= 0 || _tokenType == JsonTokenType.PropertyName) + if (_bitStack.CurrentDepth <= 0 || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) ThrowHelper.ThrowInvalidOperationException(ExceptionResource.MismatchedObjectArray, currentDepth: default, maxDepth: _options.MaxDepth, token, _tokenType); if (token == JsonConstants.CloseBracket) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index c776982b56d925..5fe4742a28ce1b 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -6800,6 +6800,53 @@ public static void WriteStringValue_IndentationOptions() Assert.Equal(expectedOutput, 
output); } + [Fact] + public static void WriteStringValueSegment() + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("test"); + jsonUtf8.WriteStringValueSegment("Hello ".AsSpan(), isFinalSegment: false); + jsonUtf8.WriteStringValueSegment("World!".AsSpan(), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_NotFinalized() + { + static ArrayBufferWriter executeScenario(Action implementation) + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("test"); + jsonUtf8.WriteStringValueSegment("Hello ".AsSpan(), isFinalSegment: false); + implementation(jsonUtf8); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + return output; + } + + // The following are expected to fail. + Assert.Throws(static () => executeScenario(w => w.WriteEndArray())); + Assert.Throws(static () => executeScenario(w => w.WriteCommentValue("comment"))); + Assert.Throws(static () => executeScenario(w => w.WriteEndArray())); + Assert.Throws(static () => executeScenario(w => w.WriteEndObject())); + Assert.Throws(static () => executeScenario(w => w.WriteNullValue())); + Assert.Throws(static () => executeScenario(w => w.WriteNumberValue(123))); + Assert.Throws(static () => executeScenario(w => w.WritePropertyName("test"))); + Assert.Throws(static () => executeScenario(w => w.WriteStartArray())); + Assert.Throws(static () => executeScenario(w => w.WriteStartObject())); + + // WriteStringValue is a special case that implicitly finalizes. 
+ ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue("World!")); + JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", writeStringValueOutput); + } + private delegate void WriteValueSpanAction( Utf8JsonWriter writer, ReadOnlySpan value); From e044b1329f3de8937f2e9475ad17d2914de1e6e3 Mon Sep 17 00:00:00 2001 From: ificator Date: Sun, 26 May 2024 14:02:54 -0700 Subject: [PATCH 02/21] Fix some review comments --- .../System.Text.Json/ref/System.Text.Json.cs | 3 - .../src/Resources/Strings.resx | 39 +++++---- .../src/System/Text/Json/JsonTokenType.cs | 9 -- .../src/System/Text/Json/ThrowHelper.cs | 4 + .../Utf8JsonWriter.WriteProperties.Helpers.cs | 10 ++- .../Utf8JsonWriter.WriteValues.Comment.cs | 2 + .../Utf8JsonWriter.WriteValues.Helpers.cs | 19 +++++ .../Utf8JsonWriter.WriteValues.String.cs | 82 +++---------------- .../System/Text/Json/Writer/Utf8JsonWriter.cs | 11 ++- .../Utf8JsonWriterTests.cs | 41 ++++++---- 10 files changed, 103 insertions(+), 117 deletions(-) diff --git a/src/libraries/System.Text.Json/ref/System.Text.Json.cs b/src/libraries/System.Text.Json/ref/System.Text.Json.cs index 5e6978b4fe1ce8..5f187c2b40e687 100644 --- a/src/libraries/System.Text.Json/ref/System.Text.Json.cs +++ b/src/libraries/System.Text.Json/ref/System.Text.Json.cs @@ -426,7 +426,6 @@ public enum JsonTokenType : byte True = (byte)9, False = (byte)10, Null = (byte)11, - StringSegment = (byte)12, } public enum JsonValueKind : byte { @@ -655,8 +654,6 @@ public void WriteStringValue(string? value) { } public void WriteStringValue(System.Text.Json.JsonEncodedText value) { } public void WriteStringValueSegment(System.ReadOnlySpan utf8Value, bool isFinalSegment) { } public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) { } - public void WriteStringValueSegment(string? 
value, bool isFinalSegment) { } - public void WriteStringValueSegment(System.Text.Json.JsonEncodedText value, bool isFinalSegment) { } } } namespace System.Text.Json.Nodes diff --git a/src/libraries/System.Text.Json/src/Resources/Strings.resx b/src/libraries/System.Text.Json/src/Resources/Strings.resx index 3654ae7dbd3807..172986bb5b88bd 100644 --- a/src/libraries/System.Text.Json/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Json/src/Resources/Strings.resx @@ -1,11 +1,11 @@ - @@ -720,4 +720,7 @@ New line can be only "\n" or "\r\n". + + The current JSON string must be finalized before a token of type '{0}' can be added. + diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs b/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs index e86ebdef0c31f1..049da2220a22f8 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/JsonTokenType.cs @@ -77,14 +77,5 @@ public enum JsonTokenType : byte /// Indicates that the token type is the JSON literal null. /// Null, - - /// - /// Indicates that the token type is a segment of a JSON string. - /// - /// - /// This does not represent a token defined in the JSON specification, but rather provides a means to track - /// that a string is being written in segments. This value will never be seen during deserialization. 
- /// - StringSegment, } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs index 6976d42b967bc9..a85a2ddcba3a78 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs @@ -598,6 +598,9 @@ private static string GetResourceString(ExceptionResource resource, int currentD case ExceptionResource.CannotWriteValueAfterPrimitiveOrClose: message = SR.Format(SR.CannotWriteValueAfterPrimitiveOrClose, tokenType); break; + case ExceptionResource.CannotWriteWithinString: + message = SR.Format(SR.CannotWriteWithinString, tokenType); + break; default: Debug.Fail($"The ExceptionResource enum value: {resource} is not part of the switch. Add the appropriate case and exception message."); break; @@ -758,6 +761,7 @@ internal enum ExceptionResource ExpectedOneCompleteToken, NotEnoughData, InvalidLeadingZeroInNumber, + CannotWriteWithinString, } internal enum NumericType diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs index 11b0b3885ac34f..b3b85281b82cc9 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs @@ -36,7 +36,10 @@ private void ValidateWritingProperty() { if (!_options.SkipValidation) { - if (!_inObject || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) + // Make sure a new property is not attempted within an unfinalized string. 
+ ValidateNotWithinUnfinalizedString(); + + if (!_inObject || _tokenType == JsonTokenType.PropertyName) { Debug.Assert(_tokenType != JsonTokenType.StartObject); ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWritePropertyWithinArray, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); @@ -49,7 +52,10 @@ private void ValidateWritingProperty(byte token) { if (!_options.SkipValidation) { - if (!_inObject || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) + // Make sure a new property is not attempted within an unfinalized string. + ValidateNotWithinUnfinalizedString(); + + if (!_inObject || _tokenType == JsonTokenType.PropertyName) { Debug.Assert(_tokenType != JsonTokenType.StartObject); ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWritePropertyWithinArray, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs index aa62396df6c876..a23c406ff02c63 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs @@ -61,6 +61,8 @@ public void WriteCommentValue(ReadOnlySpan value) private void WriteCommentByOptions(ReadOnlySpan value) { + ValidateWritingComment(); + if (_options.Indented) { WriteCommentIndented(value); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index eeee39e0447d67..fdb859b3862de1 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ 
b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -10,10 +10,29 @@ namespace System.Text.Json { public sealed partial class Utf8JsonWriter { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ValidateNotWithinUnfinalizedString() + { + if (_tokenType == StringSegmentSentinel) + { + ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWriteWithinString, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ValidateWritingComment() + { + // Make sure a new comment is not attempted within an unfinalized string. + ValidateNotWithinUnfinalizedString(); + } + private void ValidateWritingValue() { Debug.Assert(!_options.SkipValidation); + // Make sure a new value is not attempted within an unfinalized string. + ValidateNotWithinUnfinalizedString(); + if (_inObject) { if (_tokenType != JsonTokenType.PropertyName) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs index 3c6040e51d329b..f2d558d0eea694 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs @@ -26,30 +26,6 @@ public void WriteStringValue(JsonEncodedText value) _tokenType = JsonTokenType.String; } - /// - /// Writes the pre-encoded text value segment as a partial JSON string. - /// - /// The JSON-encoded value to write. - /// Indicates that this is the final segment of the string. - /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). 
- /// - public void WriteStringValueSegment(JsonEncodedText value, bool isFinalSegment) - { - ReadOnlySpan utf8Value = value.EncodedUtf8Bytes; - Debug.Assert(utf8Value.Length <= JsonConstants.MaxUnescapedTokenSize); - - JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : JsonTokenType.StringSegment; - WriteStringByOptions(utf8Value, nextTokenType); - - if (isFinalSegment) - { - SetFlagToAddListSeparatorBeforeNextItem(); - } - - _tokenType = nextTokenType; - } - /// /// Writes the string text value (as a JSON string) as an element of a JSON array. /// @@ -80,38 +56,6 @@ public void WriteStringValue(string? value) } } - - /// - /// Writes the string text value segment as a partial JSON string. - /// - /// The value to write. - /// Indicates that this is the final segment of the string. - /// - /// Thrown when the specified value is too large. - /// - /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). - /// - /// - /// - /// The value is escaped before writing. - /// - /// If is the JSON null value is written, - /// as if was called. - /// - /// - public void WriteStringValueSegment(string? value, bool isFinalSegment) - { - if (value == null) - { - WriteNullValue(); - } - else - { - WriteStringValueSegment(value.AsSpan(), isFinalSegment); - } - } - /// /// Writes the text value (as a JSON string) as an element of a JSON array. /// @@ -153,7 +97,7 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen { JsonWriterHelper.ValidateValue(value); - JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : JsonTokenType.StringSegment; + JsonTokenType nextTokenType = isFinalSegment ? 
JsonTokenType.String : StringSegmentSentinel; WriteStringEscape(value, nextTokenType); @@ -183,7 +127,7 @@ private void WriteStringEscape(ReadOnlySpan value, JsonTokenType stringTok private void WriteStringByOptions(ReadOnlySpan value, JsonTokenType stringTokenType) { - if (!_options.SkipValidation && _tokenType != JsonTokenType.StringSegment) + if (!_options.SkipValidation && _tokenType != StringSegmentSentinel) { ValidateWritingValue(); } @@ -214,7 +158,7 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != JsonTokenType.StringSegment) + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { if (_currentDepth < 0) { @@ -226,7 +170,7 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType TranscodeAndWrite(escapedValue, output); - if (stringTokenType != JsonTokenType.StringSegment) + if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) { output[BytesPending++] = JsonConstants.Quote; } @@ -251,14 +195,14 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != JsonTokenType.StringSegment) + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { if (_currentDepth < 0) { output[BytesPending++] = JsonConstants.ListSeparator; } - if (_tokenType != JsonTokenType.PropertyName && _tokenType != JsonTokenType.StringSegment) + if (_tokenType != JsonTokenType.PropertyName && _tokenType != Utf8JsonWriter.StringSegmentSentinel) { if (_tokenType != JsonTokenType.None) { @@ -273,7 +217,7 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType TranscodeAndWrite(escapedValue, output); - if (stringTokenType != JsonTokenType.StringSegment) + if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) { output[BytesPending++] = JsonConstants.Quote; } @@ -346,7 +290,7 @@ public void WriteStringValueSegment(ReadOnlySpan utf8Value, bool isFinalSe { JsonWriterHelper.ValidateValue(utf8Value); - 
JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : JsonTokenType.StringSegment; + JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : Utf8JsonWriter.StringSegmentSentinel; WriteStringEscape(utf8Value, nextTokenType); if (isFinalSegment) @@ -375,7 +319,7 @@ private void WriteStringEscape(ReadOnlySpan utf8Value, JsonTokenType strin private void WriteStringByOptions(ReadOnlySpan utf8Value, JsonTokenType stringTokenType) { - if (!_options.SkipValidation && _tokenType != JsonTokenType.StringSegment) + if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) { ValidateWritingValue(); } @@ -405,7 +349,7 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != JsonTokenType.StringSegment) + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { if (_currentDepth < 0) { @@ -417,7 +361,7 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - if (stringTokenType != JsonTokenType.StringSegment) + if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) { output[BytesPending++] = JsonConstants.Quote; } @@ -441,7 +385,7 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != JsonTokenType.StringSegment) + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { if (_currentDepth < 0) { @@ -464,7 +408,7 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - if (stringTokenType != JsonTokenType.StringSegment) + if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) { output[BytesPending++] = JsonConstants.Quote; } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs 
b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index 483d0c6cd263b0..9313a233f28110 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -37,6 +37,9 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable private const int DefaultGrowthSize = 4096; private const int InitialGrowthSize = 256; + // A special value for JsonTokenType that lets the writer keep track of string segments. + private const JsonTokenType StringSegmentSentinel = (JsonTokenType)255; + private IBufferWriter? _output; private Stream? _stream; private ArrayBufferWriter? _arrayBufferWriter; @@ -534,6 +537,9 @@ private void WriteStartSlow(byte token) private void ValidateStart() { + // Make sure a new object or array is not attempted within an unfinalized string. + ValidateNotWithinUnfinalizedString(); + if (_inObject) { if (_tokenType != JsonTokenType.PropertyName) @@ -959,7 +965,10 @@ private void WriteEndSlow(byte token) private void ValidateEnd(byte token) { - if (_bitStack.CurrentDepth <= 0 || _tokenType == JsonTokenType.PropertyName || _tokenType == JsonTokenType.StringSegment) + // Make sure an object is not ended within an unfinalized string. 
+ ValidateNotWithinUnfinalizedString(); + + if (_bitStack.CurrentDepth <= 0 || _tokenType == JsonTokenType.PropertyName) ThrowHelper.ThrowInvalidOperationException(ExceptionResource.MismatchedObjectArray, currentDepth: default, maxDepth: _options.MaxDepth, token, _tokenType); if (token == JsonConstants.CloseBracket) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index 5fe4742a28ce1b..2aa1b79347fc44 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -6818,32 +6818,43 @@ public static void WriteStringValueSegment() [Fact] public static void WriteStringValueSegment_NotFinalized() { - static ArrayBufferWriter executeScenario(Action implementation) + static ArrayBufferWriter executeScenario(Action implementation, bool expectFailure) { var output = new ArrayBufferWriter(); using var jsonUtf8 = new Utf8JsonWriter(output); jsonUtf8.WriteStartObject(); jsonUtf8.WritePropertyName("test"); jsonUtf8.WriteStringValueSegment("Hello ".AsSpan(), isFinalSegment: false); - implementation(jsonUtf8); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - return output; + + if (expectFailure) + { + InvalidOperationException invalidOperationexception = Assert.Throws( + () => implementation(jsonUtf8)); + Assert.Contains("The current JSON string must be finalized before a token of type", invalidOperationexception.Message); + return null; + } + else + { + implementation(jsonUtf8); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + return output; + } } // The following are expected to fail. 
- Assert.Throws(static () => executeScenario(w => w.WriteEndArray())); - Assert.Throws(static () => executeScenario(w => w.WriteCommentValue("comment"))); - Assert.Throws(static () => executeScenario(w => w.WriteEndArray())); - Assert.Throws(static () => executeScenario(w => w.WriteEndObject())); - Assert.Throws(static () => executeScenario(w => w.WriteNullValue())); - Assert.Throws(static () => executeScenario(w => w.WriteNumberValue(123))); - Assert.Throws(static () => executeScenario(w => w.WritePropertyName("test"))); - Assert.Throws(static () => executeScenario(w => w.WriteStartArray())); - Assert.Throws(static () => executeScenario(w => w.WriteStartObject())); + executeScenario(w => w.WriteEndArray(), expectFailure: true); + executeScenario(w => w.WriteCommentValue("comment"), expectFailure: true); + executeScenario(w => w.WriteEndArray(), expectFailure: true); + executeScenario(w => w.WriteEndObject(), expectFailure: true); + executeScenario(w => w.WriteNullValue(), expectFailure: true); + executeScenario(w => w.WriteNumberValue(123), expectFailure: true); + executeScenario(w => w.WritePropertyName("test"), expectFailure: true); + executeScenario(w => w.WriteStartArray(), expectFailure: true); + executeScenario(w => w.WriteStartObject(), expectFailure: true); // WriteStringValue is a special case that implicitly finalizes. 
- ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue("World!")); + ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue("World!"), expectFailure: false); JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", writeStringValueOutput); } From b8d578c9fd06f8d5da79fb88553a1dcaa7cc25f6 Mon Sep 17 00:00:00 2001 From: ificator Date: Sun, 26 May 2024 15:38:56 -0700 Subject: [PATCH 03/21] Handle split surrogate pair --- .../Utf8JsonWriter.WriteValues.String.cs | 34 +++++++++++ .../Utf8JsonWriterTests.cs | 56 +++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs index f2d558d0eea694..13ea6d17c1772e 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs @@ -8,6 +8,8 @@ namespace System.Text.Json { public sealed partial class Utf8JsonWriter { + private char _cachedHighSurrogate; + /// /// Writes the pre-encoded text value (as a JSON string) as an element of a JSON array. /// @@ -99,6 +101,38 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : StringSegmentSentinel; + // If we have a high surrogate left over from the last segment we need to make sure it's written out. When + // the first character of the current segment is a low surrogate we'll write as a complete pair, otherwise + // we'll write it on its own. 
+ if (_cachedHighSurrogate != '\0') + { + if (value.Length > 0 && char.IsLowSurrogate(value[0])) + { + ReadOnlySpan surrogatePair = stackalloc char[] { _cachedHighSurrogate, value[0] }; + WriteStringEscape(surrogatePair, StringSegmentSentinel); + value = value.Slice(1); + } + else + { + ReadOnlySpan surrogate = stackalloc char[] { _cachedHighSurrogate }; + WriteStringEscape(surrogate, StringSegmentSentinel); + } + + _cachedHighSurrogate = '\0'; + } + + // If the last character of the segment is a high surrogate we need to cache it and write the rest of the + // string. The cached value will be written when the next segment is written. + if (value.Length > 0) + { + char finalChar = value[value.Length - 1]; + if (char.IsHighSurrogate(finalChar)) + { + _cachedHighSurrogate = finalChar; + value = value.Slice(0, value.Length - 1); + } + } + WriteStringEscape(value, nextTokenType); if (isFinalSegment) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index c975b43f8953e7..031b3390df1f10 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -6814,6 +6814,62 @@ public static void WriteStringValueSegment() JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", output); } + [Fact] + public static void WriteStringValueSegment_BadSurrogatePairs() + { + const string result = "\\uFFFD\\uD83D\\uDE00\\uFFFD"; + + Span surrogates = stackalloc char[] { '\uD83D', '\uD83D', '\uDE00', '\uDE00' }; + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xFFFD 0xD83D 0xDE00 0xFFFD + jsonUtf8.WriteStringValue(surrogates); + jsonUtf8.WritePropertyName("segmented"); + // only high 
surrogate -> expect cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // only high surrogate -> expect 0xFFFD + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // only low surrogate -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: false); + // only low surrogate -> expect 0xFFFD + jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_SplitInSurrogatePair() + { + const string result = "\\uD83D\\uDE00\\uD83D\\uDE00\\uD83D\\uDE00"; + + Span surrogates = stackalloc char[] { '\uD83D', '\uDE00', '\uD83D', '\uDE00', '\uD83D', '\uDE00' }; + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xD83D 0xDE00 0xD83D 0xDE00 0xD83D 0xDE00 + jsonUtf8.WriteStringValue(surrogates); + jsonUtf8.WritePropertyName("segmented"); + // only high surrogate -> expect cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 2), isFinalSegment: false); + // only low surrogate -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // low surrogate followed by another high surrogate -> expect 0xD83D 0xDE00 + cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 2), isFinalSegment: false); + // only low surrogate -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 1), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + [Fact] public static void WriteStringValueSegment_NotFinalized() { 
From 65006ce624d65d8cb901d779b62eae1d8f9447dc Mon Sep 17 00:00:00 2001 From: ificator Date: Fri, 6 Dec 2024 11:13:37 -0800 Subject: [PATCH 04/21] Commit old changes responding to comments --- .../Text/Json/Writer/JsonWriterHelper.cs | 32 ++++ .../Utf8JsonWriter.WriteValues.String.cs | 158 ++++++++++++++++-- .../Utf8JsonWriterTests.cs | 92 +++++++++- 3 files changed, 264 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs index b2e05f589fc10c..fca6792eb69f45 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs @@ -284,6 +284,38 @@ public static unsafe bool IsValidUtf8String(ReadOnlySpan bytes) #endif } + internal static int GetUtf8CharByteCount(byte firstUtf8Byte) + { + byte upperUtf8Bits = (byte)(firstUtf8Byte & 0xC0); + byte lowerUtf8Bits = (byte)(firstUtf8Byte & 0x30); + + switch (upperUtf8Bits) + { + case 0b00_00_0000: + case 0b01_00_0000: + return 1; + + case 0b11_00_0000: + switch (lowerUtf8Bits) + { + case 0b00_00_0000: + case 0b00_01_0000: + return 2; + + case 0b00_10_0000: + return 3; + + case 0b00_11_0000: + return 4; + } + break; + } + + // This should really only be an extension byte (10xxxxxx), but we'll return here instead of having a case for it to + // keep the compiler happy. 
+ return 0; + } + internal static unsafe OperationStatus ToUtf8(ReadOnlySpan source, Span destination, out int written) { #if NET diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs index 13ea6d17c1772e..6ae59a524fbbb3 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs @@ -2,13 +2,19 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; +using System.Buffers.Text; using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text.Json { public sealed partial class Utf8JsonWriter { - private char _cachedHighSurrogate; + private const byte HighSurrogateByteSentinel = 0xFF; + private const char HighSurrogateCharSentinel = (char)(HighSurrogateByteSentinel<<8 | HighSurrogateByteSentinel); + + private int _partialStringSegmentChar; /// /// Writes the pre-encoded text value (as a JSON string) as an element of a JSON array. @@ -104,31 +110,60 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen // If we have a high surrogate left over from the last segment we need to make sure it's written out. When // the first character of the current segment is a low surrogate we'll write as a complete pair, otherwise // we'll write it on its own. 
- if (_cachedHighSurrogate != '\0') + if (_partialStringSegmentChar != 0) { - if (value.Length > 0 && char.IsLowSurrogate(value[0])) - { - ReadOnlySpan surrogatePair = stackalloc char[] { _cachedHighSurrogate, value[0] }; - WriteStringEscape(surrogatePair, StringSegmentSentinel); - value = value.Slice(1); - } - else + // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. + unsafe { - ReadOnlySpan surrogate = stackalloc char[] { _cachedHighSurrogate }; - WriteStringEscape(surrogate, StringSegmentSentinel); + fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) + { + Span partialStringSegmentChar = new Span(partialStringSegmentCharPtr, 2); + if (partialStringSegmentChar[1] == HighSurrogateCharSentinel) + { + if (value.Length > 0 && char.IsLowSurrogate(value[0])) + { + partialStringSegmentChar[1] = value[0]; + WriteStringEscape(partialStringSegmentChar, StringSegmentSentinel); + value = value.Slice(1); + } + else + { + // The caller sent a high surrogate on the previous call to this method, but did not provide a + // low surrogate on the this call. We should handle it gracefully. + WriteStringEscape(partialStringSegmentChar.Slice(0, 1), StringSegmentSentinel); + } + } + else + { + // The caller sent a partial UTF-8 sequence on a previous call to WriteStringValueSegment(byte) but + // switched to calling WriteStringValueSegment(char) on this call. We should handle this gracefully. + Span partialStringSegmentUtf8Bytes = MemoryMarshal.Cast(partialStringSegmentChar); + WriteStringEscape(partialStringSegmentUtf8Bytes.Slice(0, partialStringSegmentUtf8Bytes[3]), StringSegmentSentinel); + } + } } - _cachedHighSurrogate = '\0'; + _partialStringSegmentChar = 0; } // If the last character of the segment is a high surrogate we need to cache it and write the rest of the // string. The cached value will be written when the next segment is written. 
- if (value.Length > 0) + if (!isFinalSegment && value.Length > 0) { char finalChar = value[value.Length - 1]; if (char.IsHighSurrogate(finalChar)) { - _cachedHighSurrogate = finalChar; + // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. + unsafe + { + fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) + { + Span partialStringSegmentChar = new Span(partialStringSegmentCharPtr, 2); + partialStringSegmentChar[0] = finalChar; + partialStringSegmentChar[1] = HighSurrogateCharSentinel; + } + } + value = value.Slice(0, value.Length - 1); } } @@ -325,6 +360,101 @@ public void WriteStringValueSegment(ReadOnlySpan utf8Value, bool isFinalSe JsonWriterHelper.ValidateValue(utf8Value); JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : Utf8JsonWriter.StringSegmentSentinel; + + if (_partialStringSegmentChar != 0) + { + // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. + unsafe + { + fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) + { + Span partialStringSegmentUtf8Bytes = new Span(partialStringSegmentCharPtr, 4); + if (partialStringSegmentUtf8Bytes[3] == HighSurrogateByteSentinel) + { + // The caller sent a high surrogate on a previous call to WriteStringValueSegment(char) but switched + // to calling WriteStringValueSegment(byte) on this call. We'll handle this gracefully by writing the + // high surrogate on its own. + Span surrogatePair = MemoryMarshal.Cast(partialStringSegmentUtf8Bytes); + WriteStringEscape(surrogatePair.Slice(0, 1), StringSegmentSentinel); + } + else + { + // Attempt to complete the UTF-8 sequence from the previous segment. 
+ int requiredByteCount = JsonWriterHelper.GetUtf8CharByteCount(partialStringSegmentUtf8Bytes[0]); + int remainingByteCount = requiredByteCount - partialStringSegmentUtf8Bytes[3]; + int availableByteCount = Math.Min(remainingByteCount, utf8Value.Length); + + for (int i = 0; i < availableByteCount; i++) + { + int nextByteIndex = partialStringSegmentUtf8Bytes[3] + i; + + byte remainingByte = utf8Value[0]; + if (JsonWriterHelper.GetUtf8CharByteCount(remainingByte) != 0) + { + // Invalid UTF-8 sequence! Write what we cached without trying to complete the sequence. + requiredByteCount = nextByteIndex; + remainingByteCount = 0; + break; + } + + partialStringSegmentUtf8Bytes[nextByteIndex] = remainingByte; + remainingByteCount--; + utf8Value = utf8Value.Slice(1); + } + + if (isFinalSegment || remainingByteCount == 0) + { + WriteStringEscape(partialStringSegmentUtf8Bytes.Slice(0, requiredByteCount), StringSegmentSentinel); + } + else + { + // We didn't have enough to complete the sequence, so update the count of bytes we do have so that + // the next iteration will pick up where we left off. + partialStringSegmentUtf8Bytes[3] = (byte)(requiredByteCount - remainingByteCount); + } + } + } + } + } + + if (!isFinalSegment && utf8Value.Length > 0) + { + int expectedUtf8ByteCount = 0; + int startOfPartialUtf8Sequence = -1; + for (int i = utf8Value.Length - 1; i >= utf8Value.Length - 3; i--) + { + expectedUtf8ByteCount = JsonWriterHelper.GetUtf8CharByteCount(utf8Value[i]); + if (expectedUtf8ByteCount == 0) + { + continue; + } + + if (expectedUtf8ByteCount > 1) + { + startOfPartialUtf8Sequence = i; + } + + break; + } + + if (startOfPartialUtf8Sequence >= 0) + { + // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. 
+ unsafe + { + fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) + { + Span partialStringSegmentUtf8Bytes = new Span(partialStringSegmentCharPtr, 4); + ReadOnlySpan bytesToWrite = utf8Value.Slice(startOfPartialUtf8Sequence); + bytesToWrite.CopyTo(partialStringSegmentUtf8Bytes); + partialStringSegmentUtf8Bytes[3] = (byte)bytesToWrite.Length; + } + } + + utf8Value = utf8Value.Slice(0, startOfPartialUtf8Sequence); + } + } + WriteStringEscape(utf8Value, nextTokenType); if (isFinalSegment) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index 031b3390df1f10..48b2e656c5b5b2 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -6800,7 +6800,91 @@ public static void WriteStringValue_IndentationOptions() } [Fact] - public static void WriteStringValueSegment() + public static void WriteStringValueSegment_Byte() + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("test"); + jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("Hello "), isFinalSegment: false); + jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("World!"), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_Byte_SplitInUtf8Sequence() + { + const string result = "\\uD83D\\uDE00"; + + Span utf8Bytes = Encoding.UTF8.GetBytes("\uD83D\uDE00"); + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xD83D 0xDE00 + 
jsonUtf8.WriteStringValue(utf8Bytes); + jsonUtf8.WritePropertyName("segmented"); + // incomplete UTf-8 sequence -> expect cached + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(0, 1), isFinalSegment: false); + // incomplete UTf-8 sequence -> expect cached + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(1, 1), isFinalSegment: false); + // remainder of UTF-8 sequence -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(2, 2), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_Byte_NotFinalized() + { + static ArrayBufferWriter executeScenario(Action implementation, bool expectFailure) + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("test"); + jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("Hello "), isFinalSegment: false); + + if (expectFailure) + { + InvalidOperationException invalidOperationexception = Assert.Throws( + () => implementation(jsonUtf8)); + Assert.Contains("The current JSON string must be finalized before a token of type", invalidOperationexception.Message); + return null; + } + else + { + implementation(jsonUtf8); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + return output; + } + } + + // The following are expected to fail. 
+ executeScenario(w => w.WriteEndArray(), expectFailure: true); + executeScenario(w => w.WriteCommentValue("comment"), expectFailure: true); + executeScenario(w => w.WriteEndArray(), expectFailure: true); + executeScenario(w => w.WriteEndObject(), expectFailure: true); + executeScenario(w => w.WriteNullValue(), expectFailure: true); + executeScenario(w => w.WriteNumberValue(123), expectFailure: true); + executeScenario(w => w.WritePropertyName("test"), expectFailure: true); + executeScenario(w => w.WriteStartArray(), expectFailure: true); + executeScenario(w => w.WriteStartObject(), expectFailure: true); + + // WriteStringValue is a special case that implicitly finalizes. + ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue(Encoding.UTF8.GetBytes("World!")), expectFailure: false); + JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", writeStringValueOutput); + } + + [Fact] + public static void WriteStringValueSegment_Char() { var output = new ArrayBufferWriter(); using var jsonUtf8 = new Utf8JsonWriter(output); @@ -6815,7 +6899,7 @@ public static void WriteStringValueSegment() } [Fact] - public static void WriteStringValueSegment_BadSurrogatePairs() + public static void WriteStringValueSegment_Char_BadSurrogatePairs() { const string result = "\\uFFFD\\uD83D\\uDE00\\uFFFD"; @@ -6843,7 +6927,7 @@ public static void WriteStringValueSegment_BadSurrogatePairs() } [Fact] - public static void WriteStringValueSegment_SplitInSurrogatePair() + public static void WriteStringValueSegment_Char_SplitInSurrogatePair() { const string result = "\\uD83D\\uDE00\\uD83D\\uDE00\\uD83D\\uDE00"; @@ -6871,7 +6955,7 @@ public static void WriteStringValueSegment_SplitInSurrogatePair() } [Fact] - public static void WriteStringValueSegment_NotFinalized() + public static void WriteStringValueSegment_Char_NotFinalized() { static ArrayBufferWriter executeScenario(Action implementation, bool expectFailure) { From 1601af8e70efcfd295043b58311d4ce27ed3821c 
Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Wed, 11 Dec 2024 12:31:42 -0800 Subject: [PATCH 05/21] utf8 and utf16 --- .../System.Text.Json/ref/System.Text.Json.cs | 4 +- ...em.Numerics.BitOperations.netstandard20.cs | 64 ++ .../System.Text.Rune.netstandard20.cs | 547 ++++++++++ .../src/Resources/Strings.resx | 7 +- .../src/System.Text.Json.csproj | 11 +- .../Reader/JsonReaderHelper.Unescaping.cs | 72 -- .../src/System/Text/Json/ThrowHelper.cs | 2 +- .../Json/Writer/JsonWriterHelper.Escaping.cs | 42 +- .../Text/Json/Writer/JsonWriterHelper.cs | 32 - .../Utf8JsonWriter.WriteValues.Comment.cs | 5 +- .../Utf8JsonWriter.WriteValues.Helpers.cs | 4 +- .../Utf8JsonWriter.WriteValues.String.cs | 367 ++----- ...tf8JsonWriter.WriteValues.StringSegment.cs | 496 +++++++++ .../System/Text/Json/Writer/Utf8JsonWriter.cs | 11 + .../System.Text.Json.Tests/JsonTestHelper.cs | 41 +- .../System.Text.Json.Tests.csproj | 1 + ...tf8JsonWriterTests.Values.StringSegment.cs | 978 ++++++++++++++++++ .../Utf8JsonWriterTests.cs | 753 ++++++++------ 18 files changed, 2665 insertions(+), 772 deletions(-) create mode 100644 src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs create mode 100644 src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs create mode 100644 src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs create mode 100644 src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs diff --git a/src/libraries/System.Text.Json/ref/System.Text.Json.cs b/src/libraries/System.Text.Json/ref/System.Text.Json.cs index c1b7189c8c11fe..469ea5e4f07c89 100644 --- a/src/libraries/System.Text.Json/ref/System.Text.Json.cs +++ b/src/libraries/System.Text.Json/ref/System.Text.Json.cs @@ -679,8 +679,8 @@ public void WriteStringValue(System.ReadOnlySpan utf8Value) { } public void WriteStringValue(System.ReadOnlySpan value) { } public 
void WriteStringValue(string? value) { } public void WriteStringValue(System.Text.Json.JsonEncodedText value) { } - public void WriteStringValueSegment(System.ReadOnlySpan utf8Value, bool isFinalSegment) { } - public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) { } + public void WriteStringValueSegment(System.ReadOnlySpan value, bool isFinalSegment) { } + public void WriteStringValueSegment(System.ReadOnlySpan value, bool isFinalSegment) { } } } namespace System.Text.Json.Nodes diff --git a/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs b/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs new file mode 100644 index 00000000000000..5453bd9e6b57d1 --- /dev/null +++ b/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs @@ -0,0 +1,64 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// Contains a polyfill implementation of System.Numerics.BitOperations that works on netstandard2.0. +// Implementation copied from: +// https://github.com/dotnet/runtime/blob/6072e4d3a7a2a1493f514cdf4be75a3d56580e84/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +// +// Some routines inspired by the Stanford Bit Twiddling Hacks by Sean Eron Anderson: +// http://graphics.stanford.edu/~seander/bithacks.html + +namespace System.Numerics +{ + internal static class BitOperations + { + private static ReadOnlySpan Log2DeBruijn => // 32 + [ + 00, 09, 01, 10, 13, 21, 02, 29, + 11, 14, 16, 18, 22, 25, 03, 30, + 08, 12, 20, 28, 15, 17, 24, 07, + 19, 27, 23, 06, 26, 05, 04, 31 + ]; + + /// + /// Returns the integer (floor) log of the specified value, base 2. + /// Note that by convention, input value 0 returns 0 since log(0) is undefined. 
+ /// + /// The value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Log2(uint value) + { + // Fallback contract is 0->0 + return Log2SoftwareFallback(value | 1); + } + + /// + /// Returns the integer (floor) log of the specified value, base 2. + /// Note that by convention, input value 0 returns 0 since Log(0) is undefined. + /// Does not directly use any hardware intrinsics, nor does it incur branching. + /// + /// The value. + private static int Log2SoftwareFallback(uint value) + { + // No AggressiveInlining due to large method size + // Has conventional contract 0->0 (Log(0) is undefined) + + // Fill trailing zeros with ones, eg 00010010 becomes 00011111 + value |= value >> 01; + value |= value >> 02; + value |= value >> 04; + value |= value >> 08; + value |= value >> 16; + + // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check + return Unsafe.AddByteOffset( + // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_1100_0100_1010_1100_1101_1101u + ref MemoryMarshal.GetReference(Log2DeBruijn), + // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here + (nint)((value * 0x07C4ACDDu) >> 27)); + } + } +} diff --git a/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs b/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs new file mode 100644 index 00000000000000..8a490a85465617 --- /dev/null +++ b/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs @@ -0,0 +1,547 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text.Json; + +// Contains a polyfill implementation of System.Text.Rune that works on netstandard2.0. +// Implementation copied from: +// https://github.com/dotnet/runtime/blob/177d6f1a0bfdc853ae9ffeef4be99ff984c4f5dd/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs + +namespace System.Text +{ + internal readonly struct Rune : IEquatable + { + private const int MaxUtf16CharsPerRune = 2; // supplementary plane code points are encoded as 2 UTF-16 code units + + private const char HighSurrogateStart = '\ud800'; + private const char LowSurrogateStart = '\udc00'; + private const int HighSurrogateRange = 0x3FF; + + private readonly uint _value; + + /// + /// Creates a from the provided Unicode scalar value. + /// + /// + /// If does not represent a value Unicode scalar value. + /// + public Rune(uint value) + { + if (!UnicodeUtility.IsValidUnicodeScalar(value)) + { + throw new ArgumentOutOfRangeException(nameof(value)); + } + _value = value; + } + + /// + /// Creates a from the provided Unicode scalar value. + /// + /// + /// If does not represent a value Unicode scalar value. + /// + public Rune(int value) + : this((uint)value) + { + } + + // non-validating ctor + private Rune(uint scalarValue, bool _) + { + UnicodeDebug.AssertIsValidScalar(scalarValue); + _value = scalarValue; + } + + /// + /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) + /// and therefore representable by a single UTF-8 code unit. + /// + public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value); + + /// + /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) + /// and therefore representable by a single UTF-16 code unit. 
+ /// + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); + + public static bool operator ==(Rune left, Rune right) => left._value == right._value; + + public static bool operator !=(Rune left, Rune right) => left._value != right._value; + + public static bool IsControl(Rune value) + { + // Per the Unicode stability policy, the set of control characters + // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No + // characters will ever be added to or removed from the "control characters" + // group. See https://www.unicode.org/policies/stability_policy.html. + + // Logic below depends on Rune.Value never being -1 (since Rune is a validating type) + // 00..1F (+1) => 01..20 (&~80) => 01..20 + // 7F..9F (+1) => 80..A0 (&~80) => 00..20 + + return ((value._value + 1) & ~0x80u) <= 0x20u; + } + + /// + /// A instance that represents the Unicode replacement character U+FFFD. + /// + public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); + + /// + /// Returns the length in code units () of the + /// UTF-16 sequence required to represent this scalar value. + /// + /// + /// The return value will be 1 or 2. + /// + public int Utf16SequenceLength + { + get + { + int codeUnitCount = UnicodeUtility.GetUtf16SequenceLength(_value); + Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf16CharsPerRune); + return codeUnitCount; + } + } + + /// + /// Returns the Unicode scalar value as an integer. + /// + public int Value => (int)_value; + + /// + /// Decodes the at the beginning of the provided UTF-16 source buffer. + /// + /// + /// + /// If the source buffer begins with a valid UTF-16 encoded scalar value, returns , + /// and outs via the decoded and via the + /// number of s used in the input buffer to encode the . + /// + /// + /// If the source buffer is empty or contains only a standalone UTF-16 high surrogate character, returns , + /// and outs via and via the length of the input buffer. 
+ /// + /// + /// If the source buffer begins with an ill-formed UTF-16 encoded scalar value, returns , + /// and outs via and via the number of + /// s used in the input buffer to encode the ill-formed sequence. + /// + /// + /// + /// The general calling convention is to call this method in a loop, slicing the buffer by + /// elements on each iteration of the loop. On each iteration of the loop + /// will contain the real scalar value if successfully decoded, or it will contain if + /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of + /// invalid sequences while iterating through the loop. + /// + public static OperationStatus DecodeFromUtf16(ReadOnlySpan source, out Rune result, out int charsConsumed) + { + if (!source.IsEmpty) + { + // First, check for the common case of a BMP scalar value. + // If this is correct, return immediately. + + char firstChar = source[0]; + if (TryCreate(firstChar, out result)) + { + charsConsumed = 1; + return OperationStatus.Done; + } + + // First thing we saw was a UTF-16 surrogate code point. + // Let's optimistically assume for now it's a high surrogate and hope + // that combining it with the next char yields useful results. + + if (1 < (uint)source.Length) + { + char secondChar = source[1]; + if (TryCreate(firstChar, secondChar, out result)) + { + // Success! Formed a supplementary scalar value. + charsConsumed = 2; + return OperationStatus.Done; + } + else + { + // Either the first character was a low surrogate, or the second + // character was not a low surrogate. This is an error. + goto InvalidData; + } + } + else if (!char.IsHighSurrogate(firstChar)) + { + // Quick check to make sure we're not going to report NeedMoreData for + // a single-element buffer where the data is a standalone low surrogate + // character. Since no additional data will ever make this valid, we'll + // report an error immediately. 
+ goto InvalidData; + } + } + + // If we got to this point, the input buffer was empty, or the buffer + // was a single element in length and that element was a high surrogate char. + + charsConsumed = source.Length; + result = ReplacementChar; + return OperationStatus.NeedMoreData; + + InvalidData: + + charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length + result = ReplacementChar; + return OperationStatus.InvalidData; + } + + /// + /// Decodes the at the beginning of the provided UTF-8 source buffer. + /// + /// + /// + /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns , + /// and outs via the decoded and via the + /// number of s used in the input buffer to encode the . + /// + /// + /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns , + /// and outs via and via the length of the input buffer. + /// + /// + /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns , + /// and outs via and via the number of + /// s used in the input buffer to encode the ill-formed sequence. + /// + /// + /// + /// The general calling convention is to call this method in a loop, slicing the buffer by + /// elements on each iteration of the loop. On each iteration of the loop + /// will contain the real scalar value if successfully decoded, or it will contain if + /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of + /// invalid sequences while iterating through the loop. + /// + public static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out Rune result, out int bytesConsumed) + { + // This method follows the Unicode Standard's recommendation for detecting + // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, + // Ch. 3.9 for more details. 
In summary, when reporting an invalid subsequence, + // it tries to consume as many code units as possible as long as those code + // units constitute the beginning of a longer well-formed subsequence per Table 3-7. + + int index = 0; + + // Try reading input[0]. + + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + uint tempValue = source[index]; + if (!UnicodeUtility.IsAsciiCodePoint(tempValue)) + { + goto NotAscii; + } + + Finish: + + bytesConsumed = index + 1; + Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] + result = UnsafeCreate(tempValue); + return OperationStatus.Done; + + NotAscii: + + // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in + // the range [C2..F4]. If it's outside of that range, it's either a standalone + // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range + // four-byte sequence. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) + { + goto FirstByteInvalid; + } + + tempValue = (tempValue - 0xC2) << 6; + + // Try reading input[1]. + + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + // Continuation bytes are of the form [10xxxxxx], which means that their two's + // complement representation is in the range [-65..-128]. This allows us to + // perform a single comparison to see if a byte is a continuation byte. + + int thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; + } + + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker + + if (tempValue < 0x0800) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); + goto Finish; // this is a valid 2-byte sequence + } + + // This appears to be a 3- or 4-byte sequence. 
Since per Table 3-7 we now have + // enough information (from just two code units) to detect overlong or surrogate + // sequences, we need to perform these checks now. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) + { + // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. + // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) + { + // This is a UTF-16 surrogate code point, which is invalid in UTF-8. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) + { + // This is an overlong 4-byte sequence. + goto Invalid; + } + + // The first two bytes were just fine. We don't need to perform any other checks + // on the remaining bytes other than to see that they're valid continuation bytes. + + // Try reading input[2]. + + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker + + if (tempValue <= 0xFFFF) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); + goto Finish; // this is a valid 3-byte sequence + } + + // Try reading input[3]. 
+ + index++; + if ((uint)index >= (uint)source.Length) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[index]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker + + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + goto Finish; // this is a valid 4-byte sequence + + FirstByteInvalid: + + index = 1; // Invalid subsequences are always at least length 1. + + Invalid: + + Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 + bytesConsumed = index; + result = ReplacementChar; + return OperationStatus.InvalidData; + + NeedsMoreData: + + Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 + bytesConsumed = index; + result = ReplacementChar; + return OperationStatus.NeedMoreData; + } + + public override bool Equals([NotNullWhen(true)] object? obj) => (obj is Rune other) && Equals(other); + + public bool Equals(Rune other) => this == other; + + public override int GetHashCode() => Value; + + /// + /// Attempts to create a from the provided input value. + /// + public static bool TryCreate(char ch, out Rune result) + { + uint extendedValue = ch; + if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue)) + { + result = UnsafeCreate(extendedValue); + return true; + } + else + { + result = default; + return false; + } + } + + /// + /// Attempts to create a from the provided UTF-16 surrogate pair. + /// Returns if the input values don't represent a well-formed UTF-16surrogate pair. + /// + public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result) + { + // First, extend both to 32 bits, then calculate the offset of + // each candidate surrogate char from the start of its range. 
+ + uint highSurrogateOffset = (uint)highSurrogate - HighSurrogateStart; + uint lowSurrogateOffset = (uint)lowSurrogate - LowSurrogateStart; + + // This is a single comparison which allows us to check both for validity at once since + // both the high surrogate range and the low surrogate range are the same length. + // If the comparison fails, we call to a helper method to throw the correct exception message. + + if ((highSurrogateOffset | lowSurrogateOffset) <= HighSurrogateRange) + { + // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding. + result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - LowSurrogateStart) + (0x40u << 10)); + return true; + } + else + { + // Didn't have a high surrogate followed by a low surrogate. + result = default; + return false; + } + } + + /// + /// Encodes this to a UTF-16 destination buffer. + /// + /// The buffer to which to write this value as UTF-16. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. + public bool TryEncodeToUtf16(Span destination, out int charsWritten) + { + if (destination.Length >= 1) + { + if (IsBmp) + { + destination[0] = (char)_value; + charsWritten = 1; + return true; + } + else if (destination.Length >= 2) + { + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]); + charsWritten = 2; + return true; + } + } + + // Destination buffer not large enough + + charsWritten = default; + return false; + } + + /// + /// Encodes this to a destination buffer as UTF-8 bytes. + /// + /// The buffer to which to write this value as UTF-8. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. 
+ public bool TryEncodeToUtf8(Span destination, out int bytesWritten) + { + // The bit patterns below come from the Unicode Standard, Table 3-6. + + if (destination.Length >= 1) + { + if (IsAscii) + { + destination[0] = (byte)_value; + bytesWritten = 1; + return true; + } + + if (destination.Length >= 2) + { + if (_value <= 0x7FFu) + { + // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b110u << 11)) >> 6); + destination[1] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 2; + return true; + } + + if (destination.Length >= 3) + { + if (_value <= 0xFFFFu) + { + // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b1110 << 16)) >> 12); + destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[2] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 3; + return true; + } + + if (destination.Length >= 4) + { + // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((_value + (0b11110 << 21)) >> 18); + destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u); + destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[3] = (byte)((_value & 0x3Fu) + 0x80u); + bytesWritten = 4; + return true; + } + } + } + } + + // Destination buffer not large enough + + bytesWritten = default; + return false; + } + + /// + /// Creates a without performing validation on the input. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); + } +} diff --git a/src/libraries/System.Text.Json/src/Resources/Strings.resx b/src/libraries/System.Text.Json/src/Resources/Strings.resx index 50ab0ca35fcdc8..a6989b11d1fae4 100644 --- a/src/libraries/System.Text.Json/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Json/src/Resources/Strings.resx @@ -767,7 +767,6 @@ The depth of the generated JSON schema exceeds the JsonSerializerOptions.MaxDepth setting. - The value '{0}' is not of type '{1}' and cannot be used in this generic collection. @@ -810,7 +809,7 @@ Offset and length were out of bounds for the array or count is greater than the number of elements from index to the end of the source collection. - - The current JSON string must be finalized before a token of type '{0}' can be added. + + Cannot write the requested JSON property or value until the final string value segment has been written. 
- + \ No newline at end of file diff --git a/src/libraries/System.Text.Json/src/System.Text.Json.csproj b/src/libraries/System.Text.Json/src/System.Text.Json.csproj index 5c429898a5fdfe..3c9d9ca1aebf25 100644 --- a/src/libraries/System.Text.Json/src/System.Text.Json.csproj +++ b/src/libraries/System.Text.Json/src/System.Text.Json.csproj @@ -1,4 +1,4 @@ - + $(NetCoreAppCurrent);$(NetCoreAppPrevious);$(NetCoreAppMinimum);netstandard2.0;$(NetFrameworkMinimum) @@ -160,6 +160,7 @@ The System.Text.Json library is built-in as part of the shared framework in .NET + @@ -389,6 +390,14 @@ The System.Text.Json library is built-in as part of the shared framework in .NET + + + + + + + + diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs index 8d9145febf4234..63f69942b3b7ad 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/JsonReaderHelper.Unescaping.cs @@ -572,12 +572,8 @@ private static bool TryUnescape(ReadOnlySpan source, Span destinatio + JsonConstants.UnicodePlane01StartValue; } -#if NET var rune = new Rune(scalar); bool success = rune.TryEncodeToUtf8(destination.Slice(written), out int bytesWritten); -#else - bool success = TryEncodeToUtf8Bytes((uint)scalar, destination.Slice(written), out int bytesWritten); -#endif if (!success) { goto DestinationTooShort; @@ -644,73 +640,5 @@ private static bool TryUnescape(ReadOnlySpan source, Span destinatio DestinationTooShort: return false; } - -#if !NET - /// - /// Copies the UTF-8 code unit representation of this scalar to an output buffer. - /// The buffer must be large enough to hold the required number of s. 
- /// - private static bool TryEncodeToUtf8Bytes(uint scalar, Span utf8Destination, out int bytesWritten) - { - Debug.Assert(JsonHelpers.IsValidUnicodeScalar(scalar)); - - if (scalar < 0x80U) - { - // Single UTF-8 code unit - if ((uint)utf8Destination.Length < 1u) - { - bytesWritten = 0; - return false; - } - - utf8Destination[0] = (byte)scalar; - bytesWritten = 1; - } - else if (scalar < 0x800U) - { - // Two UTF-8 code units - if ((uint)utf8Destination.Length < 2u) - { - bytesWritten = 0; - return false; - } - - utf8Destination[0] = (byte)(0xC0U | (scalar >> 6)); - utf8Destination[1] = (byte)(0x80U | (scalar & 0x3FU)); - bytesWritten = 2; - } - else if (scalar < 0x10000U) - { - // Three UTF-8 code units - if ((uint)utf8Destination.Length < 3u) - { - bytesWritten = 0; - return false; - } - - utf8Destination[0] = (byte)(0xE0U | (scalar >> 12)); - utf8Destination[1] = (byte)(0x80U | ((scalar >> 6) & 0x3FU)); - utf8Destination[2] = (byte)(0x80U | (scalar & 0x3FU)); - bytesWritten = 3; - } - else - { - // Four UTF-8 code units - if ((uint)utf8Destination.Length < 4u) - { - bytesWritten = 0; - return false; - } - - utf8Destination[0] = (byte)(0xF0U | (scalar >> 18)); - utf8Destination[1] = (byte)(0x80U | ((scalar >> 12) & 0x3FU)); - utf8Destination[2] = (byte)(0x80U | ((scalar >> 6) & 0x3FU)); - utf8Destination[3] = (byte)(0x80U | (scalar & 0x3FU)); - bytesWritten = 4; - } - - return true; - } -#endif } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs index 9f1fb42c2bba7e..df76deacf835b9 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs @@ -617,7 +617,7 @@ private static string GetResourceString(ExceptionResource resource, int currentD message = SR.Format(SR.CannotWriteValueAfterPrimitiveOrClose, tokenType); break; case ExceptionResource.CannotWriteWithinString: - 
message = SR.Format(SR.CannotWriteWithinString, tokenType); + message = SR.CannotWriteWithinString; break; default: Debug.Fail($"The ExceptionResource enum value: {resource} is not part of the switch. Add the appropriate case and exception message."); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs index 140ecfb9112314..3010e31cbd6fd8 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Escaping.cs @@ -79,37 +79,42 @@ public static int GetMaxEscapedLength(int textLength, int firstIndexToEscape) return firstIndexToEscape + JsonConstants.MaxExpansionFactorWhileEscaping * (textLength - firstIndexToEscape); } - private static void EscapeString(ReadOnlySpan value, Span destination, JavaScriptEncoder encoder, ref int written) + private static void EscapeString(ReadOnlySpan value, Span destination, JavaScriptEncoder encoder, ref int consumed, ref int written, bool isFinalBlock) { Debug.Assert(encoder != null); - OperationStatus result = encoder.EncodeUtf8(value, destination, out int encoderBytesConsumed, out int encoderBytesWritten); + OperationStatus result = encoder.EncodeUtf8(value, destination, out int encoderBytesConsumed, out int encoderBytesWritten, isFinalBlock); Debug.Assert(result != OperationStatus.DestinationTooSmall); - Debug.Assert(result != OperationStatus.NeedMoreData); + Debug.Assert(result != OperationStatus.NeedMoreData || !isFinalBlock); - if (result != OperationStatus.Done) + if (!(result == OperationStatus.Done || (result == OperationStatus.NeedMoreData && !isFinalBlock))) { ThrowHelper.ThrowArgumentException_InvalidUTF8(value.Slice(encoderBytesWritten)); } - Debug.Assert(encoderBytesConsumed == value.Length); + Debug.Assert(encoderBytesConsumed == value.Length || (result == 
OperationStatus.NeedMoreData && !isFinalBlock)); written += encoderBytesWritten; + consumed += encoderBytesConsumed; } public static void EscapeString(ReadOnlySpan value, Span destination, int indexOfFirstByteToEscape, JavaScriptEncoder? encoder, out int written) + => EscapeString(value, destination, indexOfFirstByteToEscape, encoder, out _, out written, isFinalBlock: true); + + public static void EscapeString(ReadOnlySpan value, Span destination, int indexOfFirstByteToEscape, JavaScriptEncoder? encoder, out int consumed, out int written, bool isFinalBlock = true) { Debug.Assert(indexOfFirstByteToEscape >= 0 && indexOfFirstByteToEscape < value.Length); value.Slice(0, indexOfFirstByteToEscape).CopyTo(destination); written = indexOfFirstByteToEscape; + consumed = indexOfFirstByteToEscape; if (encoder != null) { destination = destination.Slice(indexOfFirstByteToEscape); value = value.Slice(indexOfFirstByteToEscape); - EscapeString(value, destination, encoder, ref written); + EscapeString(value, destination, encoder, ref consumed, ref written, isFinalBlock); } else { @@ -124,12 +129,14 @@ public static void EscapeString(ReadOnlySpan value, Span destination { EscapeNextBytes(val, destination, ref written); indexOfFirstByteToEscape++; + consumed++; } else { destination[written] = val; written++; indexOfFirstByteToEscape++; + consumed++; } } else @@ -137,7 +144,7 @@ public static void EscapeString(ReadOnlySpan value, Span destination // Fall back to default encoder. 
destination = destination.Slice(written); value = value.Slice(indexOfFirstByteToEscape); - EscapeString(value, destination, JavaScriptEncoder.Default, ref written); + EscapeString(value, destination, JavaScriptEncoder.Default, ref consumed, ref written, isFinalBlock); break; } } @@ -190,37 +197,42 @@ private static void EscapeNextBytes(byte value, Span destination, ref int private static bool IsAsciiValue(char value) => value <= LastAsciiCharacter; - private static void EscapeString(ReadOnlySpan value, Span destination, JavaScriptEncoder encoder, ref int written) + private static void EscapeString(ReadOnlySpan value, Span destination, JavaScriptEncoder encoder, ref int consumed, ref int written, bool isFinalBlock) { Debug.Assert(encoder != null); - OperationStatus result = encoder.Encode(value, destination, out int encoderBytesConsumed, out int encoderCharsWritten); + OperationStatus result = encoder.Encode(value, destination, out int encoderBytesConsumed, out int encoderCharsWritten, isFinalBlock); Debug.Assert(result != OperationStatus.DestinationTooSmall); - Debug.Assert(result != OperationStatus.NeedMoreData); + Debug.Assert(result != OperationStatus.NeedMoreData || !isFinalBlock); - if (result != OperationStatus.Done) + if (!(result == OperationStatus.Done || (result == OperationStatus.NeedMoreData && !isFinalBlock))) { ThrowHelper.ThrowArgumentException_InvalidUTF16(value[encoderCharsWritten]); } - Debug.Assert(encoderBytesConsumed == value.Length); + Debug.Assert(encoderBytesConsumed == value.Length || (result == OperationStatus.NeedMoreData && !isFinalBlock)); written += encoderCharsWritten; + consumed += encoderBytesConsumed; } public static void EscapeString(ReadOnlySpan value, Span destination, int indexOfFirstByteToEscape, JavaScriptEncoder? 
encoder, out int written) + => EscapeString(value, destination, indexOfFirstByteToEscape, encoder, out _, out written, isFinalBlock: true); + + public static void EscapeString(ReadOnlySpan value, Span destination, int indexOfFirstByteToEscape, JavaScriptEncoder? encoder, out int consumed, out int written, bool isFinalBlock = true) { Debug.Assert(indexOfFirstByteToEscape >= 0 && indexOfFirstByteToEscape < value.Length); value.Slice(0, indexOfFirstByteToEscape).CopyTo(destination); written = indexOfFirstByteToEscape; + consumed = indexOfFirstByteToEscape; if (encoder != null) { destination = destination.Slice(indexOfFirstByteToEscape); value = value.Slice(indexOfFirstByteToEscape); - EscapeString(value, destination, encoder, ref written); + EscapeString(value, destination, encoder, ref consumed, ref written, isFinalBlock); } else { @@ -235,12 +247,14 @@ public static void EscapeString(ReadOnlySpan value, Span destination { EscapeNextChars(val, destination, ref written); indexOfFirstByteToEscape++; + consumed++; } else { destination[written] = val; written++; indexOfFirstByteToEscape++; + consumed++; } } else @@ -248,7 +262,7 @@ public static void EscapeString(ReadOnlySpan value, Span destination // Fall back to default encoder. 
destination = destination.Slice(written); value = value.Slice(indexOfFirstByteToEscape); - EscapeString(value, destination, JavaScriptEncoder.Default, ref written); + EscapeString(value, destination, JavaScriptEncoder.Default, ref consumed, ref written, isFinalBlock); break; } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs index fca6792eb69f45..b2e05f589fc10c 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs @@ -284,38 +284,6 @@ public static unsafe bool IsValidUtf8String(ReadOnlySpan bytes) #endif } - internal static int GetUtf8CharByteCount(byte firstUtf8Byte) - { - byte upperUtf8Bits = (byte)(firstUtf8Byte & 0xC0); - byte lowerUtf8Bits = (byte)(firstUtf8Byte & 0x30); - - switch (upperUtf8Bits) - { - case 0b00_00_0000: - case 0b01_00_0000: - return 1; - - case 0b11_00_0000: - switch (lowerUtf8Bits) - { - case 0b00_00_0000: - case 0b00_01_0000: - return 2; - - case 0b00_10_0000: - return 3; - - case 0b00_11_0000: - return 4; - } - break; - } - - // This should really only be an extension byte (10xxxxxx), but we'll return here instead of having a case for it to - // keep the compiler happy. 
- return 0; - } - internal static unsafe OperationStatus ToUtf8(ReadOnlySpan source, Span destination, out int written) { #if NET diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs index a23c406ff02c63..6da2117f34ade4 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs @@ -61,7 +61,10 @@ public void WriteCommentValue(ReadOnlySpan value) private void WriteCommentByOptions(ReadOnlySpan value) { - ValidateWritingComment(); + if (!_options.SkipValidation) + { + ValidateWritingComment(); + } if (_options.Indented) { diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index fdb859b3862de1..699cb6d9a097ca 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -10,16 +10,16 @@ namespace System.Text.Json { public sealed partial class Utf8JsonWriter { - [MethodImpl(MethodImplOptions.AggressiveInlining)] private void ValidateNotWithinUnfinalizedString() { + Debug.Assert(!HasPartialCodePoint); + if (_tokenType == StringSegmentSentinel) { ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWriteWithinString, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private void ValidateWritingComment() { // Make sure a new comment is not attempted within an unfinalized string. 
diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs index 6ae59a524fbbb3..5ff4064d2b59bd 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.String.cs @@ -2,20 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers; -using System.Buffers.Text; using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; namespace System.Text.Json { public sealed partial class Utf8JsonWriter { - private const byte HighSurrogateByteSentinel = 0xFF; - private const char HighSurrogateCharSentinel = (char)(HighSurrogateByteSentinel<<8 | HighSurrogateByteSentinel); - - private int _partialStringSegmentChar; - /// /// Writes the pre-encoded text value (as a JSON string) as an element of a JSON array. /// @@ -28,7 +20,7 @@ public void WriteStringValue(JsonEncodedText value) ReadOnlySpan utf8Value = value.EncodedUtf8Bytes; Debug.Assert(utf8Value.Length <= JsonConstants.MaxUnescapedTokenSize); - WriteStringByOptions(utf8Value, JsonTokenType.String); + WriteStringByOptions(utf8Value); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; @@ -81,104 +73,13 @@ public void WriteStringValue(ReadOnlySpan value) { JsonWriterHelper.ValidateValue(value); - WriteStringEscape(value, JsonTokenType.String); + WriteStringEscape(value); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } - /// - /// Writes the text value segment as a partial JSON string. - /// - /// The value to write. - /// Indicates that this is the final segment of the string. - /// - /// Thrown when the specified value is too large. 
- /// - /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). - /// - /// - /// The value is escaped before writing. - /// - public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) - { - JsonWriterHelper.ValidateValue(value); - - JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : StringSegmentSentinel; - - // If we have a high surrogate left over from the last segment we need to make sure it's written out. When - // the first character of the current segment is a low surrogate we'll write as a complete pair, otherwise - // we'll write it on its own. - if (_partialStringSegmentChar != 0) - { - // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. - unsafe - { - fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) - { - Span partialStringSegmentChar = new Span(partialStringSegmentCharPtr, 2); - if (partialStringSegmentChar[1] == HighSurrogateCharSentinel) - { - if (value.Length > 0 && char.IsLowSurrogate(value[0])) - { - partialStringSegmentChar[1] = value[0]; - WriteStringEscape(partialStringSegmentChar, StringSegmentSentinel); - value = value.Slice(1); - } - else - { - // The caller sent a high surrogate on the previous call to this method, but did not provide a - // low surrogate on the this call. We should handle it gracefully. - WriteStringEscape(partialStringSegmentChar.Slice(0, 1), StringSegmentSentinel); - } - } - else - { - // The caller sent a partial UTF-8 sequence on a previous call to WriteStringValueSegment(byte) but - // switched to calling WriteStringValueSegment(char) on this call. We should handle this gracefully. 
- Span partialStringSegmentUtf8Bytes = MemoryMarshal.Cast(partialStringSegmentChar); - WriteStringEscape(partialStringSegmentUtf8Bytes.Slice(0, partialStringSegmentUtf8Bytes[3]), StringSegmentSentinel); - } - } - } - - _partialStringSegmentChar = 0; - } - - // If the last character of the segment is a high surrogate we need to cache it and write the rest of the - // string. The cached value will be written when the next segment is written. - if (!isFinalSegment && value.Length > 0) - { - char finalChar = value[value.Length - 1]; - if (char.IsHighSurrogate(finalChar)) - { - // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. - unsafe - { - fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) - { - Span partialStringSegmentChar = new Span(partialStringSegmentCharPtr, 2); - partialStringSegmentChar[0] = finalChar; - partialStringSegmentChar[1] = HighSurrogateCharSentinel; - } - } - - value = value.Slice(0, value.Length - 1); - } - } - - WriteStringEscape(value, nextTokenType); - - if (isFinalSegment) - { - SetFlagToAddListSeparatorBeforeNextItem(); - } - - _tokenType = nextTokenType; - } - - private void WriteStringEscape(ReadOnlySpan value, JsonTokenType stringTokenType) + private void WriteStringEscape(ReadOnlySpan value) { int valueIdx = JsonWriterHelper.NeedsEscaping(value, _options.Encoder); @@ -186,33 +87,33 @@ private void WriteStringEscape(ReadOnlySpan value, JsonTokenType stringTok if (valueIdx != -1) { - WriteStringEscapeValue(value, valueIdx, stringTokenType); + WriteStringEscapeValue(value, valueIdx); } else { - WriteStringByOptions(value, stringTokenType); + WriteStringByOptions(value); } } - private void WriteStringByOptions(ReadOnlySpan value, JsonTokenType stringTokenType) + private void WriteStringByOptions(ReadOnlySpan value) { - if (!_options.SkipValidation && _tokenType != StringSegmentSentinel) + if (!_options.SkipValidation) { ValidateWritingValue(); } if (_options.Indented) 
{ - WriteStringIndented(value, stringTokenType); + WriteStringIndented(value); } else { - WriteStringMinimized(value, stringTokenType); + WriteStringMinimized(value); } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) + private void WriteStringMinimized(ReadOnlySpan escapedValue) { Debug.Assert(escapedValue.Length < (int.MaxValue / JsonConstants.MaxExpansionFactorWhileTranscoding) - 3); @@ -227,26 +128,19 @@ private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (_currentDepth < 0) { - if (_currentDepth < 0) - { - output[BytesPending++] = JsonConstants.ListSeparator; - } - - output[BytesPending++] = JsonConstants.Quote; + output[BytesPending++] = JsonConstants.ListSeparator; } + output[BytesPending++] = JsonConstants.Quote; TranscodeAndWrite(escapedValue, output); - if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) - { - output[BytesPending++] = JsonConstants.Quote; - } + output[BytesPending++] = JsonConstants.Quote; } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) + private void WriteStringIndented(ReadOnlySpan escapedValue) { int indent = Indentation; Debug.Assert(indent <= _indentLength * _options.MaxDepth); @@ -264,38 +158,29 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (_currentDepth < 0) { - if (_currentDepth < 0) - { - output[BytesPending++] = JsonConstants.ListSeparator; - } + output[BytesPending++] = JsonConstants.ListSeparator; + } - if (_tokenType != JsonTokenType.PropertyName && _tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (_tokenType != JsonTokenType.PropertyName) + { + if (_tokenType != 
JsonTokenType.None) { - if (_tokenType != JsonTokenType.None) - { - WriteNewLine(output); - } - WriteIndentation(output.Slice(BytesPending), indent); - BytesPending += indent; + WriteNewLine(output); } - - output[BytesPending++] = JsonConstants.Quote; + WriteIndentation(output.Slice(BytesPending), indent); + BytesPending += indent; } + output[BytesPending++] = JsonConstants.Quote; + TranscodeAndWrite(escapedValue, output); - if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) - { - output[BytesPending++] = JsonConstants.Quote; - } + output[BytesPending++] = JsonConstants.Quote; } - private void WriteStringEscapeValue( - ReadOnlySpan value, - int firstEscapeIndexVal, - JsonTokenType stringTokenType) + private void WriteStringEscapeValue(ReadOnlySpan value, int firstEscapeIndexVal) { Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= value.Length); Debug.Assert(firstEscapeIndexVal >= 0 && firstEscapeIndexVal < value.Length); @@ -310,7 +195,7 @@ private void WriteStringEscapeValue( JsonWriterHelper.EscapeString(value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int written); - WriteStringByOptions(escapedValue.Slice(0, written), stringTokenType); + WriteStringByOptions(escapedValue.Slice(0, written)); if (valueArray != null) { @@ -335,137 +220,13 @@ public void WriteStringValue(ReadOnlySpan utf8Value) { JsonWriterHelper.ValidateValue(utf8Value); - WriteStringEscape(utf8Value, JsonTokenType.String); + WriteStringEscape(utf8Value); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } - /// - /// Writes the UTF-8 text value segment as a partial JSON string. - /// - /// The UTF-8 encoded value to be written as a JSON string element of a JSON array. - /// Indicates that this is the final segment of the string. - /// - /// Thrown when the specified value is too large. - /// - /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). 
- /// - /// - /// The value is escaped before writing. - /// - public void WriteStringValueSegment(ReadOnlySpan utf8Value, bool isFinalSegment) - { - JsonWriterHelper.ValidateValue(utf8Value); - - JsonTokenType nextTokenType = isFinalSegment ? JsonTokenType.String : Utf8JsonWriter.StringSegmentSentinel; - - if (_partialStringSegmentChar != 0) - { - // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. - unsafe - { - fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) - { - Span partialStringSegmentUtf8Bytes = new Span(partialStringSegmentCharPtr, 4); - if (partialStringSegmentUtf8Bytes[3] == HighSurrogateByteSentinel) - { - // The caller sent a high surrogate on a previous call to WriteStringValueSegment(char) but switched - // to calling WriteStringValueSegment(byte) on this call. We'll handle this gracefully by writing the - // high surrogate on its own. - Span surrogatePair = MemoryMarshal.Cast(partialStringSegmentUtf8Bytes); - WriteStringEscape(surrogatePair.Slice(0, 1), StringSegmentSentinel); - } - else - { - // Attempt to complete the UTF-8 sequence from the previous segment. - int requiredByteCount = JsonWriterHelper.GetUtf8CharByteCount(partialStringSegmentUtf8Bytes[0]); - int remainingByteCount = requiredByteCount - partialStringSegmentUtf8Bytes[3]; - int availableByteCount = Math.Min(remainingByteCount, utf8Value.Length); - - for (int i = 0; i < availableByteCount; i++) - { - int nextByteIndex = partialStringSegmentUtf8Bytes[3] + i; - - byte remainingByte = utf8Value[0]; - if (JsonWriterHelper.GetUtf8CharByteCount(remainingByte) != 0) - { - // Invalid UTF-8 sequence! Write what we cached without trying to complete the sequence. 
- requiredByteCount = nextByteIndex; - remainingByteCount = 0; - break; - } - - partialStringSegmentUtf8Bytes[nextByteIndex] = remainingByte; - remainingByteCount--; - utf8Value = utf8Value.Slice(1); - } - - if (isFinalSegment || remainingByteCount == 0) - { - WriteStringEscape(partialStringSegmentUtf8Bytes.Slice(0, requiredByteCount), StringSegmentSentinel); - } - else - { - // We didn't have enough to complete the sequence, so update the count of bytes we do have so that - // the next iteration will pick up where we left off. - partialStringSegmentUtf8Bytes[3] = (byte)(requiredByteCount - remainingByteCount); - } - } - } - } - } - - if (!isFinalSegment && utf8Value.Length > 0) - { - int expectedUtf8ByteCount = 0; - int startOfPartialUtf8Sequence = -1; - for (int i = utf8Value.Length - 1; i >= utf8Value.Length - 3; i--) - { - expectedUtf8ByteCount = JsonWriterHelper.GetUtf8CharByteCount(utf8Value[i]); - if (expectedUtf8ByteCount == 0) - { - continue; - } - - if (expectedUtf8ByteCount > 1) - { - startOfPartialUtf8Sequence = i; - } - - break; - } - - if (startOfPartialUtf8Sequence >= 0) - { - // Unfortunately we cannot use MemoryMarshal.CreateSpan here because it is not available in netstandard2.0. 
- unsafe - { - fixed (int* partialStringSegmentCharPtr = &_partialStringSegmentChar) - { - Span partialStringSegmentUtf8Bytes = new Span(partialStringSegmentCharPtr, 4); - ReadOnlySpan bytesToWrite = utf8Value.Slice(startOfPartialUtf8Sequence); - bytesToWrite.CopyTo(partialStringSegmentUtf8Bytes); - partialStringSegmentUtf8Bytes[3] = (byte)bytesToWrite.Length; - } - } - - utf8Value = utf8Value.Slice(0, startOfPartialUtf8Sequence); - } - } - - WriteStringEscape(utf8Value, nextTokenType); - - if (isFinalSegment) - { - SetFlagToAddListSeparatorBeforeNextItem(); - } - - _tokenType = nextTokenType; - } - - private void WriteStringEscape(ReadOnlySpan utf8Value, JsonTokenType stringTokenType) + private void WriteStringEscape(ReadOnlySpan utf8Value) { int valueIdx = JsonWriterHelper.NeedsEscaping(utf8Value, _options.Encoder); @@ -473,33 +234,33 @@ private void WriteStringEscape(ReadOnlySpan utf8Value, JsonTokenType strin if (valueIdx != -1) { - WriteStringEscapeValue(utf8Value, valueIdx, stringTokenType); + WriteStringEscapeValue(utf8Value, valueIdx); } else { - WriteStringByOptions(utf8Value, stringTokenType); + WriteStringByOptions(utf8Value); } } - private void WriteStringByOptions(ReadOnlySpan utf8Value, JsonTokenType stringTokenType) + private void WriteStringByOptions(ReadOnlySpan utf8Value) { - if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (!_options.SkipValidation) { ValidateWritingValue(); } if (_options.Indented) { - WriteStringIndented(utf8Value, stringTokenType); + WriteStringIndented(utf8Value); } else { - WriteStringMinimized(utf8Value, stringTokenType); + WriteStringMinimized(utf8Value); } } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) + private void WriteStringMinimized(ReadOnlySpan escapedValue) { Debug.Assert(escapedValue.Length < int.MaxValue - 3); @@ -513,26 +274,20 @@ private void 
WriteStringMinimized(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (_currentDepth < 0) { - if (_currentDepth < 0) - { - output[BytesPending++] = JsonConstants.ListSeparator; - } - output[BytesPending++] = JsonConstants.Quote; + output[BytesPending++] = JsonConstants.ListSeparator; } + output[BytesPending++] = JsonConstants.Quote; escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) - { - output[BytesPending++] = JsonConstants.Quote; - } + output[BytesPending++] = JsonConstants.Quote; } // TODO: https://github.com/dotnet/runtime/issues/29293 - private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType stringTokenType) + private void WriteStringIndented(ReadOnlySpan escapedValue) { int indent = Indentation; Debug.Assert(indent <= _indentLength * _options.MaxDepth); @@ -549,36 +304,30 @@ private void WriteStringIndented(ReadOnlySpan escapedValue, JsonTokenType Span output = _memory.Span; - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + if (_currentDepth < 0) { - if (_currentDepth < 0) - { - output[BytesPending++] = JsonConstants.ListSeparator; - } + output[BytesPending++] = JsonConstants.ListSeparator; + } - if (_tokenType != JsonTokenType.PropertyName) + if (_tokenType != JsonTokenType.PropertyName) + { + if (_tokenType != JsonTokenType.None) { - if (_tokenType != JsonTokenType.None) - { - WriteNewLine(output); - } - WriteIndentation(output.Slice(BytesPending), indent); - BytesPending += indent; + WriteNewLine(output); } - - output[BytesPending++] = JsonConstants.Quote; + WriteIndentation(output.Slice(BytesPending), indent); + BytesPending += indent; } + output[BytesPending++] = JsonConstants.Quote; + escapedValue.CopyTo(output.Slice(BytesPending)); BytesPending += escapedValue.Length; - if (stringTokenType != Utf8JsonWriter.StringSegmentSentinel) - { - 
output[BytesPending++] = JsonConstants.Quote; - } + output[BytesPending++] = JsonConstants.Quote; } - private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int firstEscapeIndexVal, JsonTokenType stringTokenType) + private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int firstEscapeIndexVal) { Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= utf8Value.Length); Debug.Assert(firstEscapeIndexVal >= 0 && firstEscapeIndexVal < utf8Value.Length); @@ -593,7 +342,7 @@ private void WriteStringEscapeValue(ReadOnlySpan utf8Value, int firstEscap JsonWriterHelper.EscapeString(utf8Value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int written); - WriteStringByOptions(escapedValue.Slice(0, written), stringTokenType); + WriteStringByOptions(escapedValue.Slice(0, written)); if (valueArray != null) { @@ -609,7 +358,7 @@ internal void WriteNumberValueAsStringUnescaped(ReadOnlySpan utf8Value) { // The value has been validated prior to calling this method. - WriteStringByOptions(utf8Value, JsonTokenType.String); + WriteStringByOptions(utf8Value); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs new file mode 100644 index 00000000000000..68075f543417ad --- /dev/null +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -0,0 +1,496 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Buffers.Text; +using System.ComponentModel; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace System.Text.Json +{ + public sealed partial class Utf8JsonWriter + { + /// + /// Writes the text value segment as a partial JSON string. + /// + /// The value to write. + /// Indicates that this is the final segment of the string. + /// + /// Thrown when the specified value is too large. + /// + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + /// + /// The value is escaped before writing. + /// + public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) + { + JsonWriterHelper.ValidateValue(value); + + if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) + { + ValidateWritingValue(); + } + + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + { + WriteStringSegmentProlog(); + _tokenType = Utf8JsonWriter.StringSegmentSentinel; + } + + // The steps to write a string segment are to complete the previous partial code point + // and escape either of which might not be required so there is a fast path for each of these steps. 
+ if (HasPartialCodePoint) + { + WriteStringSegmentWithLeftover(value, isFinalSegment); + } + else + { + WriteStringSegmentEscape(value, isFinalSegment); + } + + if (isFinalSegment) + { + WriteStringSegmentEpilog(); + SetFlagToAddListSeparatorBeforeNextItem(); + _tokenType = JsonTokenType.String; + } + } + + private void WriteStringSegmentWithLeftover(scoped ReadOnlySpan value, bool isFinalSegment) + { + Span combinedBuffer = stackalloc char[2]; + + GetPartialUtf16CodePoint(out scoped ReadOnlySpan partialCodePointBuffer); + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, value, combinedBuffer)); + + switch (Rune.DecodeFromUtf16(combinedBuffer, out _, out int charsConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(value.Length + partialCodePointBuffer.Length < 2); + Debug.Assert(charsConsumed == value.Length + partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + value = combinedBuffer.Slice(0, charsConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(charsConsumed > partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + // Divide up the code point chars into its own buffer and the remainder of the input buffer. + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(charsConsumed >= partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + ClearPartialCodePoint(); + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. 
+ // Because we have validated above that partialCodePointBuffer will be the next consumed chars during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having ['\uD800', 'C'], where the validation above would have needed all both code units + // to determine that only the first unit should be consumed (as invalid). So this method will get only ['\uD800']. + // Because we know more data will not be able to complete this code point, we need to pass isFinalSegment = true + // to ensure that the encoder consumes this data eagerly instead of leaving it and returning NeedsMoreData. + WriteStringSegmentEscape(partialCodePointBuffer, true); + + WriteStringSegmentEscape(value, isFinalSegment); + } + + private void WriteStringSegmentEscape(ReadOnlySpan value, bool isFinalSegment) + { + if (value.IsEmpty) return; + + int escapeIdx = JsonWriterHelper.NeedsEscaping(value, _options.Encoder); + if (escapeIdx != -1) + { + WriteStringSegmentEscapeValue(value, escapeIdx, isFinalSegment); + } + else + { + WriteStringSegmentData(value); + } + } + + private void WriteStringSegmentEscapeValue(ReadOnlySpan value, int firstEscapeIndexVal, bool isFinalSegment) + { + Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= value.Length); + Debug.Assert(firstEscapeIndexVal >= 0 && firstEscapeIndexVal < value.Length); + + char[]? valueArray = null; + + int length = JsonWriterHelper.GetMaxEscapedLength(value.Length, firstEscapeIndexVal); + + Span escapedValue = length <= JsonConstants.StackallocCharThreshold ? 
+ stackalloc char[JsonConstants.StackallocCharThreshold] : + (valueArray = ArrayPool.Shared.Rent(length)); + + JsonWriterHelper.EscapeString(value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int consumed, out int written, isFinalSegment); + + WriteStringSegmentData(escapedValue.Slice(0, written)); + + Debug.Assert(consumed == value.Length || !isFinalSegment); + if (value.Length != consumed) + { + Debug.Assert(!isFinalSegment); + Debug.Assert(value.Length - consumed < 2); + SetPartialUtf16CodePoint(value.Slice(consumed)); + } + + if (valueArray != null) + { + ArrayPool.Shared.Return(valueArray); + } + } + + private void WriteStringSegmentData(ReadOnlySpan escapedValue) + { + Debug.Assert(escapedValue.Length < (int.MaxValue / JsonConstants.MaxExpansionFactorWhileTranscoding)); + + int requiredBytes = escapedValue.Length * JsonConstants.MaxExpansionFactorWhileTranscoding; + + if (_memory.Length - BytesPending < requiredBytes) + { + Grow(requiredBytes); + } + + Span output = _memory.Span; + + TranscodeAndWrite(escapedValue, output); + } + + /// + /// Writes the UTF-8 text value segment as a partial JSON string. + /// + /// The UTF-8 encoded value to be written as a JSON string element of a JSON array. + /// Indicates that this is the final segment of the string. + /// + /// Thrown when the specified value is too large. + /// + /// + /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// + /// + /// The value is escaped before writing. 
+ /// + public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegment) + { + JsonWriterHelper.ValidateValue(value); + + if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) + { + ValidateWritingValue(); + } + + if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) + { + WriteStringSegmentProlog(); + _tokenType = Utf8JsonWriter.StringSegmentSentinel; + } + + // The steps to write a string segment are to complete the previous partial code point + // and escape either of which might not be required so there is a fast path for each of these steps. + if (HasPartialCodePoint) + { + WriteStringSegmentWithLeftover(value, isFinalSegment); + } + else + { + WriteStringSegmentEscape(value, isFinalSegment); + } + + if (isFinalSegment) + { + WriteStringSegmentEpilog(); + SetFlagToAddListSeparatorBeforeNextItem(); + _tokenType = JsonTokenType.String; + } + } + + private void WriteStringSegmentWithLeftover(scoped ReadOnlySpan utf8Value, bool isFinalSegment) + { + Span combinedBuffer = stackalloc byte[4]; + + GetPartialUtf8CodePoint(out scoped ReadOnlySpan partialCodePointBuffer); + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, utf8Value, combinedBuffer)); + + switch (Rune.DecodeFromUtf8(combinedBuffer, out _, out int bytesConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(utf8Value.Length + partialCodePointBuffer.Length < 4); + Debug.Assert(bytesConsumed == utf8Value.Length + partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + utf8Value = combinedBuffer.Slice(0, bytesConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(bytesConsumed > partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + // Divide up the code point bytes into its own buffer and the remainder of the input buffer. 
+ utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(bytesConsumed >= partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + ClearPartialCodePoint(); + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. + // Because we have validated above that partialCodePointBuffer will be the next consumed bytes during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having [<3-length prefix code unit>, , ], where the validation + // above would have needed all 3 code units to determine that only the first 2 units should be consumed (as invalid). + // So this method will get only <3-size prefix code unit>. Because we know more data will not be able + // to complete this code point, we need to pass isFinalSegment = true to ensure that the encoder consumes this data eagerly + // instead of leaving it and returning NeedsMoreData. 
+ WriteStringSegmentEscape(partialCodePointBuffer, true); + + WriteStringSegmentEscape(utf8Value, isFinalSegment); + } + + private void WriteStringSegmentEscape(ReadOnlySpan utf8Value, bool isFinalSegment) + { + if (utf8Value.IsEmpty) return; + + int escapeIdx = JsonWriterHelper.NeedsEscaping(utf8Value, _options.Encoder); + if (escapeIdx != -1) + { + WriteStringSegmentEscapeValue(utf8Value, escapeIdx, isFinalSegment); + } + else + { + WriteStringSegmentData(utf8Value); + } + } + + private void WriteStringSegmentEscapeValue(ReadOnlySpan utf8Value, int firstEscapeIndexVal, bool isFinalSegment) + { + Debug.Assert(int.MaxValue / JsonConstants.MaxExpansionFactorWhileEscaping >= utf8Value.Length); + Debug.Assert(firstEscapeIndexVal >= 0 && firstEscapeIndexVal < utf8Value.Length); + byte[]? valueArray = null; + int length = JsonWriterHelper.GetMaxEscapedLength(utf8Value.Length, firstEscapeIndexVal); + Span escapedValue = length <= JsonConstants.StackallocByteThreshold ? + stackalloc byte[JsonConstants.StackallocByteThreshold] : + (valueArray = ArrayPool.Shared.Rent(length)); + + JsonWriterHelper.EscapeString(utf8Value, escapedValue, firstEscapeIndexVal, _options.Encoder, out int consumed, out int written, isFinalSegment); + + WriteStringSegmentData(escapedValue.Slice(0, written)); + + Debug.Assert(consumed == utf8Value.Length || !isFinalSegment); + if (utf8Value.Length != consumed) + { + Debug.Assert(!isFinalSegment); + Debug.Assert(utf8Value.Length - consumed < 4); + SetPartialUtf8CodePoint(utf8Value.Slice(consumed)); + } + + if (valueArray != null) + { + ArrayPool.Shared.Return(valueArray); + } + } + + private void WriteStringSegmentData(ReadOnlySpan escapedValue) + { + Debug.Assert(escapedValue.Length < int.MaxValue - 3); + + int requiredBytes = escapedValue.Length; + + if (_memory.Length - BytesPending < requiredBytes) + { + Grow(requiredBytes); + } + + Span output = _memory.Span; + + escapedValue.CopyTo(output.Slice(BytesPending)); + BytesPending += 
escapedValue.Length; + } + + private void WriteStringSegmentProlog() + { + if (_options.Indented) + { + WriteStringSegmentIndentedProlog(); + } + else + { + WriteStringSegmentMinimizedProlog(); + } + } + + private void WriteStringSegmentIndentedProlog() + { + int indent = Indentation; + Debug.Assert(indent <= _indentLength * _options.MaxDepth); + + // One quote and optionally 1 indent, 1 list separator and 1-2 bytes for new line + int bytesRequired = 1 + indent + 1 + _newLineLength; + if (_memory.Length - BytesPending < bytesRequired) + { + Grow(bytesRequired); + } + + Span output = _memory.Span; + + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } + + if (_tokenType != JsonTokenType.PropertyName) + { + if (_tokenType != JsonTokenType.None) + { + WriteNewLine(output); + } + WriteIndentation(output.Slice(BytesPending), indent); + BytesPending += indent; + } + + output[BytesPending++] = JsonConstants.Quote; + } + + private void WriteStringSegmentMinimizedProlog() + { + // One quote and optionally 1 list separator + int bytesRequired = 2; + if (_memory.Length - BytesPending < bytesRequired) + { + Grow(bytesRequired); + } + + Span output = _memory.Span; + + if (_currentDepth < 0) + { + output[BytesPending++] = JsonConstants.ListSeparator; + } + + output[BytesPending++] = JsonConstants.Quote; + } + + private void WriteStringSegmentEpilog() + { + if (_memory.Length == BytesPending) + { + Grow(1); + } + + _memory.Span[BytesPending++] = JsonConstants.Quote; + } + +#if NET + [InlineArray(4)] + private struct Inline4ByteArray + { + public byte byte0; + } +#endif + + private void GetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) + { + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 4); + + byte length = partialCodePointBytes[3]; + Debug.Assert(0 <= length && length <= 4); + + codePointBytes = partialCodePointBytes.Slice(0, length); + } + + private void 
GetPartialUtf16CodePoint(out ReadOnlySpan codePointChars) + { + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 4); + + byte length = partialCodePointBytes[3]; + Debug.Assert(length == 2 || length == 0); + + codePointChars = MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); + } + + private void SetPartialUtf8CodePoint(ReadOnlySpan bytes) + { + Debug.Assert(bytes.Length <= 3); + + Span partialCodePointBytes = PartialCodePointRaw; + + bytes.CopyTo(partialCodePointBytes); + partialCodePointBytes[3] = (byte)bytes.Length; + } + + private void SetPartialUtf16CodePoint(ReadOnlySpan bytes) + { + Debug.Assert(bytes.Length <= 1); + + Span partialCodePointBytes = PartialCodePointRaw; + + bytes.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); + partialCodePointBytes[3] = (byte)(2 * bytes.Length); + } + + private bool HasPartialCodePoint => PartialCodePointRaw[3] != 0; + + private void ClearPartialCodePoint() => PartialCodePointRaw[3] = 0; + + /// + /// Given a byte buffer <paramref name="dest"/>, concatenates as much of <paramref name="srcLeft"/> followed + /// by <paramref name="srcRight"/> into it as will fit, then returns the total number of bytes copied. 
+ /// + private static int ConcatInto(ReadOnlySpan srcLeft, ReadOnlySpan srcRight, Span dest) + { + int total = 0; + for (int i = 0; i < srcLeft.Length; i++) + { + if ((uint)total >= (uint)dest.Length) + { + goto Finish; + } + else + { + dest[total++] = srcLeft[i]; + } + } + for (int i = 0; i < srcRight.Length; i++) + { + if ((uint)total >= (uint)dest.Length) + { + goto Finish; + } + else + { + dest[total++] = srcRight[i]; + } + } + Finish: + return total; + } + } +} diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index 38f329fee44bb9..cf670f4d344562 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -7,6 +7,7 @@ using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; +using System.ComponentModel; #if !NET using System.Runtime.InteropServices; @@ -51,6 +52,14 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable private JsonTokenType _tokenType; private BitStack _bitStack; +#if NET + private Inline4ByteArray _partialCodePoint; + private Span PartialCodePointRaw => _partialCodePoint; +#else + private byte[]? _partialCodePoint; + private Span PartialCodePointRaw => _partialCodePoint ??= new byte[4]; +#endif + // The highest order bit of _currentDepth is used to discern whether we are writing the first item in a list or not. // if (_currentDepth >> 31) == 1, add a list separator before writing the item // else, no list separator is needed since we are writing the first item. 
@@ -274,6 +283,8 @@ private void ResetHelper() _currentDepth = default; _bitStack = default; + + ClearPartialCodePoint(); } private void CheckNotDisposed() diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/JsonTestHelper.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/JsonTestHelper.cs index a3c139ee4831ee..7979d4eca35e3e 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/JsonTestHelper.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/JsonTestHelper.cs @@ -4,6 +4,7 @@ using System.Buffers; using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; using System.Text.Json.Tests; @@ -694,7 +695,13 @@ public static string GetCompactString(string jsonString) } } - public static void AssertContents(string expectedValue, ArrayBufferWriter buffer, bool skipSpecialRules = false) + public static void AssertContents( +#if NET + [StringSyntax(StringSyntaxAttribute.Json)] +#endif + string expectedValue, + ArrayBufferWriter buffer, + bool skipSpecialRules = false) { string value = Encoding.UTF8.GetString( buffer.WrittenSpan @@ -706,14 +713,26 @@ public static void AssertContents(string expectedValue, ArrayBufferWriter AssertContentsAgainstJsonNet(expectedValue, value, skipSpecialRules); } - public static void AssertContents(string expectedValue, MemoryStream stream, bool skipSpecialRules = false) + public static void AssertContents( +#if NET + [StringSyntax(StringSyntaxAttribute.Json)] +#endif + string expectedValue, + MemoryStream stream, + bool skipSpecialRules = false) { string value = Encoding.UTF8.GetString(stream.ToArray()); AssertContentsAgainstJsonNet(expectedValue, value, skipSpecialRules); } - public static void AssertContentsNotEqual(string expectedValue, ArrayBufferWriter buffer, bool skipSpecialRules = false) + public static void AssertContentsNotEqual( +#if NET + 
[StringSyntax(StringSyntaxAttribute.Json)] +#endif + string expectedValue, + ArrayBufferWriter buffer, + bool skipSpecialRules = false) { string value = Encoding.UTF8.GetString( buffer.WrittenSpan @@ -725,12 +744,24 @@ public static void AssertContentsNotEqual(string expectedValue, ArrayBufferWrite AssertContentsNotEqualAgainstJsonNet(expectedValue, value, skipSpecialRules); } - public static void AssertContentsAgainstJsonNet(string expectedValue, string value, bool skipSpecialRules) + public static void AssertContentsAgainstJsonNet( +#if NET + [StringSyntax(StringSyntaxAttribute.Json)] +#endif + string expectedValue, + string value, + bool skipSpecialRules) { Assert.Equal(expectedValue.NormalizeToJsonNetFormat(skipSpecialRules), value.NormalizeToJsonNetFormat(skipSpecialRules), ignoreLineEndingDifferences: true); } - public static void AssertContentsNotEqualAgainstJsonNet(string expectedValue, string value, bool skipSpecialRules) + public static void AssertContentsNotEqualAgainstJsonNet( +#if NET + [StringSyntax(StringSyntaxAttribute.Json)] +#endif + string expectedValue, + string value, + bool skipSpecialRules) { Assert.NotEqual(expectedValue.NormalizeToJsonNetFormat(skipSpecialRules), value.NormalizeToJsonNetFormat(skipSpecialRules)); } diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/System.Text.Json.Tests.csproj b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/System.Text.Json.Tests.csproj index 17437b0b6c9a8f..b70e73d3b83254 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/System.Text.Json.Tests.csproj +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/System.Text.Json.Tests.csproj @@ -245,6 +245,7 @@ + diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs new file mode 100644 index 00000000000000..5f7028ed5a4490 --- 
/dev/null +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -0,0 +1,978 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + + + +using System.Buffers; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; +using System.Text.Encodings.Web; +using Xunit; + +namespace System.Text.Json.Tests +{ + public partial class Utf8JsonWriterTests + { + public static IEnumerable BasicStringJsonOptions => + from indented in new[] { true, false } + from encoding in new[] { JavaScriptEncoder.Default, JavaScriptEncoder.UnsafeRelaxedJsonEscaping, JavaScriptEncoder.Create() } + select new JsonWriterOptions + { + Indented = indented, + Encoder = encoding + }; + + public static IEnumerable BasicStringJsonOptions_TestData => + from option in BasicStringJsonOptions + select new object[] { option }; + + public static IEnumerable InvalidUtf16Data() + { + char[][] input = [ + // Unpaired low surrogate + ['\uDC00'], + + // Unpaired high surrogate + ['\uD800'], + ['\uD800', '\uD800'], + + // Two unpaired low surrogates + ['a', '\uDC00', '\uDC00'], + ]; + + // Separate each case with a character + yield return input.SelectMany(arr => arr.Concat(['j'])).ToArray(); + + // Test without separation + yield return input.SelectMany(arr => arr).ToArray(); + } + + public static IEnumerable InvalidUtf16DataWithOptions_TestData => + from data in InvalidUtf16Data() + from option in BasicStringJsonOptions + select new object[] { data, option }; + + [Theory] + [MemberData(nameof(InvalidUtf16DataWithOptions_TestData))] + public static void WriteStringValueSegment_Utf16_SplitCodePointsReplacement(char[] inputArr, JsonWriterOptions options) + { + var expectedChars = new char[inputArr.Length * MaxExpansionFactorWhileEscaping]; + + options.Encoder.Encode(inputArr, expectedChars, out int charsConsumed, out int 
charsWritten); + Assert.Equal(inputArr.Length, charsConsumed); + + SplitCodePointsHelper(inputArr, $@"""{new string(expectedChars, 0, charsWritten)}""", options); + } + + public static IEnumerable InvalidUtf8Data() + { + byte[][] input = [ + // Continuation without start + [0b10_111111], + + // 2-byte sequence containing < 2 bytes + [0b110_11111], + + // 2-byte overlong + [0b110_00000, 0b10_111111], + + // 3-byte sequence containing < 3 bytes + [0b1110_1111], + // For some reason an invalid 3-byte code point is only replaced + // by one replacement character unlike in the 4-byte case + [0b1110_1111, 0b10_111111], + + // 3-byte overlong + [0b1110_0000, 0b10_000000, 0b10_000000], + + // 4-byte sequence containing < 4 bytes + [0b11110_111], + [0b11110_111, 0b10_111111], + [0b11110_111, 0b10_111111, 0b10_111111], + + // 4-byte overlong + [0b11110_000, 0b10_000000, 0b10_000000, 0b10_000000], + ]; + + // Separate each case with a character + yield return input.SelectMany(arr => arr.Concat([(byte)'j'])).ToArray(); + + // Test without separation + yield return input.SelectMany(arr => arr).ToArray(); + } + + public static IEnumerable InvalidUtf8DataWithOptions_TestData => + from data in InvalidUtf8Data() + from option in BasicStringJsonOptions + select new object[] { data, option }; + + [Theory] + [MemberData(nameof(InvalidUtf8DataWithOptions_TestData))] + public static void WriteStringValueSegment_Utf8_SplitCodePointsReplacement(byte[] inputArr, JsonWriterOptions options) + { + var expectedBytes = new byte[inputArr.Length * MaxExpansionFactorWhileEscaping]; + + options.Encoder.EncodeUtf8(inputArr, expectedBytes, out int bytesConsumed, out int bytesWritten); + Assert.Equal(inputArr.Length, bytesConsumed); + + string expectedString = $@"""{Encoding.UTF8.GetString(expectedBytes, 0, bytesWritten)}"""; + + SplitCodePointsHelper(inputArr, expectedString, options); + } + + private static void SplitCodePointsHelper( + T[] inputArr, + string expected, + JsonWriterOptions options) + 
where T : struct + { + SplitCodePointsHelper(inputArr, options, output => JsonTestHelper.AssertContents(expected, output)); + } + + private static void SplitCodePointsHelper( + T[] inputArr, + JsonWriterOptions options, + Action> assert) + where T : struct + { + SplitCodePointsHelper(inputArr.AsSpan(), options, assert); + } + + private static void SplitCodePointsHelper( + ReadOnlySpan inputArr, + JsonWriterOptions options, + Action> assert) + where T : struct + { + ReadOnlySpan input = inputArr; + + // Sanity check with non-segmented API + { + var output = new ArrayBufferWriter(1024); + + using (var writer = new Utf8JsonWriter(output, options)) + { + WriteStringValueHelper(writer, input); + writer.Flush(); + } + + assert(output); + } + + for (int splitIndex = 0; splitIndex <= input.Length; splitIndex++) + { + var output = new ArrayBufferWriter(1024); + + using (var writer = new Utf8JsonWriter(output, options)) + { + WriteStringValueSegmentsHelper(writer, input.Slice(0, splitIndex), input.Slice(splitIndex)); + writer.Flush(); + } + + assert(output); + } + + for (int splitIndex = 0; splitIndex <= input.Length; splitIndex++) + { + for (int splitIndex2 = splitIndex; splitIndex2 <= input.Length; splitIndex2++) + { + var output = new ArrayBufferWriter(1024); + + using (var writer = new Utf8JsonWriter(output, options)) + { + WriteStringValueSegmentsHelper(writer, input.Slice(0, splitIndex), input.Slice(splitIndex, splitIndex2 - splitIndex), input.Slice(splitIndex2)); + writer.Flush(); + } + + assert(output); + } + } + } + + [Theory] + [MemberData(nameof(BasicStringJsonOptions_TestData))] + public static void WriteStringValueSegment_Utf16_Basic(JsonWriterOptions options) + { + WriteStringValueSegment_BasicHelper( + "Hello".AsSpan(), + " Wor".AsSpan(), + "ld!".AsSpan(), + options.Encoder.Encode("Hello"), + options.Encoder.Encode(" Wor"), + options.Encoder.Encode("ld!"), + options); + } + + [Theory] + [MemberData(nameof(BasicStringJsonOptions_TestData))] + public static void 
WriteStringValueSegment_Utf8_Basic(JsonWriterOptions options) + { + WriteStringValueSegment_BasicHelper( + "Hello"u8, + " Wor"u8, + "ld!"u8, + options.Encoder.Encode("Hello"), + options.Encoder.Encode(" Wor"), + options.Encoder.Encode("ld!"), + options); + } + + private static void WriteStringValueSegment_BasicHelper( + ReadOnlySpan segment1, + ReadOnlySpan segment2, + ReadOnlySpan segment3, + string expected1, + string expected2, + string expected3, + JsonWriterOptions options) + where T : struct + { + string indent = options.Indented ? new string(options.IndentCharacter, options.IndentSize) : ""; + string n = options.Indented ? options.NewLine : ""; + string ni = n + indent; + string nii = ni + indent; + string s = options.Indented ? " " : ""; + string e1 = '"' + expected1 + '"'; + string e2 = '"' + expected1 + expected2 + '"'; + string e3 = '"' + expected1 + expected2 + expected3 + '"'; + string foo = '"' + options.Encoder.Encode("foo") + '"'; + string bar = '"' + options.Encoder.Encode("bar") + '"'; + string baz = '"' + options.Encoder.Encode("baz") + '"'; + string inner = '"' + options.Encoder.Encode("inner") + '"'; + + // JSON string + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents(e1, output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents(e2, output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents(e3, output); + } + + // JSON array + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, 
options); + jsonUtf8.WriteStartArray(); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{n}{indent}{e1}{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}{e2}{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}{e3}{n}]", + output); + } + + // Middle item in array + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteBooleanValue(false); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}true,{ni}{e1},{ni}false{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteBooleanValue(false); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}true,{ni}{e2},{ni}false{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.WriteBooleanValue(false); + 
jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}true,{ni}{e3},{ni}false{n}]", + output); + } + + // Nested array + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteBooleanValue(false); + jsonUtf8.WriteEndArray(); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}[{nii}true,{nii}{e1},{nii}false{ni}]{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteBooleanValue(false); + jsonUtf8.WriteEndArray(); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}[{nii}true,{nii}{e2},{nii}false{ni}]{n}]", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteStartArray(); + jsonUtf8.WriteBooleanValue(true); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.WriteBooleanValue(false); + jsonUtf8.WriteEndArray(); + jsonUtf8.WriteEndArray(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $"[{ni}[{nii}true,{nii}{e3},{nii}false{ni}]{n}]", + output); + } + + // JSON object + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{foo}:{s}{e1}{n}}}", + output); + } + + { + var output = new 
ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{foo}:{s}{e2}{n}}}", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{foo}:{s}{e3}{n}}}", + output); + } + + // Middle item in object + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{bar}:{s}true,{ni}{foo}:{s}{e1},{ni}{baz}:{s}false{n}}}", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{bar}:{s}true,{ni}{foo}:{s}{e2},{ni}{baz}:{s}false{n}}}", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + 
jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{bar}:{s}true,{ni}{foo}:{s}{e3},{ni}{baz}:{s}false{n}}}", + output); + } + + // Nested object + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteStartObject("inner"); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1); + jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{inner}:{s}{{{nii}{bar}:{s}true,{nii}{foo}:{s}{e1},{nii}{baz}:{s}false{ni}}}{n}}}", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteStartObject("inner"); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2); + jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{inner}:{s}{{{nii}{bar}:{s}true,{nii}{foo}:{s}{e2},{nii}{baz}:{s}false{ni}}}{n}}}", + output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, options); + jsonUtf8.WriteStartObject(); + jsonUtf8.WriteStartObject("inner"); + jsonUtf8.WriteBoolean("bar", true); + jsonUtf8.WritePropertyName("foo"); + WriteStringValueSegmentsHelper(jsonUtf8, segment1, segment2, segment3); + jsonUtf8.WriteBoolean("baz", false); + jsonUtf8.WriteEndObject(); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents( + $@"{{{ni}{inner}:{s}{{{nii}{bar}:{s}true,{nii}{foo}:{s}{e3},{nii}{baz}:{s}false{ni}}}{n}}}", + output); + } + } + + [Fact] + public static void 
WriteStringValueSegment_Utf16_BadSurrogatePairs() + { + const string result = "\\uFFFD\\uD83D\\uDE00\\uFFFD"; + + ReadOnlySpan surrogates = ['\uD83D', '\uD83D', '\uDE00', '\uDE00']; + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xFFFD 0xD83D 0xDE00 0xFFFD + jsonUtf8.WriteStringValue(surrogates); + jsonUtf8.WritePropertyName("segmented"); + // only high surrogate -> expect cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // only high surrogate -> expect 0xFFFD + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // only low surrogate -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: false); + // only low surrogate -> expect 0xFFFD + jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_Utf16_SplitInSurrogatePair() + { + const string result = "\\uD83D\\uDE00\\uD83D\\uDE00\\uD83D\\uDE00"; + + Span surrogates = stackalloc char[] { '\uD83D', '\uDE00', '\uD83D', '\uDE00', '\uD83D', '\uDE00' }; + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xD83D 0xDE00 0xD83D 0xDE00 0xD83D 0xDE00 + jsonUtf8.WriteStringValue(surrogates); + jsonUtf8.WritePropertyName("segmented"); + // complete surrogate pair -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 2), isFinalSegment: false); + // only high surrogate -> expect cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); + // low 
surrogate followed by another high surrogate -> expect 0xD83D 0xDE00 + cached + jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 2), isFinalSegment: false); + // only low surrogate -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 1), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_Utf8_Split8CodePointsBasic() + { + const string result = "\\uD83D\\uDE00"; + + Span utf8Bytes = Encoding.UTF8.GetBytes("\uD83D\uDE00"); + + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStartObject(); + jsonUtf8.WritePropertyName("full"); + // complete string -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValue(utf8Bytes); + jsonUtf8.WritePropertyName("segmented"); + // incomplete UTF-8 sequence -> expect cached + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(0, 1), isFinalSegment: false); + // incomplete UTF-8 sequence -> expect cached + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(1, 1), isFinalSegment: false); + // remainder of UTF-8 sequence -> expect 0xD83D 0xDE00 + jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(2, 2), isFinalSegment: true); + jsonUtf8.WriteEndObject(); + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); + } + + [Fact] + public static void WriteStringValueSegment_Utf8_ClearedPartial() + { + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + + jsonUtf8.WriteStartArray(); + + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.WriteStringValueSegment([0b10_111111], false); + 
jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteEndArray(); + + jsonUtf8.Flush(); + + // First code point is written (escaped) and the second is replaced. + JsonTestHelper.AssertContents("""["\u07ff","\uFFFD","\u07ff\uFFFD"]""", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output, new JsonWriterOptions { Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping }); + + jsonUtf8.WriteStartArray(); + + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.WriteStringValueSegment([0b10_111111], false); + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.WriteEndArray(); + + jsonUtf8.Flush(); + + // First code point is written (unescaped) and the second is replaced. + JsonTestHelper.AssertContents($"""["{'\u07ff'}","\uFFFD","{'\u07ff'}\uFFFD"]""", output); + } + } + + [Fact] + public static void WriteStringValueSegment_Utf16_ClearedPartial() + { + var output = new ArrayBufferWriter(); + + { + using var jsonUtf8 = new Utf8JsonWriter(output); + + jsonUtf8.WriteStartArray(); + + WriteStringValueSegmentsHelper(jsonUtf8, ['\uD800'], ['\uDC00']); + WriteStringValueSegmentsHelper(jsonUtf8, ['\uDC00']); + WriteStringValueSegmentsHelper(jsonUtf8, ['\uD800'], ['\uDC00'], ['\uDC00']); + + jsonUtf8.WriteEndArray(); + + jsonUtf8.Flush(); + + // First code point is written and the second is replaced. 
+ JsonTestHelper.AssertContents("""["\uD800\uDC00","\uFFFD","\uD800\uDC00\uFFFD"]""", output); + } + } + + [Fact] + public static void WriteStringValueSegment_Flush() + { + var noEscape = JavaScriptEncoder.UnsafeRelaxedJsonEscaping; + TestFlushImpl('\uD800', '\uDC00', new(), @"""\uD800\uDC00"""); + TestFlushImpl(0b110_11111, 0b10_111111, new(), @"""\u07FF"""); + TestFlushImpl(0b110_11111, 0b10_111111, new() { Encoder = noEscape }, "\"\u07FF\""); + + void TestFlushImpl(T unit1, T unit2, JsonWriterOptions options, string expected) + where T : struct + { + byte[] expectedBytes = Encoding.UTF8.GetBytes(expected); + var output = new ArrayBufferWriter(); + using Utf8JsonWriter jsonUtf8 = new(output, options); + + WriteStringValueSegmentHelper(jsonUtf8, [unit1], false); + + Assert.Equal(0, output.WrittenCount); + Assert.Equal(0, jsonUtf8.BytesCommitted); + Assert.Equal(1, jsonUtf8.BytesPending); + + jsonUtf8.Flush(); + Assert.Equal(1, output.WrittenCount); + Assert.Equal(1, jsonUtf8.BytesCommitted); + Assert.Equal(0, jsonUtf8.BytesPending); + + WriteStringValueSegmentHelper(jsonUtf8, [unit2], true); + + Assert.Equal(1, output.WrittenCount); + Assert.Equal(1, jsonUtf8.BytesCommitted); + Assert.Equal(expectedBytes.Length - 1, jsonUtf8.BytesPending); + + jsonUtf8.Flush(); + Assert.Equal(expectedBytes.Length, output.WrittenCount); + Assert.Equal(expectedBytes.Length, jsonUtf8.BytesCommitted); + Assert.Equal(0, jsonUtf8.BytesPending); + + JsonTestHelper.AssertContents(expected, output); + } + } + + [Fact] + public static void WriteStringValueSegment_Utf16_Reset() + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + + jsonUtf8.WriteStringValueSegment("\uD800".AsSpan(), false); + jsonUtf8.Flush(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(1, jsonUtf8.BytesCommitted); + + jsonUtf8.Reset(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(0, jsonUtf8.BytesCommitted); + + 
jsonUtf8.WriteStringValueSegment("\uDC00".AsSpan(), true); + + string expected = @"""\uFFFD"""; + Assert.Equal(expected.Length, jsonUtf8.BytesPending); + Assert.Equal(0, jsonUtf8.BytesCommitted); + + jsonUtf8.Flush(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(expected.Length, jsonUtf8.BytesCommitted); + JsonTestHelper.AssertContents('"' + expected, output); + } + + [Fact] + public static void WriteStringValueSegment_Utf8_Reset() + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.Flush(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(1, jsonUtf8.BytesCommitted); + + jsonUtf8.Reset(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(0, jsonUtf8.BytesCommitted); + + jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + string expected = @"""\uFFFD"""; + Assert.Equal(expected.Length, jsonUtf8.BytesPending); + Assert.Equal(0, jsonUtf8.BytesCommitted); + + jsonUtf8.Flush(); + + Assert.Equal(0, jsonUtf8.BytesPending); + Assert.Equal(expected.Length, jsonUtf8.BytesCommitted); + JsonTestHelper.AssertContents('"' + expected, output); + } + + [Fact] + public static void WriteStringValueSegment_MixEncoding() + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + + // Because the first code unit is an unpaired high surrogate, it will be + // saved until the next write to complete it. It is saved in the + // original encoding, UTF-16, so it will be 0b1101_1000 0b1101_1000 + jsonUtf8.WriteStringValueSegment("\uD8D8".AsSpan(), false); + + // Now we write a UTF-8 continuation byte. With the previous partial + // state, the whole sequence is 0b110_11000 0b110_11000 0b10_111111. 
+ jsonUtf8.WriteStringValueSegment([0b10_111111], true); + + jsonUtf8.Flush(); + + // If this is interpreted as UTF-8, the first byte is invalid because + // it is a 2-byte start unit but the second byte is not a continuation. + // So a replacement character gets written for the first byte. The second and + // third units are valid and get written as is. + JsonTestHelper.AssertContents("\uFFFD\u063F", output); + } + + // Switch this to use an enum discriminator input when base64 is supported + private static void WriteStringValueHelper(Utf8JsonWriter writer, ReadOnlySpan value) + where T : struct + { + if (typeof(T) == typeof(char)) + { + writer.WriteStringValue(MemoryMarshal.Cast(value)); + } + else if (typeof(T) == typeof(byte)) + { + writer.WriteStringValue(MemoryMarshal.Cast(value)); + } + else + { + if (typeof(T) == typeof(int)) + { + Assert.Fail($"Did you pass in int or int[] instead of byte or byte[]? Type {typeof(T)} is not supported by {nameof(WriteStringValueHelper)}."); + } + else + { + Assert.Fail($"Type {typeof(T)} is not supported by {nameof(WriteStringValueHelper)}."); + } + } + } + + // Switch this to use an enum discriminator input when base64 is supported + private static void WriteStringValueSegmentHelper(Utf8JsonWriter writer, ReadOnlySpan value, bool isFinal) + where T : struct + { + if (typeof(T) == typeof(char)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value), isFinal); + } + else if (typeof(T) == typeof(byte)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value), isFinal); + } + else + { + if (typeof(T) == typeof(int)) + { + Assert.Fail($"Did you pass in int or int[] instead of byte or byte[]? 
Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentHelper)}."); + } + else + { + Assert.Fail($"Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentHelper)}."); + } + } + } + + // Writes the whole value as a single, final segment. Switch this to use an enum discriminator input when base64 is supported + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, ReadOnlySpan value) + where T : struct + { + if (typeof(T) == typeof(char)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value), true); + } + else if (typeof(T) == typeof(byte)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value), true); + } + else + { + if (typeof(T) == typeof(int)) + { + Assert.Fail($"Did you pass in int or int[] instead of byte or byte[]? Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + else + { + Assert.Fail($"Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + } + } + + // Writes value1 as a non-final segment followed by value2 as the final segment. Switch this to use an enum discriminator input when base64 is supported + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, ReadOnlySpan value1, ReadOnlySpan value2) + where T : struct + { + if (typeof(T) == typeof(char)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value1), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value2), true); + } + else if (typeof(T) == typeof(byte)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value1), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value2), true); + } + else + { + if (typeof(T) == typeof(int)) + { + Assert.Fail($"Did you pass in int or int[] instead of byte or byte[]? 
Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + else + { + Assert.Fail($"Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + } + } + + // Switch this to use an enum discriminator input when base64 is supported + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, ReadOnlySpan value1, ReadOnlySpan value2, ReadOnlySpan value3) + where T : struct + { + if (typeof(T) == typeof(char)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value1), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value2), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value3), true); + } + else if (typeof(T) == typeof(byte)) + { + writer.WriteStringValueSegment(MemoryMarshal.Cast(value1), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value2), false); + writer.WriteStringValueSegment(MemoryMarshal.Cast(value3), true); + } + else + { + if (typeof(T) == typeof(int)) + { + Assert.Fail($"Did you pass in int or int[] instead of byte or byte[]? 
Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + else + { + Assert.Fail($"Type {typeof(T)} is not supported by {nameof(WriteStringValueSegmentsHelper)}."); + } + } + } + + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, string value) + => WriteStringValueSegmentsHelper(writer, value.AsSpan()); + + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, string value1, string value2) + => WriteStringValueSegmentsHelper(writer, value1.AsSpan(), value2.AsSpan()); + + private static void WriteStringValueSegmentsHelper(Utf8JsonWriter writer, string value1, string value2, string value3) + => WriteStringValueSegmentsHelper(writer, value1.AsSpan(), value2.AsSpan(), value3.AsSpan()); + } +} diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index 48b2e656c5b5b2..6d4ea167aa64a3 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -14,6 +14,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.XUnitExtensions; +using Microsoft.VisualStudio.TestPlatform.Utilities; using Newtonsoft.Json; using Xunit; @@ -230,6 +231,12 @@ public void EscapingTestWhileWriting(char replacementChar, JavaScriptEncoder enc written = WriteUtf8StringHelper(writerOptions, Array.Empty()); Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteStringSegmentHelper(writerOptions, Array.Empty()); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + + written = WriteUtf8StringSegmentHelper(writerOptions, Array.Empty()); + Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); } var random = new Random(42); @@ -263,6 +270,21 @@ public void EscapingTestWhileWriting(char replacementChar, JavaScriptEncoder enc written = 
WriteUtf8StringHelper(writerOptions, sourceUtf8); escapedIndex = written.Span.IndexOf((byte)'\\'); Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + + if (dataLength < 10) + { + SplitCodePointsHelper(changed, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => // Cover the UTF-8 input as well as the UTF-16 input above + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? (i + 1) : -1, escapedIndex); // Account for the start quote + }); + } } if (dataLength != 0) @@ -279,6 +301,21 @@ public void EscapingTestWhileWriting(char replacementChar, JavaScriptEncoder enc written = WriteUtf8StringHelper(writerOptions, sourceUtf8); escapedIndex = written.Span.IndexOf((byte)'\\'); Assert.Equal(requiresEscaping ? 1 : -1, escapedIndex); // Account for the start quote + + if (dataLength < 10) + { + SplitCodePointsHelper(changed, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 1 : -1, escapedIndex); // Account for the start quote + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(requiresEscaping ? 
1 : -1, escapedIndex); // Account for the start quote + }); + } } } } @@ -288,82 +325,82 @@ public static IEnumerable EscapingTestData get { return new List - { - new object[] { 'a', null, false }, // ASCII not escaped - new object[] { '\u001F', null, true }, // control character within single byte range - new object[] { '\u2000', null, true }, // space character outside single byte range - new object[] { '\u00A2', null, true }, // non-ASCII but < 255 - new object[] { '\uA686', null, true }, // non-ASCII above short.MaxValue - new object[] { '\u6C49', null, true }, // non-ASCII from chinese alphabet - multibyte - new object[] { '"', null, true }, // ASCII but must always be escaped in JSON - new object[] { '\\', null, true }, // ASCII but must always be escaped in JSON - new object[] { '<', null, true }, // ASCII but escaped by default - new object[] { '>', null, true }, // ASCII but escaped by default - new object[] { '&', null, true }, // ASCII but escaped by default - new object[] { '`', null, true }, // ASCII but escaped by default - new object[] { '\'', null, true }, // ASCII but escaped by default - new object[] { '+', null, true }, // ASCII but escaped by default - - new object[] { 'a', JavaScriptEncoder.Default, false }, - new object[] { '\u001F', JavaScriptEncoder.Default, true }, - new object[] { '\u2000', JavaScriptEncoder.Default, true }, - new object[] { '\u00A2', JavaScriptEncoder.Default, true }, - new object[] { '\uA686', JavaScriptEncoder.Default, true }, - new object[] { '\u6C49', JavaScriptEncoder.Default, true }, - new object[] { '"', JavaScriptEncoder.Default, true }, - new object[] { '\\', JavaScriptEncoder.Default, true }, - new object[] { '<', JavaScriptEncoder.Default, true }, - new object[] { '>', JavaScriptEncoder.Default, true }, - new object[] { '&', JavaScriptEncoder.Default, true }, - new object[] { '`', JavaScriptEncoder.Default, true }, - new object[] { '\'', JavaScriptEncoder.Default, true }, - new object[] { '+', 
JavaScriptEncoder.Default, true }, - - new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, - new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, - - new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - 
new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - - new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - }; + { + new object[] { 'a', null, false }, // ASCII not escaped + new object[] { '\u001F', null, true }, // control character within single byte range + new object[] { '\u2000', null, true }, // space character outside single byte range + new object[] { '\u00A2', null, true }, // non-ASCII but < 255 + new object[] { '\uA686', null, true }, // non-ASCII above short.MaxValue + new object[] { '\u6C49', null, true }, // non-ASCII from chinese alphabet - multibyte + new object[] { '"', null, true }, // ASCII but must always be escaped in JSON + new object[] { '\\', null, true }, // ASCII but must always be 
escaped in JSON + new object[] { '<', null, true }, // ASCII but escaped by default + new object[] { '>', null, true }, // ASCII but escaped by default + new object[] { '&', null, true }, // ASCII but escaped by default + new object[] { '`', null, true }, // ASCII but escaped by default + new object[] { '\'', null, true }, // ASCII but escaped by default + new object[] { '+', null, true }, // ASCII but escaped by default + + new object[] { 'a', JavaScriptEncoder.Default, false }, + new object[] { '\u001F', JavaScriptEncoder.Default, true }, + new object[] { '\u2000', JavaScriptEncoder.Default, true }, + new object[] { '\u00A2', JavaScriptEncoder.Default, true }, + new object[] { '\uA686', JavaScriptEncoder.Default, true }, + new object[] { '\u6C49', JavaScriptEncoder.Default, true }, + new object[] { '"', JavaScriptEncoder.Default, true }, + new object[] { '\\', JavaScriptEncoder.Default, true }, + new object[] { '<', JavaScriptEncoder.Default, true }, + new object[] { '>', JavaScriptEncoder.Default, true }, + new object[] { '&', JavaScriptEncoder.Default, true }, + new object[] { '`', JavaScriptEncoder.Default, true }, + new object[] { '\'', JavaScriptEncoder.Default, true }, + new object[] { '+', JavaScriptEncoder.Default, true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '<', 
JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin), true }, + + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', 
JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; } } @@ -389,6 +426,19 @@ public unsafe void WriteString_NonAscii(char replacementChar, JavaScriptEncoder written = WriteUtf8StringHelper(writerOptions, sourceUtf8); Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + if (dataLength < 10) + { + SplitCodePointsHelper(str, writerOptions, output => + { + Assert.Equal(-1, output.WrittenSpan.IndexOf((byte)'\\')); + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + Assert.Equal(-1, output.WrittenSpan.IndexOf((byte)'\\')); + }); + } + for (int i = 0; i < dataLength; i++) { string source = baseStr.Insert(i, new string(replacementChar, 1)); @@ -403,6 +453,23 @@ public unsafe void WriteString_NonAscii(char replacementChar, JavaScriptEncoder escapedIndex = written.Span.IndexOf((byte)'\\'); // Each CJK character expands to 3 utf-8 bytes. Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + + if (dataLength < 10) + { + SplitCodePointsHelper(source.ToCharArray(), writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? 
((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + // Each CJK character expands to 3 utf-8 bytes. + Assert.Equal(requiresEscaping ? ((i * 3) + 1) : -1, escapedIndex); // Account for the start quote + }); + } } } } @@ -412,37 +479,37 @@ public static IEnumerable EscapingTestData_NonAscii get { return new List - { - new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, - new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, - - new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\u6C49', 
JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, - new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, - }; + { + new object[] { 'a', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u001F', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u2000', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\u00A2', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\uA686', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '\u6C49', JavaScriptEncoder.Create(UnicodeRanges.All), false }, + new object[] { '"', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\\', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '<', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '>', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '&', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '`', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '\'', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + new object[] { '+', JavaScriptEncoder.Create(UnicodeRanges.All), true }, + + new object[] { 'a', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u001F', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\u2000', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new 
object[] { '\u00A2', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\uA686', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\u6C49', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '"', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '\\', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, true }, + new object[] { '<', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '>', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '&', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '`', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '\'', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + new object[] { '+', JavaScriptEncoder.UnsafeRelaxedJsonEscaping, false }, + }; } } @@ -470,6 +537,19 @@ public void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) written = WriteUtf8StringHelper(writerOptions, sourceUtf8); Assert.Equal(-1, written.Span.IndexOf((byte)'\\')); + if (dataLength < 10) + { + SplitCodePointsHelper(str, writerOptions, output => + { + Assert.Equal(-1, output.WrittenSpan.IndexOf((byte)'\\')); + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + Assert.Equal(-1, output.WrittenSpan.IndexOf((byte)'\\')); + }); + } + for (int i = 0; i < dataLength - 1; i++) { char[] changed = baseStr.ToCharArray(); @@ -485,6 +565,21 @@ public void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) written = WriteUtf8StringHelper(writerOptions, sourceUtf8); escapedIndex = written.Span.IndexOf((byte)'\\'); Assert.Equal(i + 1, escapedIndex); // Account for the start quote + + if (dataLength < 10) + { + SplitCodePointsHelper(changed, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + 
escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(i + 1, escapedIndex); // Account for the start quote + }); + } } { @@ -506,6 +601,21 @@ public void EscapingTestWhileWritingSurrogate(JavaScriptEncoder encoder) written = WriteUtf8StringHelper(writerOptions, sourceUtf8); escapedIndex = written.Span.IndexOf((byte)'\\'); Assert.Equal(1, escapedIndex); // Account for the start quote + + if (dataLength < 10) + { + SplitCodePointsHelper(changed, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + }); + + SplitCodePointsHelper(sourceUtf8, writerOptions, output => + { + escapedIndex = output.WrittenSpan.IndexOf((byte)'\\'); + Assert.Equal(1, escapedIndex); // Account for the start quote + }); + } } } } @@ -515,13 +625,13 @@ public static IEnumerable JavaScriptEncoders get { return new List - { - new object[] { null }, - new object[] { JavaScriptEncoder.Default }, - new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, - new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, - new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, - }; + { + new object[] { null }, + new object[] { JavaScriptEncoder.Default }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + }; } } @@ -555,6 +665,19 @@ public unsafe void WriteStringInvalidCharacter(char replacementChar, JavaScriptE written = WriteUtf8StringHelper(writerOptions, sourceUtf8); Assert.True(BeginsWithReplacementCharacter(written.Span.Slice(i + 1))); // +1 to account for starting quote + + if (dataLength < 10) + { + SplitCodePointsHelper(changed, writerOptions, output => + { + Assert.True(BeginsWithReplacementCharacter(output.WrittenSpan.Slice(i + 1))); // +1 to account for starting quote + }); + + SplitCodePointsHelper(sourceUtf8, 
writerOptions, output => + { + Assert.True(BeginsWithReplacementCharacter(output.WrittenSpan.Slice(i + 1))); // +1 to account for starting quote + }); + } } } @@ -586,19 +709,19 @@ public static IEnumerable InvalidEscapingTestData get { return new List - { - new object[] { '\uD801', JavaScriptEncoder.Default }, // Invalid, high surrogate alone - new object[] { '\uDC01', JavaScriptEncoder.Default }, // Invalid, low surrogate alone + { + new object[] { '\uD801', JavaScriptEncoder.Default }, // Invalid, high surrogate alone + new object[] { '\uDC01', JavaScriptEncoder.Default }, // Invalid, low surrogate alone - new object[] { '\uD801', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, - new object[] { '\uDC01', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + new object[] { '\uD801', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, + new object[] { '\uDC01', JavaScriptEncoder.UnsafeRelaxedJsonEscaping }, - new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.All) }, - new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.All) }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.All) }, - new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, - new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, - }; + new object[] { '\uD801', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + new object[] { '\uDC01', JavaScriptEncoder.Create(UnicodeRanges.BasicLatin) }, + }; } } @@ -622,6 +745,26 @@ private static ReadOnlyMemory WriteUtf8StringHelper(JsonWriterOptions writ return output.WrittenMemory; } + private static ReadOnlyMemory WriteStringSegmentHelper(JsonWriterOptions writerOptions, ReadOnlySpan str) + { + var output = new ArrayBufferWriter(); + using (var writer = new Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValueSegment(str, true); + } + return output.WrittenMemory; + } + + private static 
ReadOnlyMemory WriteUtf8StringSegmentHelper(JsonWriterOptions writerOptions, ReadOnlySpan utf8str) + { + var output = new ArrayBufferWriter(); + using (var writer = new Utf8JsonWriter(output, writerOptions)) + { + writer.WriteStringValueSegment(utf8str, true); + } + return output.WrittenMemory; + } + [Fact] public void WriteJsonWritesToIBWOnDemand_Dispose() { @@ -1653,6 +1796,7 @@ public void FixedSizeBufferWriter_Decimal(JsonWriterOptions options) } } + private const JsonValueKind JsonValueKindStringSegment = (JsonValueKind)(1 << 7); public static IEnumerable InvalidJsonDueToWritingMultipleValues_TestData() => JsonOptionsWith([ JsonValueKind.Array, @@ -1662,6 +1806,7 @@ public static IEnumerable InvalidJsonDueToWritingMultipleValues_TestDa JsonValueKind.True, JsonValueKind.False, JsonValueKind.Null, + JsonValueKindStringSegment ]); [Theory] @@ -1718,6 +1863,30 @@ public void InvalidJsonDueToWritingMultipleValues(JsonWriterOptions options, Jso ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValue("foo"), options.SkipValidation); } + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo".AsSpan(), true), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo".AsSpan(), false), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo"u8, true), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo"u8, false), options.SkipValidation); + } + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) { WritePreamble(jsonUtf8, 
kind); @@ -1828,6 +1997,30 @@ public void InvalidJsonDueToWritingMultipleValuesWithComments(JsonWriterOptions ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValue("foo"), options.SkipValidation); } + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind, addComments: true); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo".AsSpan(), true), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind, addComments: true); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo".AsSpan(), false), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind, addComments: true); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo"u8, true), options.SkipValidation); + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + WritePreamble(jsonUtf8, kind, addComments: true); + ValidateAction(jsonUtf8, () => jsonUtf8.WriteStringValueSegment("foo"u8, false), options.SkipValidation); + } + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) { WritePreamble(jsonUtf8, kind, addComments: true); @@ -1918,6 +2111,10 @@ private void WritePreamble(Utf8JsonWriter writer, JsonValueKind kind, bool addCo case JsonValueKind.Null: writer.WriteNullValue(); break; + case JsonValueKindStringSegment: + writer.WriteStringValueSegment("foo".ToCharArray(), false); + writer.WriteStringValueSegment("bar".ToCharArray(), true); + break; default: Debug.Fail($"Invalid JsonValueKind passed in '{kind}'."); break; @@ -2042,7 +2239,7 @@ public void InvalidJsonMismatch(JsonWriterOptions options) jsonUtf8.WriteStartObject(); if (options.SkipValidation) { - jsonUtf8.WriteStringValue("key"); + jsonUtf8.WriteStringValue("value"); } else { @@ -2050,6 +2247,28 @@ public void InvalidJsonMismatch(JsonWriterOptions options) } } + using (var jsonUtf8 = new 
Utf8JsonWriter(output, options)) + { + jsonUtf8.WriteStartObject(); + if (options.SkipValidation) + { + jsonUtf8.WriteStringValueSegment(['a', 'b'], true); + jsonUtf8.WriteStringValueSegment(['a', 'b'], false); + jsonUtf8.WriteStringValueSegment(['a', 'b'], true); + + jsonUtf8.WriteStringValueSegment([65, 66], true); + jsonUtf8.WriteStringValueSegment([65, 66], false); + jsonUtf8.WriteStringValueSegment([65, 66], true); + } + else + { + Assert.Throws(() => jsonUtf8.WriteStringValueSegment(['a', 'b'], true)); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment(['a', 'b'], false)); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment([65, 66], true)); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment([65, 66], false)); + } + } + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) { jsonUtf8.WriteStartArray(); @@ -2586,6 +2805,81 @@ public void InvalidJsonPrimitive(JsonWriterOptions options) Assert.Throws(() => jsonUtf8.WritePropertyName("test name")); } } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + jsonUtf8.WriteStringValueSegment("a".AsSpan(), true); + if (options.SkipValidation) + { + jsonUtf8.WritePropertyName("test name"); + } + else + { + Assert.Throws(() => jsonUtf8.WritePropertyName("test name")); + } + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + jsonUtf8.WriteStringValueSegment("a"u8, true); + if (options.SkipValidation) + { + jsonUtf8.WritePropertyName("test name"); + } + else + { + Assert.Throws(() => jsonUtf8.WritePropertyName("test name")); + } + } + } + + // Name is present in the test data to make it easier to identify the test case + public static IEnumerable InvalidJsonStringValueSegment_TestData => + from write in new (string methodName, Action method)[] { + (nameof(Utf8JsonWriter.WriteStartObject), writer => writer.WriteStartObject()), + (nameof(Utf8JsonWriter.WriteEndObject), writer => writer.WriteEndObject()), + (nameof(Utf8JsonWriter.WriteStartArray), writer => 
writer.WriteStartArray()), + (nameof(Utf8JsonWriter.WriteEndArray), writer => writer.WriteEndArray()), + (nameof(Utf8JsonWriter.WriteBooleanValue), writer => writer.WriteBooleanValue(true)), + (nameof(Utf8JsonWriter.WriteBoolean), writer => writer.WriteBoolean("foo", true)), + (nameof(Utf8JsonWriter.WriteCommentValue), writer => writer.WriteCommentValue("comment")), + (nameof(Utf8JsonWriter.WriteNullValue), writer => writer.WriteNullValue()), + (nameof(Utf8JsonWriter.WriteStringValue), writer => writer.WriteStringValue("foo")), + (nameof(Utf8JsonWriter.WritePropertyName), writer => writer.WritePropertyName("foo")), + } + from option in new [] { new JsonWriterOptions { SkipValidation = true }, new JsonWriterOptions { SkipValidation = false } } + select new object[] { write.methodName, write.method, option }; + + [Theory] + [MemberData(nameof(InvalidJsonStringValueSegment_TestData))] + public void InvalidJsonStringValueSegment(string _, Action write, JsonWriterOptions options) + { + var output = new ArrayBufferWriter(1024); + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + jsonUtf8.WriteStringValueSegment("foo"u8, isFinalSegment: false); + if (options.SkipValidation) + { + write(jsonUtf8); + } + else + { + Assert.Throws(() => write(jsonUtf8)); + } + } + + using (var jsonUtf8 = new Utf8JsonWriter(output, options)) + { + jsonUtf8.WriteStringValueSegment("foo".ToCharArray(), isFinalSegment: false); + if (options.SkipValidation) + { + write(jsonUtf8); + } + else + { + Assert.Throws(() => write(jsonUtf8)); + } + } } [Theory] @@ -2872,7 +3166,7 @@ public void WritingTooLargeProperty(JsonWriterOptions options) key.AsSpan().Fill((byte)'a'); keyChars.AsSpan().Fill('a'); - var output = new ArrayBufferWriter(1024); + var output = new ArrayBufferWriter(1024); using (var jsonUtf8 = new Utf8JsonWriter(output, options)) { @@ -3160,7 +3454,7 @@ public void WriteHelloWorld(JsonWriterOptions options) ReadOnlySpan utf8PropertyName = "message"u8; ReadOnlySpan utf8Value 
= "Hello, World!"u8; - + for (int i = 0; i < 32; i++) { var output = new ArrayBufferWriter(32); @@ -3346,7 +3640,7 @@ public void WriteHelloWorldEscaped(JsonWriterOptions options) string propertyName = "mess> propertyNameSpan = propertyName.AsSpan(); ReadOnlySpan valueSpan = value.AsSpan(); ReadOnlySpan propertyNameSpanUtf8 = Encoding.UTF8.GetBytes(propertyName); @@ -3545,7 +3839,6 @@ public void WriteHelloWorldEscaped(JsonWriterOptions options) [MemberData(nameof(JsonOptions_TestData))] public void WritePartialHelloWorld(JsonWriterOptions options) { - var output = new ArrayBufferWriter(10); using var jsonUtf8 = new Utf8JsonWriter(output, options); @@ -3745,7 +4038,7 @@ public void WriteInvalidPartialJson(JsonWriterOptions options) public void WriteInvalidBase64(JsonWriterOptions options) { { - var output = new ArrayBufferWriter(10); + var output = new ArrayBufferWriter(10); using var jsonUtf8 = new Utf8JsonWriter(output, options); jsonUtf8.WriteStartObject(); @@ -3770,7 +4063,7 @@ public void WriteInvalidBase64(JsonWriterOptions options) } } { - var output = new ArrayBufferWriter(10); + var output = new ArrayBufferWriter(10); using var jsonUtf8 = new Utf8JsonWriter(output, options); jsonUtf8.WriteStartArray(); @@ -6799,204 +7092,6 @@ public static void WriteStringValue_IndentationOptions() Assert.Equal(expectedOutput, output); } - [Fact] - public static void WriteStringValueSegment_Byte() - { - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("test"); - jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("Hello "), isFinalSegment: false); - jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("World!"), isFinalSegment: true); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", output); - } - - [Fact] - public static void WriteStringValueSegment_Byte_SplitInUtf8Sequence() - { - const string 
result = "\\uD83D\\uDE00"; - - Span utf8Bytes = Encoding.UTF8.GetBytes("\uD83D\uDE00"); - - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("full"); - // complete string -> expect 0xD83D 0xDE00 - jsonUtf8.WriteStringValue(utf8Bytes); - jsonUtf8.WritePropertyName("segmented"); - // incomplete UTf-8 sequence -> expect cached - jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(0, 1), isFinalSegment: false); - // incomplete UTf-8 sequence -> expect cached - jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(1, 1), isFinalSegment: false); - // remainder of UTF-8 sequence -> expect 0xD83D 0xDE00 - jsonUtf8.WriteStringValueSegment(utf8Bytes.Slice(2, 2), isFinalSegment: true); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); - } - - [Fact] - public static void WriteStringValueSegment_Byte_NotFinalized() - { - static ArrayBufferWriter executeScenario(Action implementation, bool expectFailure) - { - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("test"); - jsonUtf8.WriteStringValueSegment(Encoding.UTF8.GetBytes("Hello "), isFinalSegment: false); - - if (expectFailure) - { - InvalidOperationException invalidOperationexception = Assert.Throws( - () => implementation(jsonUtf8)); - Assert.Contains("The current JSON string must be finalized before a token of type", invalidOperationexception.Message); - return null; - } - else - { - implementation(jsonUtf8); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - return output; - } - } - - // The following are expected to fail. 
- executeScenario(w => w.WriteEndArray(), expectFailure: true); - executeScenario(w => w.WriteCommentValue("comment"), expectFailure: true); - executeScenario(w => w.WriteEndArray(), expectFailure: true); - executeScenario(w => w.WriteEndObject(), expectFailure: true); - executeScenario(w => w.WriteNullValue(), expectFailure: true); - executeScenario(w => w.WriteNumberValue(123), expectFailure: true); - executeScenario(w => w.WritePropertyName("test"), expectFailure: true); - executeScenario(w => w.WriteStartArray(), expectFailure: true); - executeScenario(w => w.WriteStartObject(), expectFailure: true); - - // WriteStringValue is a special case that implicitly finalizes. - ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue(Encoding.UTF8.GetBytes("World!")), expectFailure: false); - JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", writeStringValueOutput); - } - - [Fact] - public static void WriteStringValueSegment_Char() - { - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("test"); - jsonUtf8.WriteStringValueSegment("Hello ".AsSpan(), isFinalSegment: false); - jsonUtf8.WriteStringValueSegment("World!".AsSpan(), isFinalSegment: true); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", output); - } - - [Fact] - public static void WriteStringValueSegment_Char_BadSurrogatePairs() - { - const string result = "\\uFFFD\\uD83D\\uDE00\\uFFFD"; - - Span surrogates = stackalloc char[] { '\uD83D', '\uD83D', '\uDE00', '\uDE00' }; - - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("full"); - // complete string -> expect 0xFFFD 0xD83D 0xDE00 0xFFFD - jsonUtf8.WriteStringValue(surrogates); - jsonUtf8.WritePropertyName("segmented"); - // only high surrogate -> expect 
cached - jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); - // only high surrogate -> expect 0xFFFD - jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); - // only low surrogate -> expect 0xD83D 0xDE00 - jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: false); - // only low surrogate -> expect 0xFFFD - jsonUtf8.WriteStringValueSegment(surrogates.Slice(2, 1), isFinalSegment: true); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); - } - - [Fact] - public static void WriteStringValueSegment_Char_SplitInSurrogatePair() - { - const string result = "\\uD83D\\uDE00\\uD83D\\uDE00\\uD83D\\uDE00"; - - Span surrogates = stackalloc char[] { '\uD83D', '\uDE00', '\uD83D', '\uDE00', '\uD83D', '\uDE00' }; - - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("full"); - // complete string -> expect 0xD83D 0xDE00 0xD83D 0xDE00 0xD83D 0xDE00 - jsonUtf8.WriteStringValue(surrogates); - jsonUtf8.WritePropertyName("segmented"); - // only high surrogate -> expect cached - jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 2), isFinalSegment: false); - // only low surrogate -> expect 0xD83D 0xDE00 - jsonUtf8.WriteStringValueSegment(surrogates.Slice(0, 1), isFinalSegment: false); - // low surrogate followed by another high surrogate -> expect 0xD83D 0xDE00 + cached - jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 2), isFinalSegment: false); - // only low surrogate -> expect 0xD83D 0xDE00 - jsonUtf8.WriteStringValueSegment(surrogates.Slice(1, 1), isFinalSegment: true); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents($"{{\"full\":\"{result}\",\"segmented\":\"{result}\"}}", output); - } - - [Fact] - public static void WriteStringValueSegment_Char_NotFinalized() - { - 
static ArrayBufferWriter executeScenario(Action implementation, bool expectFailure) - { - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); - jsonUtf8.WriteStartObject(); - jsonUtf8.WritePropertyName("test"); - jsonUtf8.WriteStringValueSegment("Hello ".AsSpan(), isFinalSegment: false); - - if (expectFailure) - { - InvalidOperationException invalidOperationexception = Assert.Throws( - () => implementation(jsonUtf8)); - Assert.Contains("The current JSON string must be finalized before a token of type", invalidOperationexception.Message); - return null; - } - else - { - implementation(jsonUtf8); - jsonUtf8.WriteEndObject(); - jsonUtf8.Flush(); - return output; - } - } - - // The following are expected to fail. - executeScenario(w => w.WriteEndArray(), expectFailure: true); - executeScenario(w => w.WriteCommentValue("comment"), expectFailure: true); - executeScenario(w => w.WriteEndArray(), expectFailure: true); - executeScenario(w => w.WriteEndObject(), expectFailure: true); - executeScenario(w => w.WriteNullValue(), expectFailure: true); - executeScenario(w => w.WriteNumberValue(123), expectFailure: true); - executeScenario(w => w.WritePropertyName("test"), expectFailure: true); - executeScenario(w => w.WriteStartArray(), expectFailure: true); - executeScenario(w => w.WriteStartObject(), expectFailure: true); - - // WriteStringValue is a special case that implicitly finalizes. 
- ArrayBufferWriter writeStringValueOutput = executeScenario(w => w.WriteStringValue("World!"), expectFailure: false); - JsonTestHelper.AssertContents($"{{\"test\":\"Hello World!\"}}", writeStringValueOutput); - } - private delegate void WriteValueSpanAction( Utf8JsonWriter writer, ReadOnlySpan value); From d6b66beb221ff2890124ad985198e0d9ea4a8b10 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 10:24:51 -0800 Subject: [PATCH 06/21] fix build error --- .../Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index 68075f543417ad..cb045cb4cdf8a6 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -176,7 +176,7 @@ private void WriteStringSegmentData(ReadOnlySpan escapedValue) /// /// Writes the UTF-8 text value segment as a partial JSON string. /// - /// The UTF-8 encoded value to be written as a JSON string element of a JSON array. + /// The UTF-8 encoded value to be written as a JSON string element of a JSON array. /// Indicates that this is the final segment of the string. /// /// Thrown when the specified value is too large. 
From a46a1cc3017f74ce4def4037485d4d153f814cff Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 11:52:32 -0800 Subject: [PATCH 07/21] Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs Co-authored-by: Eirik Tsarpalis --- .../Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index cb045cb4cdf8a6..848efba9b9af9e 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -396,7 +396,7 @@ private void WriteStringSegmentMinimizedProlog() output[BytesPending++] = JsonConstants.Quote; } - private void WriteStringSegmentEpilog() + private void WriteStringSegmentEpilogue() { if (_memory.Length == BytesPending) { From b5d0c170ae8c3fc1ccc118928c0933e543b94dec Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 11:52:44 -0800 Subject: [PATCH 08/21] Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs Co-authored-by: Eirik Tsarpalis --- .../Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index 848efba9b9af9e..fe0e2991c09b09 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ 
b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -333,7 +333,7 @@ private void WriteStringSegmentData(ReadOnlySpan escapedValue) BytesPending += escapedValue.Length; } - private void WriteStringSegmentProlog() + private void WriteStringSegmentPrologue() { if (_options.Indented) { From 4a0d1c6a6d66c0ce20494a1391a5989fa12e6465 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 13:38:37 -0800 Subject: [PATCH 09/21] PR comments --- .../Utf8JsonWriter.WriteValues.Comment.cs | 4 +++- .../Utf8JsonWriter.WriteValues.Helpers.cs | 6 ----- ...tf8JsonWriter.WriteValues.StringSegment.cs | 24 +++++++------------ .../System/Text/Json/Writer/Utf8JsonWriter.cs | 20 ++++++++++++---- .../Utf8JsonWriterTests.cs | 1 - 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs index 6da2117f34ade4..e0fa3e91cb1ad8 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs @@ -63,7 +63,9 @@ private void WriteCommentByOptions(ReadOnlySpan value) { if (!_options.SkipValidation) { - ValidateWritingComment(); + // Comments generally can be placed anywhere in JSON, but not after a non-final + // string segment. 
+ ValidateNotWithinUnfinalizedString(); } if (_options.Indented) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index 699cb6d9a097ca..0d6cdc4d89b31d 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -20,12 +20,6 @@ private void ValidateNotWithinUnfinalizedString() } } - private void ValidateWritingComment() - { - // Make sure a new comment is not attempted within an unfinalized string. - ValidateNotWithinUnfinalizedString(); - } - private void ValidateWritingValue() { Debug.Assert(!_options.SkipValidation); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index fe0e2991c09b09..e70f5f7d32b961 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -37,7 +37,7 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { - WriteStringSegmentProlog(); + WriteStringSegmentPrologue(); _tokenType = Utf8JsonWriter.StringSegmentSentinel; } @@ -54,7 +54,7 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (isFinalSegment) { - WriteStringSegmentEpilog(); + WriteStringSegmentEpilogue(); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } @@ -198,7 +198,7 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { 
- WriteStringSegmentProlog(); + WriteStringSegmentPrologue(); _tokenType = Utf8JsonWriter.StringSegmentSentinel; } @@ -215,7 +215,7 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (isFinalSegment) { - WriteStringSegmentEpilog(); + WriteStringSegmentEpilogue(); SetFlagToAddListSeparatorBeforeNextItem(); _tokenType = JsonTokenType.String; } @@ -337,15 +337,15 @@ private void WriteStringSegmentPrologue() { if (_options.Indented) { - WriteStringSegmentIndentedProlog(); + WriteStringSegmentIndentedPrologue(); } else { - WriteStringSegmentMinimizedProlog(); + WriteStringSegmentMinimizedPrologue(); } } - private void WriteStringSegmentIndentedProlog() + private void WriteStringSegmentIndentedPrologue() { int indent = Indentation; Debug.Assert(indent <= _indentLength * _options.MaxDepth); @@ -377,7 +377,7 @@ private void WriteStringSegmentIndentedProlog() output[BytesPending++] = JsonConstants.Quote; } - private void WriteStringSegmentMinimizedProlog() + private void WriteStringSegmentMinimizedPrologue() { // One quote and optionally 1 list separator int bytesRequired = 2; @@ -406,14 +406,6 @@ private void WriteStringSegmentEpilogue() _memory.Span[BytesPending++] = JsonConstants.Quote; } -#if NET - [InlineArray(4)] - private struct Inline4ByteArray - { - public byte byte0; - } -#endif - private void GetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) { ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index cf670f4d344562..b301abda2e88fd 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -52,12 +52,24 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable private JsonTokenType _tokenType; private BitStack _bitStack; 
-#if NET - private Inline4ByteArray _partialCodePoint; - private Span PartialCodePointRaw => _partialCodePoint; -#else + /// + /// This 4-byte array stores the partial code point leftover when writing a string value + /// segment that is split across multiple write calls. The first 3 bytes provide space + /// to store the leftover bytes using the source encoding and the last byte is the number + /// of bytes used to store the partial code point. + /// +#if !NET private byte[]? _partialCodePoint; private Span PartialCodePointRaw => _partialCodePoint ??= new byte[4]; +#else + private Inline4ByteArray _partialCodePoint; + private Span PartialCodePointRaw => _partialCodePoint; + + [InlineArray(4)] + private struct Inline4ByteArray + { + public byte byte0; + } #endif // The highest order bit of _currentDepth is used to discern whether we are writing the first item in a list or not. diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index 6d4ea167aa64a3..382349214beb1b 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -14,7 +14,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.XUnitExtensions; -using Microsoft.VisualStudio.TestPlatform.Utilities; using Newtonsoft.Json; using Xunit; From 96ed9228ccad98a2790db527cd5996b1430a99c4 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 17:17:32 -0800 Subject: [PATCH 10/21] add encoding flags --- .../Utf8JsonWriter.WriteValues.Helpers.cs | 90 +++++++ ...tf8JsonWriter.WriteValues.StringSegment.cs | 230 ++++++++---------- ...tf8JsonWriterTests.Values.StringSegment.cs | 2 +- 3 files changed, 190 insertions(+), 132 deletions(-) diff --git 
a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index 0d6cdc4d89b31d..916e564a9541af 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -5,11 +5,101 @@ using System.Buffers.Text; using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text.Json { public sealed partial class Utf8JsonWriter { + private const byte LengthMask = 0b000_000_11; + private const byte EncodingMask = 0b000_111_00; + + private const byte Utf8EncodingFlag = 0b000_001_00; + private const byte Utf16EncodingFlag = 0b000_010_00; + + private bool TryGetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) + { + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 4); + + if ((partialCodePointBytes[3] & Utf8EncodingFlag) == 0) + { + codePointBytes = ReadOnlySpan.Empty; + return false; + } + + int length = partialCodePointBytes[3] & LengthMask; + Debug.Assert((uint)length < 4); + + codePointBytes = partialCodePointBytes.Slice(0, length); + return true; + } + + private bool TryGetPartialUtf16CodePoint(out ReadOnlySpan codePointChars) + { + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 4); + + if ((partialCodePointBytes[3] & Utf16EncodingFlag) == 0) + { + codePointChars = ReadOnlySpan.Empty; + return false; + } + + int length = partialCodePointBytes[3] & LengthMask; + Debug.Assert(length == 2 || length == 0); + + codePointChars = MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); + return true; + } + + private void SetPartialUtf8CodePoint(ReadOnlySpan bytes) + { + Debug.Assert(bytes.Length <= 3); + + Span 
partialCodePointBytes = PartialCodePointRaw; + + bytes.CopyTo(partialCodePointBytes); + partialCodePointBytes[3] = (byte)(bytes.Length | Utf8EncodingFlag); + } + + private void SetPartialUtf16CodePoint(ReadOnlySpan bytes) + { + Debug.Assert(bytes.Length <= 1); + + Span partialCodePointBytes = PartialCodePointRaw; + + bytes.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); + partialCodePointBytes[3] = (byte)((2 * bytes.Length) | Utf16EncodingFlag); + } + + private bool HasPartialCodePoint => (PartialCodePointRaw[3] & LengthMask) != 0; + + private void ClearPartialCodePoint() => PartialCodePointRaw[3] = 0; + + private void WriteInvalidPartialCodePoint() + { + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 4); + + int length = partialCodePointBytes[3] & LengthMask; + + switch (partialCodePointBytes[3] & EncodingMask) + { + case Utf8EncodingFlag: + Debug.Assert((uint)length < 4); + WriteStringSegmentEscape(partialCodePointBytes.Slice(0, length), true); + break; + case Utf16EncodingFlag: + Debug.Assert(length == 0 || length == 2); + WriteStringSegmentEscape(MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)), true); + break; + default: + Debug.Fail("Encoding not recognized."); + break; + } + } + private void ValidateNotWithinUnfinalizedString() { Debug.Assert(!HasPartialCodePoint); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index e70f5f7d32b961..d1b665f9c5fe0e 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -62,52 +62,59 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen private void WriteStringSegmentWithLeftover(scoped 
ReadOnlySpan value, bool isFinalSegment) { - Span combinedBuffer = stackalloc char[2]; - - GetPartialUtf16CodePoint(out scoped ReadOnlySpan partialCodePointBuffer); - combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, value, combinedBuffer)); - - switch (Rune.DecodeFromUtf16(combinedBuffer, out _, out int charsConsumed)) - { - case OperationStatus.NeedMoreData: - Debug.Assert(value.Length + partialCodePointBuffer.Length < 2); - Debug.Assert(charsConsumed == value.Length + partialCodePointBuffer.Length); - // Let the encoder deal with the error if this is a final buffer. - value = combinedBuffer.Slice(0, charsConsumed); - partialCodePointBuffer = ReadOnlySpan.Empty; - break; - case OperationStatus.Done: - Debug.Assert(charsConsumed > partialCodePointBuffer.Length); - Debug.Assert(charsConsumed <= 2); - // Divide up the code point chars into its own buffer and the remainder of the input buffer. - value = value.Slice(charsConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); - break; - case OperationStatus.InvalidData: - Debug.Assert(charsConsumed >= partialCodePointBuffer.Length); - Debug.Assert(charsConsumed <= 2); - value = value.Slice(charsConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); - break; - case OperationStatus.DestinationTooSmall: - default: - Debug.Fail("Unexpected OperationStatus return value."); - break; + Debug.Assert(HasPartialCodePoint); + + if (TryGetPartialUtf16CodePoint(out scoped ReadOnlySpan partialCodePointBuffer)) + { + Span combinedBuffer = stackalloc char[2]; + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, value, combinedBuffer)); + + switch (Rune.DecodeFromUtf16(combinedBuffer, out _, out int charsConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(value.Length + partialCodePointBuffer.Length < 2); + Debug.Assert(charsConsumed == value.Length + 
partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + value = combinedBuffer.Slice(0, charsConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(charsConsumed > partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + // Divide up the code point chars into its own buffer and the remainder of the input buffer. + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(charsConsumed >= partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. + // Because we have validated above that partialCodePointBuffer will be the next consumed chars during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having ['\uD800', 'C'], where the validation above would have needed both code units + // to determine that only the first unit should be consumed (as invalid). So this method will get only ['\uD800']. + // Because we know more data will not be able to complete this code point, we need to pass isFinalSegment = true + // to ensure that the encoder consumes this data eagerly instead of leaving it and returning NeedsMoreData.
+ WriteStringSegmentEscape(partialCodePointBuffer, true); + } + else + { + WriteInvalidPartialCodePoint(); } ClearPartialCodePoint(); - // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. - // Because we have validated above that partialCodePointBuffer will be the next consumed chars during Rune decoding - // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to - // parse the code units without extra data. - // - // This is relevant in the case of having ['\uD800', 'C'], where the validation above would have needed all both code units - // to determine that only the first unit should be consumed (as invalid). So this method will get only ['\uD800']. - // Because we know more data will not be able to complete this code point, we need to pass isFinalSegment = true - // to ensure that the encoder consumes this data eagerly instead of leaving it and returning NeedsMoreData. - WriteStringSegmentEscape(partialCodePointBuffer, true); - WriteStringSegmentEscape(value, isFinalSegment); } @@ -223,53 +230,60 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen private void WriteStringSegmentWithLeftover(scoped ReadOnlySpan utf8Value, bool isFinalSegment) { - Span combinedBuffer = stackalloc byte[4]; - - GetPartialUtf8CodePoint(out scoped ReadOnlySpan partialCodePointBuffer); - combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, utf8Value, combinedBuffer)); - - switch (Rune.DecodeFromUtf8(combinedBuffer, out _, out int bytesConsumed)) - { - case OperationStatus.NeedMoreData: - Debug.Assert(utf8Value.Length + partialCodePointBuffer.Length < 4); - Debug.Assert(bytesConsumed == utf8Value.Length + partialCodePointBuffer.Length); - // Let the encoder deal with the error if this is a final buffer. 
- utf8Value = combinedBuffer.Slice(0, bytesConsumed); - partialCodePointBuffer = ReadOnlySpan.Empty; - break; - case OperationStatus.Done: - Debug.Assert(bytesConsumed > partialCodePointBuffer.Length); - Debug.Assert(bytesConsumed <= 4); - // Divide up the code point bytes into its own buffer and the remainder of the input buffer. - utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); - break; - case OperationStatus.InvalidData: - Debug.Assert(bytesConsumed >= partialCodePointBuffer.Length); - Debug.Assert(bytesConsumed <= 4); - utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); - break; - case OperationStatus.DestinationTooSmall: - default: - Debug.Fail("Unexpected OperationStatus return value."); - break; + Debug.Assert(HasPartialCodePoint); + + if (TryGetPartialUtf8CodePoint(out scoped ReadOnlySpan partialCodePointBuffer)) + { + Span combinedBuffer = stackalloc byte[4]; + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, utf8Value, combinedBuffer)); + + switch (Rune.DecodeFromUtf8(combinedBuffer, out _, out int bytesConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(utf8Value.Length + partialCodePointBuffer.Length < 4); + Debug.Assert(bytesConsumed == utf8Value.Length + partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + utf8Value = combinedBuffer.Slice(0, bytesConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(bytesConsumed > partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + // Divide up the code point bytes into its own buffer and the remainder of the input buffer. 
+ utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(bytesConsumed >= partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. + // Because we have validated above that partialCodePointBuffer will be the next consumed bytes during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having [<3-length prefix code unit>, <continuation code unit>, <ascii>], where the validation + // above would have needed all 3 code units to determine that only the first 2 units should be consumed (as invalid). + // So this method will get only <3-size prefix code unit><continuation code unit>. Because we know more data will not be able + // to complete this code point, we need to pass isFinalSegment = true to ensure that the encoder consumes this data eagerly + // instead of leaving it and returning NeedsMoreData. + WriteStringSegmentEscape(partialCodePointBuffer, true); + } + else + { + WriteInvalidPartialCodePoint(); } ClearPartialCodePoint(); - // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not.
- // Because we have validated above that partialCodePointBuffer will be the next consumed bytes during Rune decoding - // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to - // parse the code units without extra data. - // - // This is relevant in the case of having [<3-length prefix code unit>, <continuation code unit>, <ascii>], where the validation - // above would have needed all 3 code units to determine that only the first 2 units should be consumed (as invalid). - // So this method will get only <3-size prefix code unit><continuation code unit>. Because we know more data will not be able - // to complete this code point, we need to pass isFinalSegment = true to ensure that the encoder consumes this data eagerly - // instead of leaving it and returning NeedsMoreData. - WriteStringSegmentEscape(partialCodePointBuffer, true); - WriteStringSegmentEscape(utf8Value, isFinalSegment); } @@ -406,52 +420,6 @@ private void WriteStringSegmentEpilogue() _memory.Span[BytesPending++] = JsonConstants.Quote; } - private void GetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) - { - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 4); - - byte length = partialCodePointBytes[3]; - Debug.Assert(0 <= length && length <= 4); - - codePointBytes = partialCodePointBytes.Slice(0, length); - } - - private void GetPartialUtf16CodePoint(out ReadOnlySpan codePointChars) - { - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 4); - - byte length = partialCodePointBytes[3]; - Debug.Assert(length == 2 || length == 0); - - codePointChars = MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); - } - - private void SetPartialUtf8CodePoint(ReadOnlySpan bytes) - { - Debug.Assert(bytes.Length <= 3); - - Span partialCodePointBytes = PartialCodePointRaw; - - bytes.CopyTo(partialCodePointBytes); - partialCodePointBytes[3] = (byte)bytes.Length; - } - - private void
SetPartialUtf16CodePoint(ReadOnlySpan bytes) - { - Debug.Assert(bytes.Length <= 1); - - Span partialCodePointBytes = PartialCodePointRaw; - - bytes.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); - partialCodePointBytes[3] = (byte)(2 * bytes.Length); - } - - private bool HasPartialCodePoint => PartialCodePointRaw[3] != 0; - - private void ClearPartialCodePoint() => PartialCodePointRaw[3] = 0; - /// /// Given a byte buffer , concatenates as much of followed /// by into it as will fit, then returns the total number of bytes copied. diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs index 5f7028ed5a4490..5bb237d05f170c 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -832,7 +832,7 @@ public static void WriteStringValueSegment_MixEncoding() // it is a 2-byte start unit but the second byte is not a continuation. // So a replacement character gets written for the first byte. The second and // third units are valid and get written as is. 
- JsonTestHelper.AssertContents("\uFFFD\u063F", output); + JsonTestHelper.AssertContents(@"""\uFFFD\uFFFD""", output); } // Switch this to use an enum discriminator input when base64 is supported From a078bfd5d16ec5543280db970c5f8f2d17719912 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 16 Dec 2024 18:13:21 -0800 Subject: [PATCH 11/21] add test for switching encoding --- ...tf8JsonWriterTests.Values.StringSegment.cs | 85 +++++++++++++++---- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs index 5bb237d05f170c..64f48b7ca3ae3d 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -814,25 +814,80 @@ public static void WriteStringValueSegment_Utf8_Reset() [Fact] public static void WriteStringValueSegment_MixEncoding() { - var output = new ArrayBufferWriter(); - using var jsonUtf8 = new Utf8JsonWriter(output); + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); - // Becuase the first code point is a surrogate pair, it will be - // saved until the next write to complete it. It is saved in the - // original encoding, UTF-16, so it will be 0b1101_1000 0b1101_1000 - jsonUtf8.WriteStringValueSegment("\uD8D8".AsSpan(), false); + // Because the first code unit is a high surrogate, it will be + // saved until the next write to complete it. It is saved in the + // original encoding, UTF-16, so it will be 0b1101_1000 0b1101_1000 + jsonUtf8.WriteStringValueSegment("\uD8D8".AsSpan(), false); - // Now we write a UTF-8 continuation byte.
With the previous partial - // state, the whole sequence is 0b110_11000 0b110_11000 0b10_111111. - jsonUtf8.WriteStringValueSegment([0b10_111111], true); + // Now we write a UTF-8 continuation byte. With the previous partial + // state, the whole sequence is 0b110_11000 0b110_11000 0b10_111111. + jsonUtf8.WriteStringValueSegment([0b10_111111], true); - jsonUtf8.Flush(); + jsonUtf8.Flush(); + + // If this is interpreted as UTF-8, the first byte is invalid because + // it is a 2-byte start unit but the second byte is not a continuation. + // So a replacement character gets written for the first byte. The second and + // third units are valid and get written as is. Instead, if this is + // handled correctly, two replacement characters will be written. + + JsonTestHelper.AssertContents(@"""\uFFFD\uFFFD""", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + + // The second UTF-16 code unit, when interpreted as UTF-8, is a continuation, + // so if the first and second code units are decoded together, they will + // form a valid 3-byte sequence. + jsonUtf8.WriteStringValueSegment([0b1110_1111], false); + jsonUtf8.WriteStringValueSegment("\u8080".AsSpan(), true); + + jsonUtf8.Flush(); + + JsonTestHelper.AssertContents(@"""\uFFFD\u8080""", output); + } - // If this is interpreted as UTF-8, the first byte is invalid because - // it is a 2-byte start unit but the second byte is not a continuation. - // So a replacement character gets written for the first byte. The second and - // third units are valid and get written as is. 
- JsonTestHelper.AssertContents(@"""\uFFFD\uFFFD""", output); + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment([0b110_11111], false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + + // Writing empty UTF-8 sequence will still keep the partial code point + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + + // Writing empty UTF-16 sequence will dump the partial UTF-8 code point + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents(@"""\uFFFD", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(['\uD800'], false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + + // Writing empty UTF-16 sequence will still keep the partial code point + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + + // Writing empty UTF-8 sequence will dump the partial UTF-16 code point + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents(@"""\uFFFD", output); + } } // Switch this to use an enum discriminator input when base64 is supported From 93e6ee9bf453e89972ab1fd460c959a0bf51608f Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Tue, 17 Dec 2024 11:55:16 -0800 Subject: [PATCH 12/21] use CoreLib Rune for polyfill instead of having a separate copy --- .../src/System/Text/Rune.cs | 33 +- .../src/System/Text/Unicode/Utf16Utility.cs | 5 + .../System.Text.Rune.netstandard20.cs | 547 ------------------ .../src/Resources/Strings.resx | 12 + .../src/System.Text.Encodings.Web.csproj | 4 +- .../src/System/ThrowHelper.cs | 120 ++++ ...em.Numerics.BitOperations.netstandard20.cs | 64 -- 
.../System.Text.Rune.netstandard20.cs | 547 ------------------ .../src/Resources/Strings.resx | 12 + .../src/System.Text.Json.csproj | 7 +- .../src/System/ThrowHelper.cs | 120 ++++ 11 files changed, 302 insertions(+), 1169 deletions(-) delete mode 100644 src/libraries/System.Text.Encodings.Web/src/Polyfills/System.Text.Rune.netstandard20.cs create mode 100644 src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs delete mode 100644 src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs delete mode 100644 src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs create mode 100644 src/libraries/System.Text.Json/src/System/ThrowHelper.cs diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs index b283674d8ace46..9fe75a3243ff39 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -18,7 +18,12 @@ namespace System.Text /// assuming that the underlying instance is well-formed. /// [DebuggerDisplay("{DebuggerDisplay,nq}")] - public readonly struct Rune : IComparable, IComparable, IEquatable +#if SYSTEM_PRIVATE_CORELIB + public +#else + internal +#endif + readonly struct Rune : IComparable, IComparable, IEquatable #if SYSTEM_PRIVATE_CORELIB #pragma warning disable SA1001 // Commas should be spaced correctly , ISpanFormattable @@ -102,7 +107,9 @@ public Rune(int value) /// /// If does not represent a value Unicode scalar value. 
/// +#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] +#endif public Rune(uint value) { if (!UnicodeUtility.IsValidUnicodeScalar(value)) @@ -135,13 +142,22 @@ private Rune(uint scalarValue, bool _) public static explicit operator Rune(char ch) => new Rune(ch); +#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] +#endif public static explicit operator Rune(uint value) => new Rune(value); public static explicit operator Rune(int value) => new Rune(value); // Displayed as "'' (U+XXXX)"; e.g., "'e' (U+0065)" - private string DebuggerDisplay => string.Create(CultureInfo.InvariantCulture, $"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'"); + private string DebuggerDisplay => +#if SYSTEM_PRIVATE_CORELIB + string.Create( + CultureInfo.InvariantCulture, +#else + FormattableString.Invariant( +#endif + $"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'"); /// /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) @@ -242,7 +258,6 @@ private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool to #else private static Rune ChangeCaseCultureAware(Rune rune, CultureInfo culture, bool toUpper) { - Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller."); Debug.Assert(culture != null, "This should've been checked by the caller."); Span original = stackalloc char[MaxUtf16CharsPerRune]; // worst case scenario = 2 code units (for a surrogate pair) @@ -795,7 +810,9 @@ public static Rune GetRuneAt(string input, int index) /// Returns iff is a valid Unicode scalar /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. 
/// +#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] +#endif public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); // returns a negative number on failure @@ -978,7 +995,9 @@ public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune res /// /// Attempts to create a from the provided input value. /// +#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] +#endif public static bool TryCreate(uint value, out Rune result) { if (UnicodeUtility.IsValidUnicodeScalar(value)) @@ -1375,12 +1394,12 @@ public static Rune ToLower(Rune value, CultureInfo culture) // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead // we'll just jump straight to the globalization tables if they're available. +#if SYSTEM_PRIVATE_CORELIB if (GlobalizationMode.Invariant) { return ToLowerInvariant(value); } -#if SYSTEM_PRIVATE_CORELIB return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: false); #else return ChangeCaseCultureAware(value, culture, toUpper: false); @@ -1399,6 +1418,7 @@ public static Rune ToLowerInvariant(Rune value) return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value)); } +#if SYSTEM_PRIVATE_CORELIB if (GlobalizationMode.Invariant) { return UnsafeCreate(CharUnicodeInfo.ToLower(value._value)); @@ -1406,7 +1426,6 @@ public static Rune ToLowerInvariant(Rune value) // Non-ASCII data requires going through the case folding tables. 
-#if SYSTEM_PRIVATE_CORELIB return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false); #else return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: false); @@ -1424,12 +1443,12 @@ public static Rune ToUpper(Rune value, CultureInfo culture) // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead // we'll just jump straight to the globalization tables if they're available. +#if SYSTEM_PRIVATE_CORELIB if (GlobalizationMode.Invariant) { return ToUpperInvariant(value); } -#if SYSTEM_PRIVATE_CORELIB return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: true); #else return ChangeCaseCultureAware(value, culture, toUpper: true); @@ -1448,6 +1467,7 @@ public static Rune ToUpperInvariant(Rune value) return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value)); } +#if SYSTEM_PRIVATE_CORELIB if (GlobalizationMode.Invariant) { return UnsafeCreate(CharUnicodeInfo.ToUpper(value._value)); @@ -1455,7 +1475,6 @@ public static Rune ToUpperInvariant(Rune value) // Non-ASCII data requires going through the case folding tables. 
-#if SYSTEM_PRIVATE_CORELIB return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true); #else return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: true); diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs index 8961529dfed83c..7a79a3a6592657 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs @@ -3,7 +3,10 @@ using System.Diagnostics; using System.Runtime.CompilerServices; + +#if SYSTEM_PRIVATE_CORELIB using System.Runtime.Intrinsics; +#endif namespace System.Text.Unicode { @@ -277,6 +280,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB) return (differentBits & indicator) == 0; } +#if SYSTEM_PRIVATE_CORELIB /// /// Returns true iff the TVector represents ASCII UTF-16 characters in machine endianness. /// @@ -286,5 +290,6 @@ internal static bool AllCharsInVectorAreAscii(TVector vec) { return (vec & TVector.Create(unchecked((ushort)~0x007F))).Equals(TVector.Zero); } +#endif } } diff --git a/src/libraries/System.Text.Encodings.Web/src/Polyfills/System.Text.Rune.netstandard20.cs b/src/libraries/System.Text.Encodings.Web/src/Polyfills/System.Text.Rune.netstandard20.cs deleted file mode 100644 index f48490179238eb..00000000000000 --- a/src/libraries/System.Text.Encodings.Web/src/Polyfills/System.Text.Rune.netstandard20.cs +++ /dev/null @@ -1,547 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Buffers; -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Text.Encodings.Web; - -// Contains a polyfill implementation of System.Text.Rune that works on netstandard2.0. -// Implementation copied from: -// https://github.com/dotnet/runtime/blob/177d6f1a0bfdc853ae9ffeef4be99ff984c4f5dd/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs - -namespace System.Text -{ - internal readonly struct Rune : IEquatable - { - private const int MaxUtf16CharsPerRune = 2; // supplementary plane code points are encoded as 2 UTF-16 code units - - private const char HighSurrogateStart = '\ud800'; - private const char LowSurrogateStart = '\udc00'; - private const int HighSurrogateRange = 0x3FF; - - private readonly uint _value; - - /// - /// Creates a from the provided Unicode scalar value. - /// - /// - /// If does not represent a value Unicode scalar value. - /// - public Rune(uint value) - { - if (!UnicodeUtility.IsValidUnicodeScalar(value)) - { - ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); - } - _value = value; - } - - /// - /// Creates a from the provided Unicode scalar value. - /// - /// - /// If does not represent a value Unicode scalar value. - /// - public Rune(int value) - : this((uint)value) - { - } - - // non-validating ctor - private Rune(uint scalarValue, bool _) - { - UnicodeDebug.AssertIsValidScalar(scalarValue); - _value = scalarValue; - } - - /// - /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) - /// and therefore representable by a single UTF-8 code unit. - /// - public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value); - - /// - /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) - /// and therefore representable by a single UTF-16 code unit. 
- /// - public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); - - public static bool operator ==(Rune left, Rune right) => left._value == right._value; - - public static bool operator !=(Rune left, Rune right) => left._value != right._value; - - public static bool IsControl(Rune value) - { - // Per the Unicode stability policy, the set of control characters - // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No - // characters will ever be added to or removed from the "control characters" - // group. See https://www.unicode.org/policies/stability_policy.html. - - // Logic below depends on Rune.Value never being -1 (since Rune is a validating type) - // 00..1F (+1) => 01..20 (&~80) => 01..20 - // 7F..9F (+1) => 80..A0 (&~80) => 00..20 - - return ((value._value + 1) & ~0x80u) <= 0x20u; - } - - /// - /// A instance that represents the Unicode replacement character U+FFFD. - /// - public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); - - /// - /// Returns the length in code units () of the - /// UTF-16 sequence required to represent this scalar value. - /// - /// - /// The return value will be 1 or 2. - /// - public int Utf16SequenceLength - { - get - { - int codeUnitCount = UnicodeUtility.GetUtf16SequenceLength(_value); - Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf16CharsPerRune); - return codeUnitCount; - } - } - - /// - /// Returns the Unicode scalar value as an integer. - /// - public int Value => (int)_value; - - /// - /// Decodes the at the beginning of the provided UTF-16 source buffer. - /// - /// - /// - /// If the source buffer begins with a valid UTF-16 encoded scalar value, returns , - /// and outs via the decoded and via the - /// number of s used in the input buffer to encode the . - /// - /// - /// If the source buffer is empty or contains only a standalone UTF-16 high surrogate character, returns , - /// and outs via and via the length of the input buffer. 
- /// - /// - /// If the source buffer begins with an ill-formed UTF-16 encoded scalar value, returns , - /// and outs via and via the number of - /// s used in the input buffer to encode the ill-formed sequence. - /// - /// - /// - /// The general calling convention is to call this method in a loop, slicing the buffer by - /// elements on each iteration of the loop. On each iteration of the loop - /// will contain the real scalar value if successfully decoded, or it will contain if - /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of - /// invalid sequences while iterating through the loop. - /// - public static OperationStatus DecodeFromUtf16(ReadOnlySpan source, out Rune result, out int charsConsumed) - { - if (!source.IsEmpty) - { - // First, check for the common case of a BMP scalar value. - // If this is correct, return immediately. - - char firstChar = source[0]; - if (TryCreate(firstChar, out result)) - { - charsConsumed = 1; - return OperationStatus.Done; - } - - // First thing we saw was a UTF-16 surrogate code point. - // Let's optimistically assume for now it's a high surrogate and hope - // that combining it with the next char yields useful results. - - if (1 < (uint)source.Length) - { - char secondChar = source[1]; - if (TryCreate(firstChar, secondChar, out result)) - { - // Success! Formed a supplementary scalar value. - charsConsumed = 2; - return OperationStatus.Done; - } - else - { - // Either the first character was a low surrogate, or the second - // character was not a low surrogate. This is an error. - goto InvalidData; - } - } - else if (!char.IsHighSurrogate(firstChar)) - { - // Quick check to make sure we're not going to report NeedMoreData for - // a single-element buffer where the data is a standalone low surrogate - // character. Since no additional data will ever make this valid, we'll - // report an error immediately. 
- goto InvalidData; - } - } - - // If we got to this point, the input buffer was empty, or the buffer - // was a single element in length and that element was a high surrogate char. - - charsConsumed = source.Length; - result = ReplacementChar; - return OperationStatus.NeedMoreData; - - InvalidData: - - charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length - result = ReplacementChar; - return OperationStatus.InvalidData; - } - - /// - /// Decodes the at the beginning of the provided UTF-8 source buffer. - /// - /// - /// - /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns , - /// and outs via the decoded and via the - /// number of s used in the input buffer to encode the . - /// - /// - /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns , - /// and outs via and via the length of the input buffer. - /// - /// - /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns , - /// and outs via and via the number of - /// s used in the input buffer to encode the ill-formed sequence. - /// - /// - /// - /// The general calling convention is to call this method in a loop, slicing the buffer by - /// elements on each iteration of the loop. On each iteration of the loop - /// will contain the real scalar value if successfully decoded, or it will contain if - /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of - /// invalid sequences while iterating through the loop. - /// - public static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out Rune result, out int bytesConsumed) - { - // This method follows the Unicode Standard's recommendation for detecting - // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, - // Ch. 3.9 for more details. 
In summary, when reporting an invalid subsequence, - // it tries to consume as many code units as possible as long as those code - // units constitute the beginning of a longer well-formed subsequence per Table 3-7. - - int index = 0; - - // Try reading input[0]. - - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - uint tempValue = source[index]; - if (!UnicodeUtility.IsAsciiCodePoint(tempValue)) - { - goto NotAscii; - } - - Finish: - - bytesConsumed = index + 1; - Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] - result = UnsafeCreate(tempValue); - return OperationStatus.Done; - - NotAscii: - - // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in - // the range [C2..F4]. If it's outside of that range, it's either a standalone - // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range - // four-byte sequence. - - if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) - { - goto FirstByteInvalid; - } - - tempValue = (tempValue - 0xC2) << 6; - - // Try reading input[1]. - - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - // Continuation bytes are of the form [10xxxxxx], which means that their two's - // complement representation is in the range [-65..-128]. This allows us to - // perform a single comparison to see if a byte is a continuation byte. - - int thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; - } - - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker - - if (tempValue < 0x0800) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); - goto Finish; // this is a valid 2-byte sequence - } - - // This appears to be a 3- or 4-byte sequence. 
Since per Table 3-7 we now have - // enough information (from just two code units) to detect overlong or surrogate - // sequences, we need to perform these checks now. - - if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) - { - // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. - // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) - { - // This is a UTF-16 surrogate code point, which is invalid in UTF-8. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) - { - // This is an overlong 4-byte sequence. - goto Invalid; - } - - // The first two bytes were just fine. We don't need to perform any other checks - // on the remaining bytes other than to see that they're valid continuation bytes. - - // Try reading input[2]. - - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker - - if (tempValue <= 0xFFFF) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); - goto Finish; // this is a valid 3-byte sequence - } - - // Try reading input[3]. 
- - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker - - UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); - goto Finish; // this is a valid 4-byte sequence - - FirstByteInvalid: - - index = 1; // Invalid subsequences are always at least length 1. - - Invalid: - - Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 - bytesConsumed = index; - result = ReplacementChar; - return OperationStatus.InvalidData; - - NeedsMoreData: - - Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 - bytesConsumed = index; - result = ReplacementChar; - return OperationStatus.NeedMoreData; - } - - public override bool Equals([NotNullWhen(true)] object? obj) => (obj is Rune other) && Equals(other); - - public bool Equals(Rune other) => this == other; - - public override int GetHashCode() => Value; - - /// - /// Attempts to create a from the provided input value. - /// - public static bool TryCreate(char ch, out Rune result) - { - uint extendedValue = ch; - if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue)) - { - result = UnsafeCreate(extendedValue); - return true; - } - else - { - result = default; - return false; - } - } - - /// - /// Attempts to create a from the provided UTF-16 surrogate pair. - /// Returns if the input values don't represent a well-formed UTF-16surrogate pair. - /// - public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result) - { - // First, extend both to 32 bits, then calculate the offset of - // each candidate surrogate char from the start of its range. 
- - uint highSurrogateOffset = (uint)highSurrogate - HighSurrogateStart; - uint lowSurrogateOffset = (uint)lowSurrogate - LowSurrogateStart; - - // This is a single comparison which allows us to check both for validity at once since - // both the high surrogate range and the low surrogate range are the same length. - // If the comparison fails, we call to a helper method to throw the correct exception message. - - if ((highSurrogateOffset | lowSurrogateOffset) <= HighSurrogateRange) - { - // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding. - result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - LowSurrogateStart) + (0x40u << 10)); - return true; - } - else - { - // Didn't have a high surrogate followed by a low surrogate. - result = default; - return false; - } - } - - /// - /// Encodes this to a UTF-16 destination buffer. - /// - /// The buffer to which to write this value as UTF-16. - /// - /// The number of s written to , - /// or 0 if the destination buffer is not large enough to contain the output. - /// True if the value was written to the buffer; otherwise, false. - public bool TryEncodeToUtf16(Span destination, out int charsWritten) - { - if (destination.Length >= 1) - { - if (IsBmp) - { - destination[0] = (char)_value; - charsWritten = 1; - return true; - } - else if (destination.Length >= 2) - { - UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]); - charsWritten = 2; - return true; - } - } - - // Destination buffer not large enough - - charsWritten = default; - return false; - } - - /// - /// Encodes this to a destination buffer as UTF-8 bytes. - /// - /// The buffer to which to write this value as UTF-8. - /// - /// The number of s written to , - /// or 0 if the destination buffer is not large enough to contain the output. - /// True if the value was written to the buffer; otherwise, false. 
- public bool TryEncodeToUtf8(Span destination, out int bytesWritten) - { - // The bit patterns below come from the Unicode Standard, Table 3-6. - - if (destination.Length >= 1) - { - if (IsAscii) - { - destination[0] = (byte)_value; - bytesWritten = 1; - return true; - } - - if (destination.Length >= 2) - { - if (_value <= 0x7FFu) - { - // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b110u << 11)) >> 6); - destination[1] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 2; - return true; - } - - if (destination.Length >= 3) - { - if (_value <= 0xFFFFu) - { - // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b1110 << 16)) >> 12); - destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); - destination[2] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 3; - return true; - } - - if (destination.Length >= 4) - { - // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b11110 << 21)) >> 18); - destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u); - destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); - destination[3] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 4; - return true; - } - } - } - } - - // Destination buffer not large enough - - bytesWritten = default; - return false; - } - - /// - /// Creates a without performing validation on the input. 
- /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); - } -} diff --git a/src/libraries/System.Text.Encodings.Web/src/Resources/Strings.resx b/src/libraries/System.Text.Encodings.Web/src/Resources/Strings.resx index 2a0d862e398987..b6d9de5a23fa29 100644 --- a/src/libraries/System.Text.Encodings.Web/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Encodings.Web/src/Resources/Strings.resx @@ -117,6 +117,18 @@ System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + Index was out of range. Must be non-negative and less than the size of the collection. + + + Cannot extract a Unicode scalar value from the specified index in the input. + + + Destination is too short. + + + Object must be of type Rune. + TextEncoder does not implement MaxOutputCharsPerInputChar correctly. diff --git a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index 426e51a961445e..833cd9691870ec 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -51,6 +51,7 @@ System.Text.Encodings.Web.JavaScriptEncoder + @@ -59,8 +60,9 @@ System.Text.Encodings.Web.JavaScriptEncoder + + - diff --git a/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs b/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs new file mode 100644 index 00000000000000..a975436d4e800d --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs @@ -0,0 +1,120 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using SR = System.SR; + +namespace System +{ + internal static class ThrowHelper + { + [DoesNotReturn] + internal static void ThrowArgumentOutOfRangeException() + { + throw new ArgumentOutOfRangeException(); + } + + [DoesNotReturn] + internal static void ThrowArgumentException_DestinationTooShort() + { + throw new ArgumentException(SR.Argument_DestinationTooShort, "destination"); + } + + [DoesNotReturn] + internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument) + { + throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument); + } + + [DoesNotReturn] + internal static void ThrowArgumentOutOfRange_IndexMustBeLessException() + { + throw GetArgumentOutOfRangeException(ExceptionArgument.index, + ExceptionResource.ArgumentOutOfRange_IndexMustBeLess); + } + + [DoesNotReturn] + internal static void ThrowArgumentNullException(ExceptionArgument argument) + { + throw new ArgumentNullException(GetArgumentName(argument)); + } + + [DoesNotReturn] + internal static void ThrowArgumentOutOfRangeException(ExceptionArgument argument) + { + throw new ArgumentOutOfRangeException(GetArgumentName(argument)); + } + + private static ArgumentException GetArgumentException(ExceptionResource resource) + { + return new ArgumentException(GetResourceString(resource)); + } + + private static ArgumentOutOfRangeException GetArgumentOutOfRangeException(ExceptionArgument argument, ExceptionResource resource) + { + return new ArgumentOutOfRangeException(GetArgumentName(argument), GetResourceString(resource)); + } + + private static ArgumentException GetArgumentException(ExceptionResource resource, ExceptionArgument argument) + { + return new ArgumentException(GetResourceString(resource), GetArgumentName(argument)); + } + + private static string GetArgumentName(ExceptionArgument argument) + { + switch (argument) + { + case ExceptionArgument.ch: + return 
nameof(ExceptionArgument.ch); + case ExceptionArgument.culture: + return nameof(ExceptionArgument.culture); + case ExceptionArgument.index: + return nameof(ExceptionArgument.index); + case ExceptionArgument.input: + return nameof(ExceptionArgument.input); + case ExceptionArgument.value: + return nameof(ExceptionArgument.value); + default: + Debug.Fail("The enum value is not defined, please check the ExceptionArgument Enum."); + return ""; + + }; + } + + private static string GetResourceString(ExceptionResource resource) + { + switch (resource) + { + case ExceptionResource.ArgumentOutOfRange_IndexMustBeLess: + return SR.ArgumentOutOfRange_IndexMustBeLess; + case ExceptionResource.Argument_CannotExtractScalar: + return SR.Argument_CannotExtractScalar; + default: + Debug.Fail("The enum value is not defined, please check the ExceptionResource Enum."); + return ""; + } + } + } + + // + // The convention for this enum is using the argument name as the enum name + // + internal enum ExceptionArgument + { + ch, + culture, + index, + input, + value, + } + + // + // The convention for this enum is using the resource name as the enum name + // + internal enum ExceptionResource + { + Argument_CannotExtractScalar, + ArgumentOutOfRange_IndexMustBeLess + } +} diff --git a/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs b/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs deleted file mode 100644 index 5453bd9e6b57d1..00000000000000 --- a/src/libraries/System.Text.Json/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs +++ /dev/null @@ -1,64 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -// Contains a polyfill implementation of System.Numerics.BitOperations that works on netstandard2.0. 
-// Implementation copied from: -// https://github.com/dotnet/runtime/blob/6072e4d3a7a2a1493f514cdf4be75a3d56580e84/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs -// -// Some routines inspired by the Stanford Bit Twiddling Hacks by Sean Eron Anderson: -// http://graphics.stanford.edu/~seander/bithacks.html - -namespace System.Numerics -{ - internal static class BitOperations - { - private static ReadOnlySpan Log2DeBruijn => // 32 - [ - 00, 09, 01, 10, 13, 21, 02, 29, - 11, 14, 16, 18, 22, 25, 03, 30, - 08, 12, 20, 28, 15, 17, 24, 07, - 19, 27, 23, 06, 26, 05, 04, 31 - ]; - - /// - /// Returns the integer (floor) log of the specified value, base 2. - /// Note that by convention, input value 0 returns 0 since log(0) is undefined. - /// - /// The value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Log2(uint value) - { - // Fallback contract is 0->0 - return Log2SoftwareFallback(value | 1); - } - - /// - /// Returns the integer (floor) log of the specified value, base 2. - /// Note that by convention, input value 0 returns 0 since Log(0) is undefined. - /// Does not directly use any hardware intrinsics, nor does it incur branching. - /// - /// The value. 
- private static int Log2SoftwareFallback(uint value) - { - // No AggressiveInlining due to large method size - // Has conventional contract 0->0 (Log(0) is undefined) - - // Fill trailing zeros with ones, eg 00010010 becomes 00011111 - value |= value >> 01; - value |= value >> 02; - value |= value >> 04; - value |= value >> 08; - value |= value >> 16; - - // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check - return Unsafe.AddByteOffset( - // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_1100_0100_1010_1100_1101_1101u - ref MemoryMarshal.GetReference(Log2DeBruijn), - // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here - (nint)((value * 0x07C4ACDDu) >> 27)); - } - } -} diff --git a/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs b/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs deleted file mode 100644 index 8a490a85465617..00000000000000 --- a/src/libraries/System.Text.Json/src/Polyfills/System.Text.Rune.netstandard20.cs +++ /dev/null @@ -1,547 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Buffers; -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Text.Json; - -// Contains a polyfill implementation of System.Text.Rune that works on netstandard2.0. 
-// Implementation copied from: -// https://github.com/dotnet/runtime/blob/177d6f1a0bfdc853ae9ffeef4be99ff984c4f5dd/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs - -namespace System.Text -{ - internal readonly struct Rune : IEquatable - { - private const int MaxUtf16CharsPerRune = 2; // supplementary plane code points are encoded as 2 UTF-16 code units - - private const char HighSurrogateStart = '\ud800'; - private const char LowSurrogateStart = '\udc00'; - private const int HighSurrogateRange = 0x3FF; - - private readonly uint _value; - - /// - /// Creates a from the provided Unicode scalar value. - /// - /// - /// If does not represent a value Unicode scalar value. - /// - public Rune(uint value) - { - if (!UnicodeUtility.IsValidUnicodeScalar(value)) - { - throw new ArgumentOutOfRangeException(nameof(value)); - } - _value = value; - } - - /// - /// Creates a from the provided Unicode scalar value. - /// - /// - /// If does not represent a value Unicode scalar value. - /// - public Rune(int value) - : this((uint)value) - { - } - - // non-validating ctor - private Rune(uint scalarValue, bool _) - { - UnicodeDebug.AssertIsValidScalar(scalarValue); - _value = scalarValue; - } - - /// - /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) - /// and therefore representable by a single UTF-8 code unit. - /// - public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value); - - /// - /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) - /// and therefore representable by a single UTF-16 code unit. 
- /// - public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); - - public static bool operator ==(Rune left, Rune right) => left._value == right._value; - - public static bool operator !=(Rune left, Rune right) => left._value != right._value; - - public static bool IsControl(Rune value) - { - // Per the Unicode stability policy, the set of control characters - // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No - // characters will ever be added to or removed from the "control characters" - // group. See https://www.unicode.org/policies/stability_policy.html. - - // Logic below depends on Rune.Value never being -1 (since Rune is a validating type) - // 00..1F (+1) => 01..20 (&~80) => 01..20 - // 7F..9F (+1) => 80..A0 (&~80) => 00..20 - - return ((value._value + 1) & ~0x80u) <= 0x20u; - } - - /// - /// A instance that represents the Unicode replacement character U+FFFD. - /// - public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); - - /// - /// Returns the length in code units () of the - /// UTF-16 sequence required to represent this scalar value. - /// - /// - /// The return value will be 1 or 2. - /// - public int Utf16SequenceLength - { - get - { - int codeUnitCount = UnicodeUtility.GetUtf16SequenceLength(_value); - Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf16CharsPerRune); - return codeUnitCount; - } - } - - /// - /// Returns the Unicode scalar value as an integer. - /// - public int Value => (int)_value; - - /// - /// Decodes the at the beginning of the provided UTF-16 source buffer. - /// - /// - /// - /// If the source buffer begins with a valid UTF-16 encoded scalar value, returns , - /// and outs via the decoded and via the - /// number of s used in the input buffer to encode the . - /// - /// - /// If the source buffer is empty or contains only a standalone UTF-16 high surrogate character, returns , - /// and outs via and via the length of the input buffer. 
- /// - /// - /// If the source buffer begins with an ill-formed UTF-16 encoded scalar value, returns , - /// and outs via and via the number of - /// s used in the input buffer to encode the ill-formed sequence. - /// - /// - /// - /// The general calling convention is to call this method in a loop, slicing the buffer by - /// elements on each iteration of the loop. On each iteration of the loop - /// will contain the real scalar value if successfully decoded, or it will contain if - /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of - /// invalid sequences while iterating through the loop. - /// - public static OperationStatus DecodeFromUtf16(ReadOnlySpan source, out Rune result, out int charsConsumed) - { - if (!source.IsEmpty) - { - // First, check for the common case of a BMP scalar value. - // If this is correct, return immediately. - - char firstChar = source[0]; - if (TryCreate(firstChar, out result)) - { - charsConsumed = 1; - return OperationStatus.Done; - } - - // First thing we saw was a UTF-16 surrogate code point. - // Let's optimistically assume for now it's a high surrogate and hope - // that combining it with the next char yields useful results. - - if (1 < (uint)source.Length) - { - char secondChar = source[1]; - if (TryCreate(firstChar, secondChar, out result)) - { - // Success! Formed a supplementary scalar value. - charsConsumed = 2; - return OperationStatus.Done; - } - else - { - // Either the first character was a low surrogate, or the second - // character was not a low surrogate. This is an error. - goto InvalidData; - } - } - else if (!char.IsHighSurrogate(firstChar)) - { - // Quick check to make sure we're not going to report NeedMoreData for - // a single-element buffer where the data is a standalone low surrogate - // character. Since no additional data will ever make this valid, we'll - // report an error immediately. 
- goto InvalidData; - } - } - - // If we got to this point, the input buffer was empty, or the buffer - // was a single element in length and that element was a high surrogate char. - - charsConsumed = source.Length; - result = ReplacementChar; - return OperationStatus.NeedMoreData; - - InvalidData: - - charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length - result = ReplacementChar; - return OperationStatus.InvalidData; - } - - /// - /// Decodes the at the beginning of the provided UTF-8 source buffer. - /// - /// - /// - /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns , - /// and outs via the decoded and via the - /// number of s used in the input buffer to encode the . - /// - /// - /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns , - /// and outs via and via the length of the input buffer. - /// - /// - /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns , - /// and outs via and via the number of - /// s used in the input buffer to encode the ill-formed sequence. - /// - /// - /// - /// The general calling convention is to call this method in a loop, slicing the buffer by - /// elements on each iteration of the loop. On each iteration of the loop - /// will contain the real scalar value if successfully decoded, or it will contain if - /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of - /// invalid sequences while iterating through the loop. - /// - public static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out Rune result, out int bytesConsumed) - { - // This method follows the Unicode Standard's recommendation for detecting - // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, - // Ch. 3.9 for more details. 
In summary, when reporting an invalid subsequence, - // it tries to consume as many code units as possible as long as those code - // units constitute the beginning of a longer well-formed subsequence per Table 3-7. - - int index = 0; - - // Try reading input[0]. - - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - uint tempValue = source[index]; - if (!UnicodeUtility.IsAsciiCodePoint(tempValue)) - { - goto NotAscii; - } - - Finish: - - bytesConsumed = index + 1; - Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] - result = UnsafeCreate(tempValue); - return OperationStatus.Done; - - NotAscii: - - // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in - // the range [C2..F4]. If it's outside of that range, it's either a standalone - // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range - // four-byte sequence. - - if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) - { - goto FirstByteInvalid; - } - - tempValue = (tempValue - 0xC2) << 6; - - // Try reading input[1]. - - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - // Continuation bytes are of the form [10xxxxxx], which means that their two's - // complement representation is in the range [-65..-128]. This allows us to - // perform a single comparison to see if a byte is a continuation byte. - - int thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; - } - - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker - - if (tempValue < 0x0800) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); - goto Finish; // this is a valid 2-byte sequence - } - - // This appears to be a 3- or 4-byte sequence. 
Since per Table 3-7 we now have - // enough information (from just two code units) to detect overlong or surrogate - // sequences, we need to perform these checks now. - - if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) - { - // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. - // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) - { - // This is a UTF-16 surrogate code point, which is invalid in UTF-8. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) - { - // This is an overlong 4-byte sequence. - goto Invalid; - } - - // The first two bytes were just fine. We don't need to perform any other checks - // on the remaining bytes other than to see that they're valid continuation bytes. - - // Try reading input[2]. - - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker - - if (tempValue <= 0xFFFF) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); - goto Finish; // this is a valid 3-byte sequence - } - - // Try reading input[3]. 
- - index++; - if ((uint)index >= (uint)source.Length) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[index]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker - - UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); - goto Finish; // this is a valid 4-byte sequence - - FirstByteInvalid: - - index = 1; // Invalid subsequences are always at least length 1. - - Invalid: - - Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 - bytesConsumed = index; - result = ReplacementChar; - return OperationStatus.InvalidData; - - NeedsMoreData: - - Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 - bytesConsumed = index; - result = ReplacementChar; - return OperationStatus.NeedMoreData; - } - - public override bool Equals([NotNullWhen(true)] object? obj) => (obj is Rune other) && Equals(other); - - public bool Equals(Rune other) => this == other; - - public override int GetHashCode() => Value; - - /// - /// Attempts to create a from the provided input value. - /// - public static bool TryCreate(char ch, out Rune result) - { - uint extendedValue = ch; - if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue)) - { - result = UnsafeCreate(extendedValue); - return true; - } - else - { - result = default; - return false; - } - } - - /// - /// Attempts to create a from the provided UTF-16 surrogate pair. - /// Returns if the input values don't represent a well-formed UTF-16surrogate pair. - /// - public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result) - { - // First, extend both to 32 bits, then calculate the offset of - // each candidate surrogate char from the start of its range. 
- - uint highSurrogateOffset = (uint)highSurrogate - HighSurrogateStart; - uint lowSurrogateOffset = (uint)lowSurrogate - LowSurrogateStart; - - // This is a single comparison which allows us to check both for validity at once since - // both the high surrogate range and the low surrogate range are the same length. - // If the comparison fails, we call to a helper method to throw the correct exception message. - - if ((highSurrogateOffset | lowSurrogateOffset) <= HighSurrogateRange) - { - // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding. - result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - LowSurrogateStart) + (0x40u << 10)); - return true; - } - else - { - // Didn't have a high surrogate followed by a low surrogate. - result = default; - return false; - } - } - - /// - /// Encodes this to a UTF-16 destination buffer. - /// - /// The buffer to which to write this value as UTF-16. - /// - /// The number of s written to , - /// or 0 if the destination buffer is not large enough to contain the output. - /// True if the value was written to the buffer; otherwise, false. - public bool TryEncodeToUtf16(Span destination, out int charsWritten) - { - if (destination.Length >= 1) - { - if (IsBmp) - { - destination[0] = (char)_value; - charsWritten = 1; - return true; - } - else if (destination.Length >= 2) - { - UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]); - charsWritten = 2; - return true; - } - } - - // Destination buffer not large enough - - charsWritten = default; - return false; - } - - /// - /// Encodes this to a destination buffer as UTF-8 bytes. - /// - /// The buffer to which to write this value as UTF-8. - /// - /// The number of s written to , - /// or 0 if the destination buffer is not large enough to contain the output. - /// True if the value was written to the buffer; otherwise, false. 
- public bool TryEncodeToUtf8(Span destination, out int bytesWritten) - { - // The bit patterns below come from the Unicode Standard, Table 3-6. - - if (destination.Length >= 1) - { - if (IsAscii) - { - destination[0] = (byte)_value; - bytesWritten = 1; - return true; - } - - if (destination.Length >= 2) - { - if (_value <= 0x7FFu) - { - // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b110u << 11)) >> 6); - destination[1] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 2; - return true; - } - - if (destination.Length >= 3) - { - if (_value <= 0xFFFFu) - { - // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b1110 << 16)) >> 12); - destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); - destination[2] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 3; - return true; - } - - if (destination.Length >= 4) - { - // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((_value + (0b11110 << 21)) >> 18); - destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u); - destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); - destination[3] = (byte)((_value & 0x3Fu) + 0x80u); - bytesWritten = 4; - return true; - } - } - } - } - - // Destination buffer not large enough - - bytesWritten = default; - return false; - } - - /// - /// Creates a without performing validation on the input. 
- /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); - } -} diff --git a/src/libraries/System.Text.Json/src/Resources/Strings.resx b/src/libraries/System.Text.Json/src/Resources/Strings.resx index a6989b11d1fae4..1d0f1005a69859 100644 --- a/src/libraries/System.Text.Json/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Json/src/Resources/Strings.resx @@ -812,4 +812,16 @@ Cannot write the requested JSON property or value until the final string value segment has been written. + + Object must be of type Rune. + + + Index was out of range. Must be non-negative and less than the size of the collection. + + + Destination is too short. + + + Cannot extract a Unicode scalar value from the specified index in the input. + \ No newline at end of file diff --git a/src/libraries/System.Text.Json/src/System.Text.Json.csproj b/src/libraries/System.Text.Json/src/System.Text.Json.csproj index 3c9d9ca1aebf25..e8c9a953577c24 100644 --- a/src/libraries/System.Text.Json/src/System.Text.Json.csproj +++ b/src/libraries/System.Text.Json/src/System.Text.Json.csproj @@ -1,4 +1,4 @@ - + $(NetCoreAppCurrent);$(NetCoreAppPrevious);$(NetCoreAppMinimum);netstandard2.0;$(NetFrameworkMinimum) @@ -341,6 +341,7 @@ The System.Text.Json library is built-in as part of the shared framework in .NET + @@ -392,10 +393,10 @@ The System.Text.Json library is built-in as part of the shared framework in .NET - - + + diff --git a/src/libraries/System.Text.Json/src/System/ThrowHelper.cs b/src/libraries/System.Text.Json/src/System/ThrowHelper.cs new file mode 100644 index 00000000000000..a975436d4e800d --- /dev/null +++ b/src/libraries/System.Text.Json/src/System/ThrowHelper.cs @@ -0,0 +1,120 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using SR = System.SR; + +namespace System +{ + internal static class ThrowHelper + { + [DoesNotReturn] + internal static void ThrowArgumentOutOfRangeException() + { + throw new ArgumentOutOfRangeException(); + } + + [DoesNotReturn] + internal static void ThrowArgumentException_DestinationTooShort() + { + throw new ArgumentException(SR.Argument_DestinationTooShort, "destination"); + } + + [DoesNotReturn] + internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument) + { + throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument); + } + + [DoesNotReturn] + internal static void ThrowArgumentOutOfRange_IndexMustBeLessException() + { + throw GetArgumentOutOfRangeException(ExceptionArgument.index, + ExceptionResource.ArgumentOutOfRange_IndexMustBeLess); + } + + [DoesNotReturn] + internal static void ThrowArgumentNullException(ExceptionArgument argument) + { + throw new ArgumentNullException(GetArgumentName(argument)); + } + + [DoesNotReturn] + internal static void ThrowArgumentOutOfRangeException(ExceptionArgument argument) + { + throw new ArgumentOutOfRangeException(GetArgumentName(argument)); + } + + private static ArgumentException GetArgumentException(ExceptionResource resource) + { + return new ArgumentException(GetResourceString(resource)); + } + + private static ArgumentOutOfRangeException GetArgumentOutOfRangeException(ExceptionArgument argument, ExceptionResource resource) + { + return new ArgumentOutOfRangeException(GetArgumentName(argument), GetResourceString(resource)); + } + + private static ArgumentException GetArgumentException(ExceptionResource resource, ExceptionArgument argument) + { + return new ArgumentException(GetResourceString(resource), GetArgumentName(argument)); + } + + private static string GetArgumentName(ExceptionArgument argument) + { + switch (argument) + { + case ExceptionArgument.ch: + return 
nameof(ExceptionArgument.ch); + case ExceptionArgument.culture: + return nameof(ExceptionArgument.culture); + case ExceptionArgument.index: + return nameof(ExceptionArgument.index); + case ExceptionArgument.input: + return nameof(ExceptionArgument.input); + case ExceptionArgument.value: + return nameof(ExceptionArgument.value); + default: + Debug.Fail("The enum value is not defined, please check the ExceptionArgument Enum."); + return ""; + + }; + } + + private static string GetResourceString(ExceptionResource resource) + { + switch (resource) + { + case ExceptionResource.ArgumentOutOfRange_IndexMustBeLess: + return SR.ArgumentOutOfRange_IndexMustBeLess; + case ExceptionResource.Argument_CannotExtractScalar: + return SR.Argument_CannotExtractScalar; + default: + Debug.Fail("The enum value is not defined, please check the ExceptionResource Enum."); + return ""; + } + } + } + + // + // The convention for this enum is using the argument name as the enum name + // + internal enum ExceptionArgument + { + ch, + culture, + index, + input, + value, + } + + // + // The convention for this enum is using the resource name as the enum name + // + internal enum ExceptionResource + { + Argument_CannotExtractScalar, + ArgumentOutOfRange_IndexMustBeLess + } +} From c3b1c3b9cc5b0ebfc14adbe4bcf363b8d04ebf63 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Wed, 18 Dec 2024 12:26:10 -0800 Subject: [PATCH 13/21] move warning disabling to top and fix up tests --- .../System.Private.CoreLib/src/System/Text/Rune.cs | 12 ++++-------- .../src/System.Text.Encodings.Web.csproj | 3 +-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs index 9fe75a3243ff39..325d64bb278bb3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -8,6 +8,10 @@ using 
System.Runtime.CompilerServices; using System.Text.Unicode; +#if !SYSTEM_PRIVATE_CORELIB +#pragma warning disable CS3019 // CLS compliance checking will not be performed because it is not visible from outside this assembly +#endif + namespace System.Text { /// @@ -107,9 +111,7 @@ public Rune(int value) /// /// If does not represent a value Unicode scalar value. /// -#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] -#endif public Rune(uint value) { if (!UnicodeUtility.IsValidUnicodeScalar(value)) @@ -142,9 +144,7 @@ private Rune(uint scalarValue, bool _) public static explicit operator Rune(char ch) => new Rune(ch); -#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] -#endif public static explicit operator Rune(uint value) => new Rune(value); public static explicit operator Rune(int value) => new Rune(value); @@ -810,9 +810,7 @@ public static Rune GetRuneAt(string input, int index) /// Returns iff is a valid Unicode scalar /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. /// -#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] -#endif public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); // returns a negative number on failure @@ -995,9 +993,7 @@ public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune res /// /// Attempts to create a from the provided input value. 
/// -#if SYSTEM_PRIVATE_CORELIB // CS3019: CLS compliance checking will not be performed on 'Rune.explicit operator Rune(uint)' because it is not visible from outside this assembly [CLSCompliant(false)] -#endif public static bool TryCreate(uint value, out Rune result) { if (UnicodeUtility.IsValidUnicodeScalar(value)) diff --git a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index 833cd9691870ec..9536c99694bd46 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -5,8 +5,7 @@ $(TargetFrameworks);$(NetCoreAppPrevious)-windows;$(NetCoreAppPrevious) true - - $(NoWarn);CS3011;CS3019 + $(NoWarn);CS3011 false true Provides types for encoding and escaping strings for use in JavaScript, HyperText Markup Language (HTML), and uniform resource locators (URL). From c9c48840aacdc51d887739f28ab3da79e6af6950 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Wed, 18 Dec 2024 21:53:43 -0800 Subject: [PATCH 14/21] add fuzzer --- .../libraries/fuzzing/deploy-to-onefuzz.yml | 8 + .../DotnetFuzzing/DotnetFuzzing.csproj | 6 + .../Fuzzers/Utf8JsonWriterFuzzer.cs | 146 ++++++++++++++++++ .../DotnetFuzzing/MemoryBackedStream.cs | 137 ++++++++++++++++ ...tf8JsonWriterTests.Values.StringSegment.cs | 86 ++++++++++- 5 files changed, 378 insertions(+), 5 deletions(-) create mode 100644 src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs create mode 100644 src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs diff --git a/eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml b/eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml index 9f2b06ec638bfd..2c9d95a807d11b 100644 --- a/eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml +++ b/eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml @@ -153,4 +153,12 @@ extends: 
onefuzzDropDirectory: $(fuzzerProject)/deployment/UTF8Fuzzer SYSTEM_ACCESSTOKEN: $(System.AccessToken) displayName: Send UTF8Fuzzer to OneFuzz + + - task: onefuzz-task@0 + inputs: + onefuzzOSes: 'Windows' + env: + onefuzzDropDirectory: $(fuzzerProject)/deployment/Utf8JsonWriterFuzzer + SYSTEM_ACCESSTOKEN: $(System.AccessToken) + displayName: Send Utf8JsonWriterFuzzer to OneFuzz # ONEFUZZ_TASK_WORKAROUND_END diff --git a/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj b/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj index f538468d180f19..fcc85a77545553 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj +++ b/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj @@ -29,7 +29,9 @@ + + @@ -42,6 +44,10 @@ + + + + diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs new file mode 100644 index 00000000000000..2ee39eb93b33c7 --- /dev/null +++ b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs @@ -0,0 +1,146 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Text.Encodings.Web; +using System.Text.Json; +using System.Text.Unicode; +using SharpFuzz; + +namespace DotnetFuzzing.Fuzzers; + +internal sealed class Utf8JsonWriterFuzzer : IFuzzer +{ + public string[] TargetAssemblies { get; } = ["System.Text.Json"]; + + public string[] TargetCoreLibPrefixes => []; + + private const byte IndentFlag = 1; + private const byte EncoderFlag = 1 << 1; + private const byte MaxDepthFlag = 1 << 2; + private const byte NewLineFlag = 1 << 3; + private const byte SkipValidationFlag = 1 << 4; + private const byte EncodingFlag = 1 << 5; + private const byte PoisonFlag = 1 << 5; + + public void FuzzTarget(ReadOnlySpan bytes) + { + const int minLength = 10; // 2 ints, 1 byte, and 1 padding to align chars + if (bytes.Length < minLength) + { + return; + } + + ReadOnlySpan ints = MemoryMarshal.Cast(bytes); + int slice1 = ints[0]; + int slice2 = ints[1]; + byte optionsByte = bytes[8]; + bytes = bytes.Slice(minLength); + ReadOnlySpan chars = MemoryMarshal.Cast(bytes); + + bool utf8 = (optionsByte & EncodingFlag) == 0; + if (!(0 <= slice1 && slice1 <= slice2 && slice2 <= (utf8 ? bytes.Length : chars.Length))) + { + return; + } + + // Set up options based on the first byte + bool indented = (optionsByte & IndentFlag) == 0; + JsonWriterOptions options = new() + { + Encoder = (optionsByte & EncodingFlag) == 0 ? JavaScriptEncoder.Default : JavaScriptEncoder.UnsafeRelaxedJsonEscaping, + Indented = indented, + MaxDepth = (optionsByte & MaxDepthFlag) == 0 ? 1 : 0, + NewLine = (optionsByte & NewLineFlag) == 0 ? 
"\n" : "\r\n", + SkipValidation = (optionsByte & SkipValidationFlag) == 0, + }; + + // Compute the expected result by using the encoder directly and the input + + byte[] buffer = ArrayPool.Shared.Rent(6 * bytes.Length + 2); + int written; + Span expected = utf8 + ? EncodeToUtf8(bytes, buffer, options.Encoder, out written) + : EncodeToUtf8(chars, buffer, options.Encoder, out written); + + for (int i = 1; i <= 3; i++) + { + using PooledBoundedMemory memory = PooledBoundedMemory.Rent(expected.Length, (optionsByte & PoisonFlag) == 0 ? PoisonPagePlacement.After : PoisonPagePlacement.Before); + using MemoryBackedStream stream = new(memory.Memory); + using Utf8JsonWriter writer = new(stream, options); + try + { + int start = 0; + if (utf8) + { + if (i == 3) + { + writer.WriteStringValueSegment(bytes.Slice(start, slice1), false); + start = slice1; + } + + if (i >= 2) + { + writer.WriteStringValueSegment(bytes.Slice(start, slice2 - start), false); + start = slice2; + } + + writer.WriteStringValueSegment(bytes.Slice(start), true); + writer.Flush(); + } + else + { + if (i == 3) + { + writer.WriteStringValueSegment(chars.Slice(0, slice1), false); + start = slice1; + } + + if (i >= 2) + { + writer.WriteStringValueSegment(chars.Slice(start, slice2 - start), false); + start = slice2; + } + + writer.WriteStringValueSegment(chars.Slice(start), true); + writer.Flush(); + } + } + catch (JsonException) { return; } + + ReadOnlySpan actual = memory.Span; + + // Compare the expected and actual results + Assert.SequenceEqual(expected, actual); + } + + ArrayPool.Shared.Return(buffer); + } + + private static Span EncodeToUtf8(ReadOnlySpan bytes, Span destBuffer, JavaScriptEncoder encoder, out int written) + { + destBuffer[0] = (byte)'"'; + encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out written, isFinalBlock: true); + destBuffer[written + 1] = (byte)'"'; + return destBuffer.Slice(0, written + 2); + } + + private static Span EncodeToUtf8(ReadOnlySpan chars, Span destBuffer, 
JavaScriptEncoder encoder, out int written) + { + var utf16buffer = ArrayPool.Shared.Rent(6 * chars.Length + 2); + utf16buffer[0] = '"'; + encoder.Encode(chars, utf16buffer.AsSpan(1), out _, out written, isFinalBlock: true); + utf16buffer[written + 1] = '"'; + + Utf8.FromUtf16(utf16buffer.AsSpan(0, written + 2), destBuffer, out _, out written, isFinalBlock: true); + ArrayPool.Shared.Return(utf16buffer); + return destBuffer[0..written]; + } +} diff --git a/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs b/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs new file mode 100644 index 00000000000000..4ab0504d488bb3 --- /dev/null +++ b/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs @@ -0,0 +1,137 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; + +namespace DotnetFuzzing; + +public class MemoryBackedStream : Stream +{ + private Memory _memory; + private bool _writable; + private bool _disposed; + private int _position; + + public MemoryBackedStream(Memory memory, bool writable = true) + { + _memory = memory; + _writable = writable; + } + + public override bool CanRead => _disposed; + + public override bool CanSeek => _disposed; + + public override bool CanWrite => _writable; + + public override long Length + { + get + { + EnsureNotClosed(); + return _memory.Length; + } + } + + public override long Position + { + get + { + EnsureNotClosed(); + return _position; + } + set + { + ArgumentOutOfRangeException.ThrowIfGreaterThan((ulong)value, (ulong)int.MaxValue, nameof(value)); + EnsureNotClosed(); + _position = (int)value; + } + } + + public override void Flush() { } + + public override int Read(byte[] buffer, int offset, int count) + { + ValidateBufferArguments(buffer, 
offset, count); + EnsureNotClosed(); + + int n = _memory.Length - _position; + if (n > count) + n = count; + if (n <= 0) + return 0; + + _memory.CopyTo(buffer.AsMemory(offset, count)); + return n; + } + + public override long Seek(long offset, SeekOrigin origin) + { + EnsureNotClosed(); + return SeekCore(offset, origin switch + { + SeekOrigin.Begin => 0, + SeekOrigin.Current => _position, + SeekOrigin.End => _memory.Length, + _ => throw new ArgumentException(nameof(origin)) + }); + } + + private long SeekCore(long offset, int loc) + { + ArgumentOutOfRangeException.ThrowIfGreaterThan(offset, int.MaxValue - loc); + int tempPosition = unchecked(loc + (int)offset); + if (unchecked(loc + offset) < 0 || tempPosition < 0) + throw new IOException("Seek before begin."); + _position = tempPosition; + + Debug.Assert(_position >= 0); + return _position; + } + + public override void SetLength(long value) => throw new NotSupportedException("Currently stream expansion is not supported."); + + public override void Write(byte[] buffer, int offset, int count) + { + ValidateBufferArguments(buffer, offset, count); + EnsureNotClosed(); + EnsureWriteable(); + + int i = _position + count; + // Check for overflow + if (i < 0) + throw new IOException("Stream too long."); + + if (i > _memory.Length) + throw new NotSupportedException("Currently stream expansion is not supported."); + + buffer.AsMemory(offset, count).CopyTo(_memory); + } + + protected override void Dispose(bool disposing) + { + if (!_disposed) + { + _disposed = true; + _memory = Memory.Empty; + _writable = false; + } + } + + private void EnsureNotClosed() + { + if (_disposed) + throw new ObjectDisposedException(nameof(MemoryBackedStream)); + } + + private void EnsureWriteable() + { + if (!_writable) + throw new ObjectDisposedException(nameof(MemoryBackedStream)); + } +} diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs 
b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs index 64f48b7ca3ae3d..cea62ff9fe128e 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -79,20 +79,22 @@ public static IEnumerable InvalidUtf8Data() // 3-byte sequence containing < 3 bytes [0b1110_1111], - // For some reason an invalid 3-byte code point is only replaced - // by one replacement character unlike in the 4-byte case [0b1110_1111, 0b10_111111], // 3-byte overlong [0b1110_0000, 0b10_000000, 0b10_000000], // 4-byte sequence containing < 4 bytes - [0b11110_111], - [0b11110_111, 0b10_111111], - [0b11110_111, 0b10_111111, 0b10_111111], + [0b11110_100], + [0b11110_100, 0b10_001111], + [0b11110_100, 0b10_001111, 0b10_111111], // 4-byte overlong [0b11110_000, 0b10_000000, 0b10_000000, 0b10_000000], + + // Greater than Unicode max value + [0b11110_111, 0b10_000000], + [0b11110_100, 0b10_100000, 0b10_000000], ]; // Separate each case with a character @@ -890,6 +892,80 @@ public static void WriteStringValueSegment_MixEncoding() } } + [Fact] + public static void WriteStringValueSegment_Empty() + { + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + 
jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + + { + var output = new ArrayBufferWriter(); + using var jsonUtf8 = new Utf8JsonWriter(output); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); + jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, true); + jsonUtf8.Flush(); + JsonTestHelper.AssertContents("\"\"", output); + } + } + // Switch this to use an enum discriminator input when base64 is supported private static void WriteStringValueHelper(Utf8JsonWriter writer, ReadOnlySpan value) where T : struct From 8482b1c1ef494a46970382d8bf3cbeec113ed3dd Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Thu, 19 Dec 2024 12:08:07 -0800 Subject: [PATCH 15/21] Fix some tests I missed --- .../src/System/ThrowHelper.cs | 5 ----- .../System.Text.Encodings.Web/tests/SR.cs | 
17 +++++++++++++++++ .../System.Text.Encodings.Web.Tests.csproj | 10 +++++++--- .../System.Text.Json/src/System/ThrowHelper.cs | 5 ----- 4 files changed, 24 insertions(+), 13 deletions(-) create mode 100644 src/libraries/System.Text.Encodings.Web/tests/SR.cs diff --git a/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs b/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs index a975436d4e800d..0c3bc8378e5d56 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/ThrowHelper.cs @@ -46,11 +46,6 @@ internal static void ThrowArgumentOutOfRangeException(ExceptionArgument argument throw new ArgumentOutOfRangeException(GetArgumentName(argument)); } - private static ArgumentException GetArgumentException(ExceptionResource resource) - { - return new ArgumentException(GetResourceString(resource)); - } - private static ArgumentOutOfRangeException GetArgumentOutOfRangeException(ExceptionArgument argument, ExceptionResource resource) { return new ArgumentOutOfRangeException(GetArgumentName(argument), GetResourceString(resource)); diff --git a/src/libraries/System.Text.Encodings.Web/tests/SR.cs b/src/libraries/System.Text.Encodings.Web/tests/SR.cs new file mode 100644 index 00000000000000..bfdbe18e4ba9d9 --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/tests/SR.cs @@ -0,0 +1,17 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System +{ + internal static partial class SR + { + /// Index was out of range. Must be non-negative and less than the size of the collection. + internal static string @ArgumentOutOfRange_IndexMustBeLess => @"Index was out of range. Must be non-negative and less than the size of the collection."; + /// Cannot extract a Unicode scalar value from the specified index in the input. 
+ internal static string @Argument_CannotExtractScalar => @"Cannot extract a Unicode scalar value from the specified index in the input."; + /// Destination is too short. + internal static string @Argument_DestinationTooShort => @"Destination is too short."; + /// Object must be of type Rune. + internal static string @Arg_MustBeRune => @"Object must be of type Rune."; + } +} diff --git a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj index 80ee57a4d5013b..59ebeba971090e 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj +++ b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj @@ -3,6 +3,8 @@ true $(NetCoreAppCurrent);$(NetFrameworkMinimum) 15.0 + + $(NoWarn);CS3021 @@ -19,6 +21,7 @@ + @@ -39,9 +42,10 @@ - - - + + + + diff --git a/src/libraries/System.Text.Json/src/System/ThrowHelper.cs b/src/libraries/System.Text.Json/src/System/ThrowHelper.cs index a975436d4e800d..0c3bc8378e5d56 100644 --- a/src/libraries/System.Text.Json/src/System/ThrowHelper.cs +++ b/src/libraries/System.Text.Json/src/System/ThrowHelper.cs @@ -46,11 +46,6 @@ internal static void ThrowArgumentOutOfRangeException(ExceptionArgument argument throw new ArgumentOutOfRangeException(GetArgumentName(argument)); } - private static ArgumentException GetArgumentException(ExceptionResource resource) - { - return new ArgumentException(GetResourceString(resource)); - } - private static ArgumentOutOfRangeException GetArgumentOutOfRangeException(ExceptionArgument argument, ExceptionResource resource) { return new ArgumentOutOfRangeException(GetArgumentName(argument), GetResourceString(resource)); From d50bbcae3eafb96b2ca3840cfe65a2c09a9179ee Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Thu, 19 Dec 2024 13:06:28 -0800 Subject: [PATCH 16/21] clean up and add another test to fuzzer --- 
.../Fuzzers/Utf8JsonWriterFuzzer.cs | 112 +++++++++++------- .../DotnetFuzzing/MemoryBackedStream.cs | 5 + 2 files changed, 76 insertions(+), 41 deletions(-) diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs index 2ee39eb93b33c7..402eb6ead31271 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs @@ -3,6 +3,7 @@ using System; using System.Buffers; +using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.IO; @@ -22,13 +23,21 @@ internal sealed class Utf8JsonWriterFuzzer : IFuzzer public string[] TargetCoreLibPrefixes => []; + // One of the bytes in the input is used to set various test options. + // Each bit in that byte represents a different option as indicated here. + + // Options for JsonWriterOptions private const byte IndentFlag = 1; private const byte EncoderFlag = 1 << 1; private const byte MaxDepthFlag = 1 << 2; private const byte NewLineFlag = 1 << 3; private const byte SkipValidationFlag = 1 << 4; + + // Options for choosing between UTF-8 and UTF-16 encoding private const byte EncodingFlag = 1 << 5; - private const byte PoisonFlag = 1 << 5; + + // Options for choosing whether to poison previous or next page + private const byte PoisonFlag = 1 << 6; public void FuzzTarget(ReadOnlySpan bytes) { @@ -38,6 +47,7 @@ public void FuzzTarget(ReadOnlySpan bytes) return; } + // First 2 ints are used as indices to slice the input and the following byte is used for options ReadOnlySpan ints = MemoryMarshal.Cast(bytes); int slice1 = ints[0]; int slice2 = ints[1]; @@ -45,6 +55,7 @@ public void FuzzTarget(ReadOnlySpan bytes) bytes = bytes.Slice(minLength); ReadOnlySpan chars = MemoryMarshal.Cast(bytes); + // Validate that the indices are within bounds of the input bool utf8 = (optionsByte & EncodingFlag) == 0; if (!(0 <= slice1 
&& slice1 <= slice2 && slice2 <= (utf8 ? bytes.Length : chars.Length))) { @@ -63,57 +74,36 @@ public void FuzzTarget(ReadOnlySpan bytes) }; // Compute the expected result by using the encoder directly and the input - - byte[] buffer = ArrayPool.Shared.Rent(6 * bytes.Length + 2); + int maxExpandedSizeBytes = 6 * bytes.Length + 2; + byte[] buffer = ArrayPool.Shared.Rent(maxExpandedSizeBytes); int written; Span expected = utf8 ? EncodeToUtf8(bytes, buffer, options.Encoder, out written) : EncodeToUtf8(chars, buffer, options.Encoder, out written); - for (int i = 1; i <= 3; i++) + // Compute the actual result by using Utf8JsonWriter. Each iteration is a different slice of the input, but the result should be the same. + foreach (ReadOnlySpan ranges in new[] + { + new[] { 0.. }, + new[] { 0..slice1, slice1.. }, + new[] { 0..slice1, slice1..slice2, slice2.. }, + }) { + // Use a stream backed by bounded memory to detect out-of-bounds accesses using PooledBoundedMemory memory = PooledBoundedMemory.Rent(expected.Length, (optionsByte & PoisonFlag) == 0 ? 
PoisonPagePlacement.After : PoisonPagePlacement.Before); using MemoryBackedStream stream = new(memory.Memory); using Utf8JsonWriter writer = new(stream, options); - try + + if (utf8) { - int start = 0; - if (utf8) - { - if (i == 3) - { - writer.WriteStringValueSegment(bytes.Slice(start, slice1), false); - start = slice1; - } - - if (i >= 2) - { - writer.WriteStringValueSegment(bytes.Slice(start, slice2 - start), false); - start = slice2; - } - - writer.WriteStringValueSegment(bytes.Slice(start), true); - writer.Flush(); - } - else - { - if (i == 3) - { - writer.WriteStringValueSegment(chars.Slice(0, slice1), false); - start = slice1; - } - - if (i >= 2) - { - writer.WriteStringValueSegment(chars.Slice(start, slice2 - start), false); - start = slice2; - } - - writer.WriteStringValueSegment(chars.Slice(start), true); - writer.Flush(); - } + WriteStringValueSegments(writer, bytes, ranges); + writer.Flush(); + } + else + { + WriteStringValueSegments(writer, chars, ranges); + writer.Flush(); } - catch (JsonException) { return; } ReadOnlySpan actual = memory.Span; @@ -121,9 +111,49 @@ public void FuzzTarget(ReadOnlySpan bytes) Assert.SequenceEqual(expected, actual); } + // Additional test for mixing UTF-8 and UTF-16 encoding. The alignment math is easier in UTF-16 mode so just run it for that. 
+ if (!utf8) + { + { + using PooledBoundedMemory memory = PooledBoundedMemory.Rent(maxExpandedSizeBytes, PoisonPagePlacement.Before); + using MemoryBackedStream stream = new(memory.Memory); + using Utf8JsonWriter writer = new(stream, options); + + writer.WriteStringValueSegment(chars[0..slice1], false); + writer.WriteStringValueSegment(bytes[(2 * slice1)..], true); + writer.Flush(); + } + + { + using PooledBoundedMemory memory = PooledBoundedMemory.Rent(maxExpandedSizeBytes, PoisonPagePlacement.Before); + using MemoryBackedStream stream = new(memory.Memory); + using Utf8JsonWriter writer = new(stream, options); + + writer.WriteStringValueSegment(bytes[0..(2 * slice1)], false); + writer.WriteStringValueSegment(chars[slice1..], true); + writer.Flush(); + } + } + ArrayPool.Shared.Return(buffer); } + private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan bytes, ReadOnlySpan ranges) + { + for (int i = 0; i < ranges.Length; i++) + { + writer.WriteStringValueSegment(bytes[ranges[i]], i == ranges.Length - 1); + } + } + + private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan chars, ReadOnlySpan ranges) + { + for (int i = 0; i < ranges.Length; i++) + { + writer.WriteStringValueSegment(chars[ranges[i]], i == ranges.Length - 1); + } + } + private static Span EncodeToUtf8(ReadOnlySpan bytes, Span destBuffer, JavaScriptEncoder encoder, out int written) { destBuffer[0] = (byte)'"'; diff --git a/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs b/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs index 4ab0504d488bb3..1a9c24d7733ae1 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs @@ -10,6 +10,11 @@ namespace DotnetFuzzing; +/// +/// A stream implementation that is backed by instead of a byte array. +/// This is particularly useful in tests where we need a stream but we also want to detect +/// out-of-bounds accesses with . 
+/// public class MemoryBackedStream : Stream { private Memory _memory; From 55827d9db85c103d43acf611b75b51c29317fc53 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Fri, 20 Dec 2024 10:55:33 -0800 Subject: [PATCH 17/21] comment typo Co-authored-by: Eirik Tsarpalis --- .../Utf8JsonWriterTests.Values.StringSegment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs index cea62ff9fe128e..5582aae7501d9f 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -820,7 +820,7 @@ public static void WriteStringValueSegment_MixEncoding() var output = new ArrayBufferWriter(); using var jsonUtf8 = new Utf8JsonWriter(output); - // Becuase the first code point is a surrogate pair, it will be + // Because the first code point is a surrogate pair, it will be // saved until the next write to complete it. 
It is saved in the // original encoding, UTF-16, so it will be 0b1101_1000 0b1101_1000 jsonUtf8.WriteStringValueSegment("\uD8D8".AsSpan(), false); From a5cd8554349d6dd4507b9375f6c9ec16ef791931 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Fri, 20 Dec 2024 12:48:13 -0800 Subject: [PATCH 18/21] pr comments --- .../DotnetFuzzing/DotnetFuzzing.csproj | 7 +- .../Fuzzers/Utf8JsonWriterFuzzer.cs | 117 ++++++++++----- .../DotnetFuzzing/MemoryBackedStream.cs | 142 ------------------ .../Utf8JsonWriter.WriteValues.Helpers.cs | 54 +++---- .../System/Text/Json/Writer/Utf8JsonWriter.cs | 19 ++- 5 files changed, 121 insertions(+), 218 deletions(-) delete mode 100644 src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs diff --git a/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj b/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj index fcc85a77545553..a392983c364e9f 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj +++ b/src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj @@ -1,4 +1,4 @@ - + Exe @@ -31,7 +31,6 @@ - @@ -44,10 +43,6 @@ - - - - diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs index 402eb6ead31271..afecd70d7afd8d 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs @@ -36,9 +36,6 @@ internal sealed class Utf8JsonWriterFuzzer : IFuzzer // Options for choosing between UTF-8 and UTF-16 encoding private const byte EncodingFlag = 1 << 5; - // Options for choosing whether to poison previous or next page - private const byte PoisonFlag = 1 << 6; - public void FuzzTarget(ReadOnlySpan bytes) { const int minLength = 10; // 2 ints, 1 byte, and 1 padding to align chars @@ -75,13 +72,14 @@ public void FuzzTarget(ReadOnlySpan bytes) // Compute the expected result by using the encoder directly and the input int 
maxExpandedSizeBytes = 6 * bytes.Length + 2; - byte[] buffer = ArrayPool.Shared.Rent(maxExpandedSizeBytes); - int written; - Span expected = utf8 - ? EncodeToUtf8(bytes, buffer, options.Encoder, out written) - : EncodeToUtf8(chars, buffer, options.Encoder, out written); + byte[] expectedBuffer = ArrayPool.Shared.Rent(maxExpandedSizeBytes); + Span expected = + expectedBuffer.AsSpan(0, utf8 + ? EncodeToUtf8(bytes, expectedBuffer, options.Encoder) + : EncodeToUtf8(chars, expectedBuffer, options.Encoder)); // Compute the actual result by using Utf8JsonWriter. Each iteration is a different slice of the input, but the result should be the same. + byte[] actualBuffer = new byte[expected.Length]; foreach (ReadOnlySpan ranges in new[] { new[] { 0.. }, @@ -89,53 +87,75 @@ public void FuzzTarget(ReadOnlySpan bytes) new[] { 0..slice1, slice1..slice2, slice2.. }, }) { - // Use a stream backed by bounded memory to detect out-of-bounds accesses - using PooledBoundedMemory memory = PooledBoundedMemory.Rent(expected.Length, (optionsByte & PoisonFlag) == 0 ? PoisonPagePlacement.After : PoisonPagePlacement.Before); - using MemoryBackedStream stream = new(memory.Memory); + using MemoryStream stream = new(actualBuffer); using Utf8JsonWriter writer = new(stream, options); if (utf8) { WriteStringValueSegments(writer, bytes, ranges); - writer.Flush(); } else { WriteStringValueSegments(writer, chars, ranges); - writer.Flush(); } - ReadOnlySpan actual = memory.Span; + writer.Flush(); // Compare the expected and actual results - Assert.SequenceEqual(expected, actual); + Assert.SequenceEqual(expected, actualBuffer); + Assert.Equal(expected.Length, writer.BytesCommitted); + Assert.Equal(0, writer.BytesPending); + + Array.Clear(actualBuffer); } // Additional test for mixing UTF-8 and UTF-16 encoding. The alignment math is easier in UTF-16 mode so just run it for that. 
if (!utf8) { + Array.Clear(expectedBuffer); + { - using PooledBoundedMemory memory = PooledBoundedMemory.Rent(maxExpandedSizeBytes, PoisonPagePlacement.Before); - using MemoryBackedStream stream = new(memory.Memory); + ReadOnlySpan firstSegment = chars[slice1..]; + ReadOnlySpan secondSegment = bytes[0..(2 * slice1)]; + + expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder)); + + actualBuffer = new byte[expected.Length]; + using MemoryStream stream = new(actualBuffer); using Utf8JsonWriter writer = new(stream, options); - writer.WriteStringValueSegment(chars[0..slice1], false); - writer.WriteStringValueSegment(bytes[(2 * slice1)..], true); + writer.WriteStringValueSegment(firstSegment, false); + writer.WriteStringValueSegment(secondSegment, true); writer.Flush(); + + Assert.SequenceEqual(expected, actualBuffer); + Assert.Equal(expected.Length, writer.BytesCommitted); + Assert.Equal(0, writer.BytesPending); } + Array.Clear(expectedBuffer); + { - using PooledBoundedMemory memory = PooledBoundedMemory.Rent(maxExpandedSizeBytes, PoisonPagePlacement.Before); - using MemoryBackedStream stream = new(memory.Memory); + ReadOnlySpan firstSegment = bytes[0..(2 * slice1)]; + ReadOnlySpan secondSegment = chars[slice1..]; + + expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder)); + + actualBuffer = new byte[expected.Length]; + using MemoryStream stream = new(actualBuffer); using Utf8JsonWriter writer = new(stream, options); - writer.WriteStringValueSegment(bytes[0..(2 * slice1)], false); - writer.WriteStringValueSegment(chars[slice1..], true); + writer.WriteStringValueSegment(firstSegment, false); + writer.WriteStringValueSegment(secondSegment, true); writer.Flush(); + + Assert.SequenceEqual(expected, actualBuffer); + Assert.Equal(expected.Length, writer.BytesCommitted); + Assert.Equal(0, writer.BytesPending); } } - ArrayPool.Shared.Return(buffer); + 
ArrayPool.Shared.Return(expectedBuffer); } private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan bytes, ReadOnlySpan ranges) @@ -154,23 +174,50 @@ private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan } } - private static Span EncodeToUtf8(ReadOnlySpan bytes, Span destBuffer, JavaScriptEncoder encoder, out int written) + private static int EncodeToUtf8(ReadOnlySpan bytes, Span destBuffer, JavaScriptEncoder encoder) + { + destBuffer[0] = (byte)'"'; + encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out int written, isFinalBlock: true); + destBuffer[++written] = (byte)'"'; + return written + 1; + } + + private static int EncodeToUtf8(ReadOnlySpan chars, Span destBuffer, JavaScriptEncoder encoder) + { + int written = 1; + destBuffer[0] = (byte)'"'; + destBuffer[written += EncodeTranscode(chars, destBuffer[1..], encoder)] = (byte)'"'; + return written + 1; + } + + private static int EncodeToUtf8(ReadOnlySpan bytes, ReadOnlySpan chars, Span destBuffer, JavaScriptEncoder encoder) + { + int written = 1; + destBuffer[0] = (byte)'"'; + encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out int writtenTemp, isFinalBlock: true); + written += writtenTemp; + destBuffer[written += EncodeTranscode(chars, destBuffer[written..], encoder, isFinalBlock: true)] = (byte)'"'; + return written + 1; + } + + private static int EncodeToUtf8(ReadOnlySpan chars, ReadOnlySpan bytes, Span destBuffer, JavaScriptEncoder encoder) { + int written = 1; destBuffer[0] = (byte)'"'; - encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out written, isFinalBlock: true); - destBuffer[written + 1] = (byte)'"'; - return destBuffer.Slice(0, written + 2); + written += EncodeTranscode(chars, destBuffer[1..], encoder, isFinalBlock: true); + encoder.EncodeUtf8(bytes, destBuffer[written..], out _, out int writtenTemp, isFinalBlock: true); + written += writtenTemp; + destBuffer[written] = (byte)'"'; + return written + 1; } - private static Span 
EncodeToUtf8(ReadOnlySpan chars, Span destBuffer, JavaScriptEncoder encoder, out int written) + private static int EncodeTranscode(ReadOnlySpan chars, Span destBuffer, JavaScriptEncoder encoder, bool isFinalBlock = true) { - var utf16buffer = ArrayPool.Shared.Rent(6 * chars.Length + 2); - utf16buffer[0] = '"'; - encoder.Encode(chars, utf16buffer.AsSpan(1), out _, out written, isFinalBlock: true); - utf16buffer[written + 1] = '"'; + var utf16buffer = ArrayPool.Shared.Rent(6 * chars.Length); + encoder.Encode(chars, utf16buffer, out _, out int written, isFinalBlock: true); - Utf8.FromUtf16(utf16buffer.AsSpan(0, written + 2), destBuffer, out _, out written, isFinalBlock: true); + Utf8.FromUtf16(utf16buffer.AsSpan(0, written), destBuffer, out _, out written, isFinalBlock); ArrayPool.Shared.Return(utf16buffer); - return destBuffer[0..written]; + return written; } } diff --git a/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs b/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs deleted file mode 100644 index 1a9c24d7733ae1..00000000000000 --- a/src/libraries/Fuzzing/DotnetFuzzing/MemoryBackedStream.cs +++ /dev/null @@ -1,142 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Buffers; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Threading; -using System.Threading.Tasks; - -namespace DotnetFuzzing; - -/// -/// A stream implementation that is backed by instead of a byte array. -/// This is particularly useful in tests where we need a stream but we also want to detect -/// out-of-bounds accesses with . 
-/// -public class MemoryBackedStream : Stream -{ - private Memory _memory; - private bool _writable; - private bool _disposed; - private int _position; - - public MemoryBackedStream(Memory memory, bool writable = true) - { - _memory = memory; - _writable = writable; - } - - public override bool CanRead => _disposed; - - public override bool CanSeek => _disposed; - - public override bool CanWrite => _writable; - - public override long Length - { - get - { - EnsureNotClosed(); - return _memory.Length; - } - } - - public override long Position - { - get - { - EnsureNotClosed(); - return _position; - } - set - { - ArgumentOutOfRangeException.ThrowIfGreaterThan((ulong)value, (ulong)int.MaxValue, nameof(value)); - EnsureNotClosed(); - _position = (int)value; - } - } - - public override void Flush() { } - - public override int Read(byte[] buffer, int offset, int count) - { - ValidateBufferArguments(buffer, offset, count); - EnsureNotClosed(); - - int n = _memory.Length - _position; - if (n > count) - n = count; - if (n <= 0) - return 0; - - _memory.CopyTo(buffer.AsMemory(offset, count)); - return n; - } - - public override long Seek(long offset, SeekOrigin origin) - { - EnsureNotClosed(); - return SeekCore(offset, origin switch - { - SeekOrigin.Begin => 0, - SeekOrigin.Current => _position, - SeekOrigin.End => _memory.Length, - _ => throw new ArgumentException(nameof(origin)) - }); - } - - private long SeekCore(long offset, int loc) - { - ArgumentOutOfRangeException.ThrowIfGreaterThan(offset, int.MaxValue - loc); - int tempPosition = unchecked(loc + (int)offset); - if (unchecked(loc + offset) < 0 || tempPosition < 0) - throw new IOException("Seek before begin."); - _position = tempPosition; - - Debug.Assert(_position >= 0); - return _position; - } - - public override void SetLength(long value) => throw new NotSupportedException("Currently stream expansion is not supported."); - - public override void Write(byte[] buffer, int offset, int count) - { - 
ValidateBufferArguments(buffer, offset, count); - EnsureNotClosed(); - EnsureWriteable(); - - int i = _position + count; - // Check for overflow - if (i < 0) - throw new IOException("Stream too long."); - - if (i > _memory.Length) - throw new NotSupportedException("Currently stream expansion is not supported."); - - buffer.AsMemory(offset, count).CopyTo(_memory); - } - - protected override void Dispose(bool disposing) - { - if (!_disposed) - { - _disposed = true; - _memory = Memory.Empty; - _writable = false; - } - } - - private void EnsureNotClosed() - { - if (_disposed) - throw new ObjectDisposedException(nameof(MemoryBackedStream)); - } - - private void EnsureWriteable() - { - if (!_writable) - throw new ObjectDisposedException(nameof(MemoryBackedStream)); - } -} diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index 916e564a9541af..ff927a0cc9a8a8 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -11,24 +11,24 @@ namespace System.Text.Json { public sealed partial class Utf8JsonWriter { - private const byte LengthMask = 0b000_000_11; - private const byte EncodingMask = 0b000_111_00; + private const byte PartialCodePointLengthMask = 0b000_000_11; + private const byte PartialCodePointEncodingMask = 0b000_111_00; - private const byte Utf8EncodingFlag = 0b000_001_00; - private const byte Utf16EncodingFlag = 0b000_010_00; + private const byte PartialCodePointUtf8EncodingFlag = 0b000_001_00; + private const byte PartialCodePointUtf16EncodingFlag = 0b000_010_00; private bool TryGetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) { - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 4); - - 
if ((partialCodePointBytes[3] & Utf8EncodingFlag) == 0) + if ((_partialCodePointFlags & PartialCodePointUtf8EncodingFlag) == 0) { - codePointBytes = ReadOnlySpan.Empty; + codePointBytes = []; return false; } - int length = partialCodePointBytes[3] & LengthMask; + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 3); + + int length = _partialCodePointFlags & PartialCodePointLengthMask; Debug.Assert((uint)length < 4); codePointBytes = partialCodePointBytes.Slice(0, length); @@ -37,17 +37,17 @@ private bool TryGetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) private bool TryGetPartialUtf16CodePoint(out ReadOnlySpan codePointChars) { - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 4); - - if ((partialCodePointBytes[3] & Utf16EncodingFlag) == 0) + if ((_partialCodePointFlags & PartialCodePointUtf16EncodingFlag) == 0) { - codePointChars = ReadOnlySpan.Empty; + codePointChars = []; return false; } - int length = partialCodePointBytes[3] & LengthMask; - Debug.Assert(length == 2 || length == 0); + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 3); + + int length = _partialCodePointFlags & PartialCodePointLengthMask; + Debug.Assert(length is 2 or 0); codePointChars = MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); return true; @@ -60,7 +60,7 @@ private void SetPartialUtf8CodePoint(ReadOnlySpan bytes) Span partialCodePointBytes = PartialCodePointRaw; bytes.CopyTo(partialCodePointBytes); - partialCodePointBytes[3] = (byte)(bytes.Length | Utf8EncodingFlag); + _partialCodePointFlags = (byte)(bytes.Length | PartialCodePointUtf8EncodingFlag); } private void SetPartialUtf16CodePoint(ReadOnlySpan bytes) @@ -70,28 +70,28 @@ private void SetPartialUtf16CodePoint(ReadOnlySpan bytes) Span partialCodePointBytes = PartialCodePointRaw; bytes.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); - 
partialCodePointBytes[3] = (byte)((2 * bytes.Length) | Utf16EncodingFlag); + _partialCodePointFlags = (byte)((2 * bytes.Length) | PartialCodePointUtf16EncodingFlag); } - private bool HasPartialCodePoint => (PartialCodePointRaw[3] & LengthMask) != 0; + private bool HasPartialCodePoint => (_partialCodePointFlags & PartialCodePointLengthMask) != 0; - private void ClearPartialCodePoint() => PartialCodePointRaw[3] = 0; + private void ClearPartialCodePoint() => _partialCodePointFlags = 0; private void WriteInvalidPartialCodePoint() { ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 4); + Debug.Assert(partialCodePointBytes.Length == 3); - int length = partialCodePointBytes[3] & LengthMask; + int length = _partialCodePointFlags & PartialCodePointLengthMask; - switch (partialCodePointBytes[3] & EncodingMask) + switch (_partialCodePointFlags & PartialCodePointEncodingMask) { - case Utf8EncodingFlag: + case PartialCodePointUtf8EncodingFlag: Debug.Assert((uint)length < 4); WriteStringSegmentEscape(partialCodePointBytes.Slice(0, length), true); break; - case Utf16EncodingFlag: - Debug.Assert(length == 0 || length == 2); + case PartialCodePointUtf16EncodingFlag: + Debug.Assert(length is 0 or 2); WriteStringSegmentEscape(MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)), true); break; default: diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index b301abda2e88fd..db207eba8e6b3e 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -53,25 +53,28 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable private BitStack _bitStack; /// - /// This 4-byte array stores the partial code point leftover when writing a string value - /// segment that is split across 
multiple write calls. The first 3 bytes provide space - /// to store the leftover bytes using the source encoding and the last byte is the number - /// of bytes used to store the partial code point. + /// This 3-byte array stores the partial code point leftover when writing a string value + /// segment that is split across multiple write calls. /// #if !NET private byte[]? _partialCodePoint; - private Span PartialCodePointRaw => _partialCodePoint ??= new byte[4]; + private Span PartialCodePointRaw => _partialCodePoint ??= new byte[3]; #else - private Inline4ByteArray _partialCodePoint; + private Inline3ByteArray _partialCodePoint; private Span PartialCodePointRaw => _partialCodePoint; - [InlineArray(4)] - private struct Inline4ByteArray + [InlineArray(3)] + private struct Inline3ByteArray { public byte byte0; } #endif + /// + /// Stores the length and encoding of the partial code point. + /// + private byte _partialCodePointFlags; + // The highest order bit of _currentDepth is used to discern whether we are writing the first item in a list or not. // if (_currentDepth >> 31) == 1, add a list separator before writing the item // else, no list separator is needed since we are writing the first item. 
From b7fd4a5be5572e86cbdebd49a2e9354f3854dbc3 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 23 Dec 2024 16:29:25 -0800 Subject: [PATCH 19/21] throw when encodings are mixed --- .../src/Resources/Strings.resx | 3 + .../src/System.Text.Json.csproj | 1 - .../src/System/Text/Json/ThrowHelper.cs | 6 + .../Text/Json/Writer/SequenceValidity.cs | 54 ---- .../Utf8JsonWriter.WriteValues.Helpers.cs | 93 +------ ...tf8JsonWriter.WriteValues.StringSegment.cs | 242 ++++++++++-------- .../System/Text/Json/Writer/Utf8JsonWriter.cs | 106 +++++++- ...tf8JsonWriterTests.Values.StringSegment.cs | 37 +-- 8 files changed, 253 insertions(+), 289 deletions(-) delete mode 100644 src/libraries/System.Text.Json/src/System/Text/Json/Writer/SequenceValidity.cs diff --git a/src/libraries/System.Text.Json/src/Resources/Strings.resx b/src/libraries/System.Text.Json/src/Resources/Strings.resx index 1d0f1005a69859..39325a7e3f70e3 100644 --- a/src/libraries/System.Text.Json/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Json/src/Resources/Strings.resx @@ -824,4 +824,7 @@ Cannot extract a Unicode scalar value from the specified index in the input. + + Cannot mix encodings between string value segments. The previous segment's encoding was '{0}' and the current segment's encoding is '{1}'. 
+ \ No newline at end of file diff --git a/src/libraries/System.Text.Json/src/System.Text.Json.csproj b/src/libraries/System.Text.Json/src/System.Text.Json.csproj index e8c9a953577c24..b7130feea79ea8 100644 --- a/src/libraries/System.Text.Json/src/System.Text.Json.csproj +++ b/src/libraries/System.Text.Json/src/System.Text.Json.csproj @@ -309,7 +309,6 @@ The System.Text.Json library is built-in as part of the shared framework in .NET - diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs index df76deacf835b9..2bc50fcfeb4d39 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/ThrowHelper.cs @@ -311,6 +311,12 @@ public static void ThrowInvalidOperationException_CannotSkipOnPartial() throw GetInvalidOperationException(SR.CannotSkip); } + [DoesNotReturn] + public static void ThrowInvalidOperationException_CannotMixEncodings(Utf8JsonWriter.SegmentEncoding previousEncoding, Utf8JsonWriter.SegmentEncoding currentEncoding) + { + throw GetInvalidOperationException(SR.Format(SR.CannotMixEncodings, previousEncoding, currentEncoding)); + } + private static InvalidOperationException GetInvalidOperationException(string message, JsonTokenType tokenType) { return GetInvalidOperationException(SR.Format(SR.InvalidCast, tokenType, message)); diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/SequenceValidity.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/SequenceValidity.cs deleted file mode 100644 index 6d7ec2ce08e397..00000000000000 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/SequenceValidity.cs +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -namespace System.Buffers.Text -{ - /// - /// Represents the validity of a UTF code unit sequence. - /// - internal enum SequenceValidity - { - /// - /// The sequence is empty. - /// - Empty = 0, - - /// - /// The sequence is well-formed and unambiguously represents a proper Unicode scalar value. - /// - /// - /// [ 20 ] (U+0020 SPACE) is a well-formed UTF-8 sequence. - /// [ C3 A9 ] (U+00E9 LATIN SMALL LETTER E WITH ACUTE) is a well-formed UTF-8 sequence. - /// [ F0 9F 98 80 ] (U+1F600 GRINNING FACE) is a well-formed UTF-8 sequence. - /// [ D83D DE00 ] (U+1F600 GRINNING FACE) is a well-formed UTF-16 sequence. - /// - WellFormed = 1, - - /// - /// The sequence is not well-formed on its own, but it could appear as a prefix - /// of a longer well-formed sequence. More code units are needed to make a proper - /// determination as to whether this sequence is well-formed. Incomplete sequences - /// can only appear at the end of a string. - /// - /// - /// [ C2 ] is an incomplete UTF-8 sequence if it is followed by nothing. - /// [ F0 9F ] is an incomplete UTF-8 sequence if it is followed by nothing. - /// [ D83D ] is an incomplete UTF-16 sequence if it is followed by nothing. - /// - Incomplete = 2, - - /// - /// The sequence is never well-formed anywhere, or this sequence can never appear as a prefix - /// of a longer well-formed sequence, or the sequence was improperly terminated by the code - /// unit which appeared immediately after this sequence. - /// - /// - /// [ 80 ] is an invalid UTF-8 sequence (code unit cannot appear at start of sequence). - /// [ FE ] is an invalid UTF-8 sequence (sequence is never well-formed anywhere in UTF-8 string). - /// [ C2 ] is an invalid UTF-8 sequence if it is followed by [ 20 ] (sequence improperly terminated). - /// [ ED A0 ] is an invalid UTF-8 sequence (sequence is never well-formed anywhere in UTF-8 string). - /// [ DE00 ] is an invalid UTF-16 sequence (code unit cannot appear at start of sequence). 
- /// - Invalid = 3 - } -} diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs index ff927a0cc9a8a8..a8440144d4cf88 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Helpers.cs @@ -4,6 +4,7 @@ using System.Buffers; using System.Buffers.Text; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -11,103 +12,27 @@ namespace System.Text.Json { public sealed partial class Utf8JsonWriter { - private const byte PartialCodePointLengthMask = 0b000_000_11; - private const byte PartialCodePointEncodingMask = 0b000_111_00; + private bool HasPartialCodePoint => PartialCodePointLength != 0; - private const byte PartialCodePointUtf8EncodingFlag = 0b000_001_00; - private const byte PartialCodePointUtf16EncodingFlag = 0b000_010_00; + private void ClearPartialCodePoint() => PartialCodePointLength = 0; - private bool TryGetPartialUtf8CodePoint(out ReadOnlySpan codePointBytes) + private void ValidateEncodingDidNotChange(SegmentEncoding currentSegmentEncoding) { - if ((_partialCodePointFlags & PartialCodePointUtf8EncodingFlag) == 0) + if (PreviousSegmentEncoding != currentSegmentEncoding) { - codePointBytes = []; - return false; - } - - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 3); - - int length = _partialCodePointFlags & PartialCodePointLengthMask; - Debug.Assert((uint)length < 4); - - codePointBytes = partialCodePointBytes.Slice(0, length); - return true; - } - - private bool TryGetPartialUtf16CodePoint(out ReadOnlySpan codePointChars) - { - if ((_partialCodePointFlags & PartialCodePointUtf16EncodingFlag) == 0) - { - 
codePointChars = []; - return false; - } - - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 3); - - int length = _partialCodePointFlags & PartialCodePointLengthMask; - Debug.Assert(length is 2 or 0); - - codePointChars = MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); - return true; - } - - private void SetPartialUtf8CodePoint(ReadOnlySpan bytes) - { - Debug.Assert(bytes.Length <= 3); - - Span partialCodePointBytes = PartialCodePointRaw; - - bytes.CopyTo(partialCodePointBytes); - _partialCodePointFlags = (byte)(bytes.Length | PartialCodePointUtf8EncodingFlag); - } - - private void SetPartialUtf16CodePoint(ReadOnlySpan bytes) - { - Debug.Assert(bytes.Length <= 1); - - Span partialCodePointBytes = PartialCodePointRaw; - - bytes.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); - _partialCodePointFlags = (byte)((2 * bytes.Length) | PartialCodePointUtf16EncodingFlag); - } - - private bool HasPartialCodePoint => (_partialCodePointFlags & PartialCodePointLengthMask) != 0; - - private void ClearPartialCodePoint() => _partialCodePointFlags = 0; - - private void WriteInvalidPartialCodePoint() - { - ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; - Debug.Assert(partialCodePointBytes.Length == 3); - - int length = _partialCodePointFlags & PartialCodePointLengthMask; - - switch (_partialCodePointFlags & PartialCodePointEncodingMask) - { - case PartialCodePointUtf8EncodingFlag: - Debug.Assert((uint)length < 4); - WriteStringSegmentEscape(partialCodePointBytes.Slice(0, length), true); - break; - case PartialCodePointUtf16EncodingFlag: - Debug.Assert(length is 0 or 2); - WriteStringSegmentEscape(MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)), true); - break; - default: - Debug.Fail("Encoding not recognized."); - break; + ThrowHelper.ThrowInvalidOperationException_CannotMixEncodings(PreviousSegmentEncoding, currentSegmentEncoding); } } private void ValidateNotWithinUnfinalizedString() { - 
Debug.Assert(!HasPartialCodePoint); - if (_tokenType == StringSegmentSentinel) { ThrowHelper.ThrowInvalidOperationException(ExceptionResource.CannotWriteWithinString, currentDepth: default, maxDepth: _options.MaxDepth, token: default, _tokenType); } + + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.None); + Debug.Assert(!HasPartialCodePoint); } private void ValidateWritingValue() diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs index d1b665f9c5fe0e..08b517cce9648d 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.StringSegment.cs @@ -21,7 +21,8 @@ public sealed partial class Utf8JsonWriter /// Thrown when the specified value is too large. /// /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// Thrown if this would result in invalid JSON being written (while validation is enabled) or + /// if the previously written segment (if any) was not written with this same overload. /// /// /// The value is escaped before writing. 
@@ -30,16 +31,25 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen { JsonWriterHelper.ValidateValue(value); - if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) - { - ValidateWritingValue(); - } - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.None); + Debug.Assert(!HasPartialCodePoint); + + if (!_options.SkipValidation) + { + ValidateWritingValue(); + } + WriteStringSegmentPrologue(); + + PreviousSegmentEncoding = SegmentEncoding.Utf16; _tokenType = Utf8JsonWriter.StringSegmentSentinel; } + else + { + ValidateEncodingDidNotChange(SegmentEncoding.Utf16); + } // The steps to write a string segment are to complete the previous partial code point // and escape either of which might not be required so there is a fast path for each of these steps. @@ -55,7 +65,9 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (isFinalSegment) { WriteStringSegmentEpilogue(); + SetFlagToAddListSeparatorBeforeNextItem(); + PreviousSegmentEncoding = SegmentEncoding.None; _tokenType = JsonTokenType.String; } } @@ -63,55 +75,51 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen private void WriteStringSegmentWithLeftover(scoped ReadOnlySpan value, bool isFinalSegment) { Debug.Assert(HasPartialCodePoint); - - if (TryGetPartialUtf16CodePoint(out scoped ReadOnlySpan partialCodePointBuffer)) - { - Span combinedBuffer = stackalloc char[2]; - combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, value, combinedBuffer)); - - switch (Rune.DecodeFromUtf16(combinedBuffer, out _, out int charsConsumed)) - { - case OperationStatus.NeedMoreData: - Debug.Assert(value.Length + partialCodePointBuffer.Length < 2); - Debug.Assert(charsConsumed == value.Length + partialCodePointBuffer.Length); - // Let the encoder deal with the error if this is a final buffer. 
- value = combinedBuffer.Slice(0, charsConsumed); - partialCodePointBuffer = ReadOnlySpan.Empty; - break; - case OperationStatus.Done: - Debug.Assert(charsConsumed > partialCodePointBuffer.Length); - Debug.Assert(charsConsumed <= 2); - // Divide up the code point chars into its own buffer and the remainder of the input buffer. - value = value.Slice(charsConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); - break; - case OperationStatus.InvalidData: - Debug.Assert(charsConsumed >= partialCodePointBuffer.Length); - Debug.Assert(charsConsumed <= 2); - value = value.Slice(charsConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); - break; - case OperationStatus.DestinationTooSmall: - default: - Debug.Fail("Unexpected OperationStatus return value."); - break; - } - - // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. - // Because we have validated above that partialCodePointBuffer will be the next consumed chars during Rune decoding - // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to - // parse the code units without extra data. - // - // This is relevant in the case of having ['\uD800', 'C'], where the validation above would have needed all both code units - // to determine that only the first unit should be consumed (as invalid). So this method will get only ['\uD800']. - // Because we know more data will not be able to complete this code point, we need to pass isFinalSegment = true - // to ensure that the encoder consumes this data eagerly instead of leaving it and returning NeedsMoreData. 
- WriteStringSegmentEscape(partialCodePointBuffer, true); - } - else - { - WriteInvalidPartialCodePoint(); - } + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.Utf16); + + scoped ReadOnlySpan partialCodePointBuffer = PartialUtf16CodePoint; + + Span combinedBuffer = stackalloc char[2]; + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, value, combinedBuffer)); + + switch (Rune.DecodeFromUtf16(combinedBuffer, out _, out int charsConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(value.Length + partialCodePointBuffer.Length < 2); + Debug.Assert(charsConsumed == value.Length + partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + value = combinedBuffer.Slice(0, charsConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(charsConsumed > partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + // Divide up the code point chars into its own buffer and the remainder of the input buffer. + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(charsConsumed >= partialCodePointBuffer.Length); + Debug.Assert(charsConsumed <= 2); + value = value.Slice(charsConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, charsConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. 
+ // Because we have validated above that partialCodePointBuffer will be the next consumed chars during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having ['\uD800', 'C'], where the validation above would have needed both code units + // to determine that only the first unit should be consumed (as invalid). So this method will get only ['\uD800']. + // Because we know more data will not be able to complete this code point, we need to pass isFinalSegment = true + // to ensure that the encoder consumes this data eagerly instead of leaving it and returning NeedsMoreData. + WriteStringSegmentEscape(partialCodePointBuffer, true); ClearPartialCodePoint(); @@ -155,7 +163,7 @@ private void WriteStringSegmentEscapeValue(ReadOnlySpan value, int firstEs { Debug.Assert(!isFinalSegment); Debug.Assert(value.Length - consumed < 2); - SetPartialUtf16CodePoint(value.Slice(consumed)); + PartialUtf16CodePoint = value.Slice(consumed); } if (valueArray != null) @@ -189,7 +197,8 @@ private void WriteStringSegmentData(ReadOnlySpan escapedValue) /// Thrown when the specified value is too large. /// /// - /// Thrown if this would result in invalid JSON being written (while validation is enabled). + /// Thrown if this would result in invalid JSON being written (while validation is enabled) or + /// if the previously written segment (if any) was not written with this same overload. /// /// /// The value is escaped before writing. 
@@ -198,16 +207,25 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen { JsonWriterHelper.ValidateValue(value); - if (!_options.SkipValidation && _tokenType != Utf8JsonWriter.StringSegmentSentinel) - { - ValidateWritingValue(); - } - if (_tokenType != Utf8JsonWriter.StringSegmentSentinel) { + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.None); + Debug.Assert(!HasPartialCodePoint); + + if (!_options.SkipValidation) + { + ValidateWritingValue(); + } + WriteStringSegmentPrologue(); + + PreviousSegmentEncoding = SegmentEncoding.Utf8; _tokenType = Utf8JsonWriter.StringSegmentSentinel; } + else + { + ValidateEncodingDidNotChange(SegmentEncoding.Utf8); + } // The steps to write a string segment are to complete the previous partial code point // and escape either of which might not be required so there is a fast path for each of these steps. @@ -223,7 +241,9 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen if (isFinalSegment) { WriteStringSegmentEpilogue(); + SetFlagToAddListSeparatorBeforeNextItem(); + PreviousSegmentEncoding = SegmentEncoding.None; _tokenType = JsonTokenType.String; } } @@ -231,56 +251,52 @@ public void WriteStringValueSegment(ReadOnlySpan value, bool isFinalSegmen private void WriteStringSegmentWithLeftover(scoped ReadOnlySpan utf8Value, bool isFinalSegment) { Debug.Assert(HasPartialCodePoint); - - if (TryGetPartialUtf8CodePoint(out scoped ReadOnlySpan partialCodePointBuffer)) - { - Span combinedBuffer = stackalloc byte[4]; - combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, utf8Value, combinedBuffer)); - - switch (Rune.DecodeFromUtf8(combinedBuffer, out _, out int bytesConsumed)) - { - case OperationStatus.NeedMoreData: - Debug.Assert(utf8Value.Length + partialCodePointBuffer.Length < 4); - Debug.Assert(bytesConsumed == utf8Value.Length + partialCodePointBuffer.Length); - // Let the encoder deal with the error if this is a final buffer. 
- utf8Value = combinedBuffer.Slice(0, bytesConsumed); - partialCodePointBuffer = ReadOnlySpan.Empty; - break; - case OperationStatus.Done: - Debug.Assert(bytesConsumed > partialCodePointBuffer.Length); - Debug.Assert(bytesConsumed <= 4); - // Divide up the code point bytes into its own buffer and the remainder of the input buffer. - utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); - break; - case OperationStatus.InvalidData: - Debug.Assert(bytesConsumed >= partialCodePointBuffer.Length); - Debug.Assert(bytesConsumed <= 4); - utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); - partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); - break; - case OperationStatus.DestinationTooSmall: - default: - Debug.Fail("Unexpected OperationStatus return value."); - break; - } - - // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. - // Because we have validated above that partialCodePointBuffer will be the next consumed bytes during Rune decoding - // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to - // parse the code units without extra data. - // - // This is relevant in the case of having [<3-length prefix code unit>, , ], where the validation - // above would have needed all 3 code units to determine that only the first 2 units should be consumed (as invalid). - // So this method will get only <3-size prefix code unit>. Because we know more data will not be able - // to complete this code point, we need to pass isFinalSegment = true to ensure that the encoder consumes this data eagerly - // instead of leaving it and returning NeedsMoreData. 
- WriteStringSegmentEscape(partialCodePointBuffer, true); - } - else - { - WriteInvalidPartialCodePoint(); - } + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.Utf8); + + scoped ReadOnlySpan partialCodePointBuffer = PartialUtf8CodePoint; + + Span combinedBuffer = stackalloc byte[4]; + combinedBuffer = combinedBuffer.Slice(0, ConcatInto(partialCodePointBuffer, utf8Value, combinedBuffer)); + + switch (Rune.DecodeFromUtf8(combinedBuffer, out _, out int bytesConsumed)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(utf8Value.Length + partialCodePointBuffer.Length < 4); + Debug.Assert(bytesConsumed == utf8Value.Length + partialCodePointBuffer.Length); + // Let the encoder deal with the error if this is a final buffer. + utf8Value = combinedBuffer.Slice(0, bytesConsumed); + partialCodePointBuffer = ReadOnlySpan.Empty; + break; + case OperationStatus.Done: + Debug.Assert(bytesConsumed > partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + // Divide up the code point bytes into its own buffer and the remainder of the input buffer. + utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.InvalidData: + Debug.Assert(bytesConsumed >= partialCodePointBuffer.Length); + Debug.Assert(bytesConsumed <= 4); + utf8Value = utf8Value.Slice(bytesConsumed - partialCodePointBuffer.Length); + partialCodePointBuffer = combinedBuffer.Slice(0, bytesConsumed); + break; + case OperationStatus.DestinationTooSmall: + default: + Debug.Fail("Unexpected OperationStatus return value."); + break; + } + + // The "isFinalSegment" argument indicates whether input that NeedsMoreData should be consumed as an error or not. 
+ // Because we have validated above that partialCodePointBuffer will be the next consumed bytes during Rune decoding + // (even if this is because it is invalid), we should pass isFinalSegment = true to indicate to the decoder to + // parse the code units without extra data. + // + // This is relevant in the case of having [<3-length prefix code unit>, , ], where the validation + // above would have needed all 3 code units to determine that only the first 2 units should be consumed (as invalid). + // So this method will get only <3-size prefix code unit>. Because we know more data will not be able + // to complete this code point, we need to pass isFinalSegment = true to ensure that the encoder consumes this data eagerly + // instead of leaving it and returning NeedsMoreData. + WriteStringSegmentEscape(partialCodePointBuffer, true); ClearPartialCodePoint(); @@ -321,7 +337,7 @@ private void WriteStringSegmentEscapeValue(ReadOnlySpan utf8Value, int fir { Debug.Assert(!isFinalSegment); Debug.Assert(utf8Value.Length - consumed < 4); - SetPartialUtf8CodePoint(utf8Value.Slice(consumed)); + PartialUtf8CodePoint = utf8Value.Slice(consumed); } if (valueArray != null) diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs index db207eba8e6b3e..da9dc4b6bac503 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.cs @@ -5,14 +5,11 @@ using System.Diagnostics; using System.IO; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using System.ComponentModel; -#if !NET -using System.Runtime.InteropServices; -#endif - namespace System.Text.Json { /// @@ -41,6 +38,13 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable // A special value for 
JsonTokenType that lets the writer keep track of string segments. private const JsonTokenType StringSegmentSentinel = (JsonTokenType)255; + // Masks and flags for the length and encoding of the partial code point + private const byte PartialCodePointLengthMask = 0b000_000_11; + private const byte PartialCodePointEncodingMask = 0b000_111_00; + + private const byte PartialCodePointUtf8EncodingFlag = 0b000_001_00; + private const byte PartialCodePointUtf16EncodingFlag = 0b000_010_00; + private IBufferWriter? _output; private Stream? _stream; private ArrayBufferWriter? _arrayBufferWriter; @@ -54,7 +58,7 @@ public sealed partial class Utf8JsonWriter : IDisposable, IAsyncDisposable /// /// This 3-byte array stores the partial code point leftover when writing a string value - /// segment that is split across multiple write calls. + /// segment that is split across multiple segment write calls. /// #if !NET private byte[]? _partialCodePoint; @@ -71,7 +75,9 @@ private struct Inline3ByteArray #endif /// - /// Stores the length and encoding of the partial code point. + /// Stores the length and encoding of the partial code point. Outside of segment writes, this value is 0. + /// Across segment writes, this value is always non-zero even if the length is 0, to indicate the encoding of the segment. + /// This allows detection of encoding changes across segment writes. /// private byte _partialCodePointFlags; @@ -121,6 +127,91 @@ private struct Inline3ByteArray /// public int CurrentDepth => _currentDepth & JsonConstants.RemoveFlagsBitMask; + /// + /// Length of the partial code point. + /// + private byte PartialCodePointLength + { + get => (byte)(_partialCodePointFlags & PartialCodePointLengthMask); + set => _partialCodePointFlags = (byte)((_partialCodePointFlags & ~PartialCodePointLengthMask) | (byte)value); + } + + /// + /// The partial UTF-8 code point. 
+ /// + private ReadOnlySpan PartialUtf8CodePoint + { + get + { + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.Utf8); + + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 3); + + byte length = PartialCodePointLength; + Debug.Assert(length < 4); + + return partialCodePointBytes.Slice(0, length); + } + + set + { + Debug.Assert(value.Length <= 3); + + Span partialCodePointBytes = PartialCodePointRaw; + + value.CopyTo(partialCodePointBytes); + PartialCodePointLength = (byte)value.Length; + } + } + + /// + /// The partial UTF-16 code point. + /// + private ReadOnlySpan PartialUtf16CodePoint + { + get + { + Debug.Assert(PreviousSegmentEncoding == SegmentEncoding.Utf16); + + ReadOnlySpan partialCodePointBytes = PartialCodePointRaw; + Debug.Assert(partialCodePointBytes.Length == 3); + + byte length = PartialCodePointLength; + Debug.Assert(length is 2 or 0); + + return MemoryMarshal.Cast(partialCodePointBytes.Slice(0, length)); + } + set + { + Debug.Assert(value.Length <= 1); + + Span partialCodePointBytes = PartialCodePointRaw; + + value.CopyTo(MemoryMarshal.Cast(partialCodePointBytes)); + PartialCodePointLength = (byte)(2 * value.Length); + } + } + + /// + /// Encoding used for the previous string segment write. + /// + private SegmentEncoding PreviousSegmentEncoding + { + get => (SegmentEncoding)(_partialCodePointFlags & PartialCodePointEncodingMask); + set => _partialCodePointFlags = (byte)((_partialCodePointFlags & ~PartialCodePointEncodingMask) | (byte)value); + } + + /// + /// Convenience enumeration to track the encoding of the partial code point. This must be kept in sync with the PartialCodePoint*Encoding flags. 
+ /// + internal enum SegmentEncoding : byte + { + None = 0, + Utf8 = PartialCodePointUtf8EncodingFlag, + Utf16 = PartialCodePointUtf16EncodingFlag, + } + private Utf8JsonWriter() { } @@ -299,7 +390,8 @@ private void ResetHelper() _bitStack = default; - ClearPartialCodePoint(); + _partialCodePoint = default; + _partialCodePointFlags = default; } private void CheckNotDisposed() diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs index 5582aae7501d9f..9d95eeb0f9a26a 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.Values.StringSegment.cs @@ -820,39 +820,20 @@ public static void WriteStringValueSegment_MixEncoding() var output = new ArrayBufferWriter(); using var jsonUtf8 = new Utf8JsonWriter(output); - // Because the first code point is a surrogate pair, it will be - // saved until the next write to complete it. It is saved in the - // original encoding, UTF-16, so it will be 0b1101_1000 0b1101_1000 + // High surrogate jsonUtf8.WriteStringValueSegment("\uD8D8".AsSpan(), false); - // Now we write a UTF-8 continuation byte. With the previous partial - // state, the whole sequence is 0b110_11000 0b110_11000 0b10_111111. - jsonUtf8.WriteStringValueSegment([0b10_111111], true); - - jsonUtf8.Flush(); - - // If this is interpreted as UTF-8, the first byte is invalid because - // it is a 2-byte start unit but the second byte is not a continuation. - // So a replacement character gets written for the first byte. The second and - // third units are valid and get written as is. Instead, if this is - // handled correctly, two replacement characters will be written. 
- - JsonTestHelper.AssertContents(@"""\uFFFD\uFFFD""", output); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment([0b10_111111], true)); } { var output = new ArrayBufferWriter(); using var jsonUtf8 = new Utf8JsonWriter(output); - // The second UTF-16 code unit, when interpreted as UTF-8, is a continuation, - // so if the first and second code units are decoded together, they will - // form a valid 3-byte sequence. + // Start of a 3-byte sequence jsonUtf8.WriteStringValueSegment([0b1110_1111], false); - jsonUtf8.WriteStringValueSegment("\u8080".AsSpan(), true); - jsonUtf8.Flush(); - - JsonTestHelper.AssertContents(@"""\uFFFD\u8080""", output); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment("\u8080".AsSpan(), true)); } { @@ -867,10 +848,8 @@ public static void WriteStringValueSegment_MixEncoding() jsonUtf8.Flush(); JsonTestHelper.AssertContents("\"", output); - // Writing empty UTF-16 sequence will dump the partial UTF-8 code point - jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); - jsonUtf8.Flush(); - JsonTestHelper.AssertContents(@"""\uFFFD", output); + // Writing empty UTF-16 sequence will throw + Assert.Throws(() => jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false)); } { @@ -886,9 +865,7 @@ public static void WriteStringValueSegment_MixEncoding() JsonTestHelper.AssertContents("\"", output); // Writing empty UTF-8 sequence will dump the partial UTF-16 code point - jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false); - jsonUtf8.Flush(); - JsonTestHelper.AssertContents(@"""\uFFFD", output); + Assert.Throws(() => jsonUtf8.WriteStringValueSegment(ReadOnlySpan.Empty, false)); } } From c0a700c4c366104f4eadbf989d4c597fe2fe5a78 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Mon, 23 Dec 2024 18:08:43 -0800 Subject: [PATCH 20/21] update fuzzer to assert that mixing encodings always throws --- src/libraries/Fuzzing/DotnetFuzzing/Assert.cs | 20 +++++++++++++++++++ .../Fuzzers/Utf8JsonWriterFuzzer.cs | 15 +++----------- 2 
files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs b/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs index a5f2a9dd1d195b..e4191707281596 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs @@ -45,4 +45,24 @@ static void Throw(ReadOnlySpan expected, ReadOnlySpan actual) throw new Exception($"Expected={expected[diffIndex]} Actual={actual[diffIndex]} at index {diffIndex}"); } } + + public static void Throws(Action action, TState state) + where T : Exception + where TState : allows ref struct + { + try + { + action(state); + } + catch (T) + { + return; + } + catch (Exception ex) + { + throw new Exception($"Expected exception of type {typeof(T).Name} but got {ex.GetType().Name}"); + } + + throw new Exception($"Expected exception of type {typeof(T).Name} but no exception was thrown"); + } } diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs index afecd70d7afd8d..767821bcf875b1 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs @@ -118,19 +118,15 @@ public void FuzzTarget(ReadOnlySpan bytes) ReadOnlySpan firstSegment = chars[slice1..]; ReadOnlySpan secondSegment = bytes[0..(2 * slice1)]; - expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder)); + expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, expectedBuffer, options.Encoder)); actualBuffer = new byte[expected.Length]; using MemoryStream stream = new(actualBuffer); using Utf8JsonWriter writer = new(stream, options); writer.WriteStringValueSegment(firstSegment, false); - writer.WriteStringValueSegment(secondSegment, true); - writer.Flush(); - Assert.SequenceEqual(expected, actualBuffer); - Assert.Equal(expected.Length, 
writer.BytesCommitted); - Assert.Equal(0, writer.BytesPending); + Assert.Throws>(state => writer.WriteStringValueSegment(state, true), secondSegment); } Array.Clear(expectedBuffer); @@ -146,12 +142,7 @@ public void FuzzTarget(ReadOnlySpan bytes) using Utf8JsonWriter writer = new(stream, options); writer.WriteStringValueSegment(firstSegment, false); - writer.WriteStringValueSegment(secondSegment, true); - writer.Flush(); - - Assert.SequenceEqual(expected, actualBuffer); - Assert.Equal(expected.Length, writer.BytesCommitted); - Assert.Equal(0, writer.BytesPending); + Assert.Throws>(state => writer.WriteStringValueSegment(state, true), secondSegment); } } From 4d8a0471b72a7108eb829efc4fa73b6854738e98 Mon Sep 17 00:00:00 2001 From: Pranav Senthilnathan Date: Thu, 26 Dec 2024 10:05:09 -0800 Subject: [PATCH 21/21] pr comments --- src/libraries/Fuzzing/DotnetFuzzing/Assert.cs | 12 ++++++------ .../System.Text.Json/src/Resources/Strings.resx | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs b/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs index e4191707281596..2814de3f08bf49 100644 --- a/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs +++ b/src/libraries/Fuzzing/DotnetFuzzing/Assert.cs @@ -46,23 +46,23 @@ static void Throw(ReadOnlySpan expected, ReadOnlySpan actual) } } - public static void Throws(Action action, TState state) - where T : Exception + public static TException Throws(Action action, TState state) + where TException : Exception where TState : allows ref struct { try { action(state); } - catch (T) + catch (TException ex) { - return; + return ex; } catch (Exception ex) { - throw new Exception($"Expected exception of type {typeof(T).Name} but got {ex.GetType().Name}"); + throw new Exception($"Expected exception of type {typeof(TException).Name} but got {ex.GetType().Name}"); } - throw new Exception($"Expected exception of type {typeof(T).Name} but no exception was thrown"); + throw new 
Exception($"Expected exception of type {typeof(TException).Name} but no exception was thrown"); } } diff --git a/src/libraries/System.Text.Json/src/Resources/Strings.resx b/src/libraries/System.Text.Json/src/Resources/Strings.resx index 39325a7e3f70e3..cd8a5e392ca66e 100644 --- a/src/libraries/System.Text.Json/src/Resources/Strings.resx +++ b/src/libraries/System.Text.Json/src/Resources/Strings.resx @@ -810,7 +810,7 @@ Offset and length were out of bounds for the array or count is greater than the number of elements from index to the end of the source collection. - Cannot write the requested JSON property or value until the final string value segment has been written. + Writing a JSON property or value before writing the final string value segment is not supported. Object must be of type Rune. @@ -825,6 +825,6 @@ Cannot extract a Unicode scalar value from the specified index in the input. - Cannot mix encodings between string value segments. The previous segment's encoding was '{0}' and the current segment's encoding is '{1}'. + Mixing UTF encodings in a single multi-segment JSON string is not supported. The previous segment's encoding was '{0}' and the current segment's encoding is '{1}'. \ No newline at end of file