diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
index 6afe1409e75..792bb750b5c 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/Contents/DataUriParser.cs
@@ -23,6 +23,11 @@ internal static class DataUriParser
{
public static string Scheme => "data:";
+ /// <summary>
+ /// The media type assumed, per RFC 2397, when a data URI omits its media type.
+ /// </summary>
+ public const string DefaultMediaType = "text/plain;charset=US-ASCII";
+
public static DataUri Parse(ReadOnlyMemory dataUri)
{
// Validate, then trim off the "data:" scheme.
@@ -59,9 +64,14 @@ public static DataUri Parse(ReadOnlyMemory dataUri)
}
// Validate the media type, if present.
+ // Per RFC 2397, if the media type is omitted, it defaults to "text/plain;charset=US-ASCII".
ReadOnlySpan span = metadata.Span.Trim();
string? mediaType = null;
- if (!span.IsEmpty && !IsValidMediaType(span, ref mediaType))
+ if (span.IsEmpty)
+ {
+ mediaType = DefaultMediaType;
+ }
+ else if (!IsValidMediaType(span, ref mediaType))
{
throw new UriFormatException("Invalid data URI format: the media type is not a valid.");
}
@@ -91,6 +101,7 @@ public static bool IsValidMediaType(ReadOnlySpan mediaTypeSpan, [NotNull]
// For common media types, we can avoid both allocating a string for the span and avoid parsing overheads.
string? knownType = mediaTypeSpan switch
{
+ DefaultMediaType => DefaultMediaType,
"application/json" => "application/json",
"application/octet-stream" => "application/octet-stream",
"application/pdf" => "application/pdf",
diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
index d87d776185a..3e6fa85a489 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Contents/DataContentTests.cs
@@ -109,6 +109,35 @@ public void Ctor_NoMediaType_Roundtrips()
Assert.Equal("aGVsbG8=", content.Base64Data.ToString());
}
+ [Theory]
+ [InlineData("data:,hello", "hello")]
+ [InlineData("data:;base64,aGVsbG8=", "hello")]
+ [InlineData("data:,hello%20world", "hello world")]
+ [InlineData("data:,", "")]
+ [InlineData("data:;base64,", "")]
+ public void Ctor_OmittedMediaType_DefaultsToTextPlain(string uri, string expectedData)
+ {
+ // Per RFC 2397, an omitted media type defaults to "text/plain;charset=US-ASCII"; the payload is compared as UTF-8 text.
+ static void Validate(DataContent content, string expectedData)
+ {
+ Assert.Equal("text/plain;charset=US-ASCII", content.MediaType);
+ Assert.Equal(expectedData, Encoding.UTF8.GetString(content.Data.ToArray()));
+ }
+
+ Validate(new DataContent(uri), expectedData); // string constructor
+ Validate(new DataContent(new Uri(uri)), expectedData); // Uri constructor
+ }
+
+ [Theory]
+ [InlineData("data:,hello", "application/json")]
+ [InlineData("data:;base64,aGVsbG8=", "application/octet-stream")]
+ public void Ctor_OmittedMediaType_CanBeOverridden(string uri, string mediaType)
+ {
+ // When the URI omits its media type but one is passed as an argument, the resulting content must report the argument's media type.
+ var content = new DataContent(uri, mediaType);
+ Assert.Equal(mediaType, content.MediaType);
+ }
+
[Fact]
public void Serialize_MatchesExpectedJson()
{