diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index b75fc2e7f50..e6a14bfbf17 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -6,6 +6,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; +using Microsoft.Extensions.AI; using Microsoft.Shared.Diagnostics; using ModelContextProtocol.Client; using ModelContextProtocol.Protocol; @@ -42,21 +43,23 @@ public override async Task ReadAsync(FileInfo source, string throw new FileNotFoundException("The specified file does not exist.", source.FullName); } - // Read file content as base64 data URI + // Read file content and create DataContent #if NET - byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); + ReadOnlyMemory fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); #else - byte[] fileBytes; + ReadOnlyMemory fileBytes; using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous)) { - using MemoryStream ms = new(); + using MemoryStream ms = new((int)Math.Min(int.MaxValue, fs.Length)); await fs.CopyToAsync(ms).ConfigureAwait(false); - fileBytes = ms.ToArray(); + fileBytes = ms.GetBuffer().AsMemory(0, (int)ms.Length); } #endif - string dataUri = CreateDataUri(fileBytes, mediaType); + DataContent dataContent = new( + fileBytes, + string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!); - string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); + string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } @@ -67,31 +70,23 @@ public override async Task ReadAsync(Stream source, string id _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); - // Read stream content as base64 data URI - using MemoryStream ms = new(); + // Read stream content and create DataContent + using MemoryStream ms = source.CanSeek ? new((int)Math.Min(int.MaxValue, source.Length)) : new(); #if NET await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); #else await source.CopyToAsync(ms).ConfigureAwait(false); #endif - byte[] fileBytes = ms.ToArray(); - string dataUri = CreateDataUri(fileBytes, mediaType); + DataContent dataContent = new( + ms.GetBuffer().AsMemory(0, (int)ms.Length), + string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType); - string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); + string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } -#pragma warning disable S3995 // URI return values should not be strings - private static string CreateDataUri(byte[] fileBytes, string? mediaType) -#pragma warning restore S3995 // URI return values should not be strings - { - string base64Content = Convert.ToBase64String(fileBytes); - string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!; - return $"data:{mimeType};base64,{base64Content}"; - } - - private async Task ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken) + private async Task ConvertToMarkdownAsync(DataContent dataContent, CancellationToken cancellationToken) { // Create HTTP client transport for MCP HttpClientTransport transport = new(new HttpClientTransportOptions @@ -109,7 +104,7 @@ private async Task ConvertToMarkdownAsync(string dataUri, CancellationTo // Build parameters for convert_to_markdown tool Dictionary parameters = new() { - ["uri"] = dataUri + ["uri"] = dataContent.Uri }; // Call the convert_to_markdown tool