Skip to content

Commit 72f930a

Browse files
authored
Introduce Microsoft.Extensions.DataIngestion.Abstractions (#6949)
1 parent 1523a41 commit 72f930a

File tree

12 files changed

+730
-0
lines changed

12 files changed

+730
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Diagnostics;
7+
using Microsoft.Shared.Diagnostics;
8+
9+
namespace Microsoft.Extensions.DataIngestion;
10+
11+
/// <summary>
/// Represents a chunk of content extracted from an <see cref="IngestionDocument"/>.
/// </summary>
/// <typeparam name="T">The type of the content.</typeparam>
[DebuggerDisplay("Content = {Content}")]
public sealed class IngestionChunk<T>
{
    // Created on first access to Metadata, so a chunk whose metadata is never touched holds no dictionary.
    private Dictionary<string, object>? _metadata;

    /// <summary>
    /// Initializes a new instance of the <see cref="IngestionChunk{T}"/> class.
    /// </summary>
    /// <param name="content">The content of the chunk.</param>
    /// <param name="document">The document from which this chunk was extracted.</param>
    /// <param name="context">Additional context for the chunk.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="content"/> or <paramref name="document"/> is <see langword="null"/>.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// <typeparamref name="T"/> is <see cref="string"/> and <paramref name="content"/> is an empty string.
    /// </exception>
    /// <remarks>
    /// String content is validated with a null-or-empty check only, so a string that consists
    /// solely of white-space characters is not rejected by this constructor.
    /// </remarks>
    public IngestionChunk(T content, IngestionDocument document, string? context = null)
    {
        // The branch is selected by the type argument, not by the runtime type of the value:
        // the emptiness check applies only when T is exactly string.
        if (typeof(T) == typeof(string))
        {
            // The round-trip through object is needed because the compiler cannot convert T to string directly.
            Content = (T)(object)Throw.IfNullOrEmpty((string)(object)content!);
        }
        else
        {
            Content = Throw.IfNull(content);
        }

        Document = Throw.IfNull(document);
        Context = context;
    }

    /// <summary>
    /// Gets the content of the chunk.
    /// </summary>
    public T Content { get; }

    /// <summary>
    /// Gets the document from which this chunk was extracted.
    /// </summary>
    public IngestionDocument Document { get; }

    /// <summary>
    /// Gets additional context for the chunk.
    /// </summary>
    public string? Context { get; }

    /// <summary>
    /// Gets a value indicating whether this chunk has metadata.
    /// </summary>
    /// <remarks>
    /// Reading this property does not allocate the metadata dictionary.
    /// </remarks>
    public bool HasMetadata => _metadata is { Count: > 0 };

    /// <summary>
    /// Gets the metadata associated with this chunk.
    /// </summary>
    /// <remarks>
    /// The dictionary is created the first time this property is read.
    /// </remarks>
    public IDictionary<string, object> Metadata => _metadata ??= [];
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Collections.Generic;
5+
using System.Threading;
6+
7+
namespace Microsoft.Extensions.DataIngestion;
8+
9+
/// <summary>
/// Base class for a pipeline stage that processes chunks.
/// </summary>
/// <typeparam name="T">The type of the chunk content.</typeparam>
public abstract class IngestionChunkProcessor<T>
{
    /// <summary>
    /// Asynchronously processes a sequence of chunks.
    /// </summary>
    /// <param name="chunks">The chunks to process.</param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>The processed chunks.</returns>
    public abstract IAsyncEnumerable<IngestionChunk<T>> ProcessAsync(
        IAsyncEnumerable<IngestionChunk<T>> chunks,
        CancellationToken cancellationToken = default);
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Threading;
7+
using System.Threading.Tasks;
8+
9+
namespace Microsoft.Extensions.DataIngestion;
10+
11+
/// <summary>
/// Base class for components that write chunks to a destination.
/// </summary>
/// <typeparam name="T">The type of the chunk content.</typeparam>
public abstract class IngestionChunkWriter<T> : IDisposable
{
    /// <summary>
    /// Asynchronously writes the supplied chunks.
    /// </summary>
    /// <param name="chunks">The chunks to write.</param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>A task representing the asynchronous write operation.</returns>
    public abstract Task WriteAsync(
        IAsyncEnumerable<IngestionChunk<T>> chunks,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Disposes the writer and releases all associated resources.
    /// </summary>
    public void Dispose()
    {
        Dispose(disposing: true);

        // Tell the GC that cleanup already ran, in case a derived type declares a finalizer.
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the resources held by the writer.
    /// </summary>
    /// <param name="disposing">
    /// <see langword="true"/> when called from <see cref="Dispose()"/>;
    /// <see langword="false"/> when called from a finalizer.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        // The base class holds nothing to release; derived writers can override this method.
    }
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Collections.Generic;
5+
using System.Threading;
6+
7+
namespace Microsoft.Extensions.DataIngestion;
8+
9+
/// <summary>
/// Base class for components that split an <see cref="IngestionDocument"/> into chunks.
/// </summary>
/// <typeparam name="T">The type of the chunk content.</typeparam>
public abstract class IngestionChunker<T>
{
    /// <summary>
    /// Asynchronously splits a document into chunks.
    /// </summary>
    /// <param name="document">The document to split.</param>
    /// <param name="cancellationToken">The token to monitor for cancellation requests.</param>
    /// <returns>The chunks created from the document.</returns>
    public abstract IAsyncEnumerable<IngestionChunk<T>> ProcessAsync(
        IngestionDocument document,
        CancellationToken cancellationToken = default);
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Collections.Generic;
6+
using Microsoft.Shared.Diagnostics;
7+
8+
namespace Microsoft.Extensions.DataIngestion;
9+
10+
/// <summary>
/// A format-agnostic container that normalizes diverse input formats into a structured hierarchy.
/// </summary>
public sealed class IngestionDocument
{
    /// <summary>
    /// Initializes a new instance of the <see cref="IngestionDocument"/> class.
    /// </summary>
    /// <param name="identifier">The unique identifier for the document.</param>
    /// <exception cref="ArgumentNullException"><paramref name="identifier"/> is <see langword="null"/>.</exception>
    /// <exception cref="ArgumentException"><paramref name="identifier"/> is an empty string.</exception>
    public IngestionDocument(string identifier)
    {
        Identifier = Throw.IfNullOrEmpty(identifier);
    }

    /// <summary>
    /// Gets the unique identifier for the document.
    /// </summary>
    public string Identifier { get; }

    /// <summary>
    /// Gets the sections of the document.
    /// </summary>
    public IList<IngestionDocumentSection> Sections { get; } = [];

    /// <summary>
    /// Enumerates all elements in the document, including those in nested sections.
    /// </summary>
    /// <returns>An enumerable collection of elements.</returns>
    /// <remarks>
    /// Sections themselves are not included; only their non-section descendants are returned.
    /// Elements are produced depth-first, in the order in which they appear in
    /// <see cref="Sections"/> and in each section's element list. The traversal is lazy:
    /// it runs as the returned sequence is enumerated.
    /// </remarks>
    public IEnumerable<IngestionDocumentElement> EnumerateContent()
    {
        // An explicit stack is used instead of recursion, so nesting depth does not grow the call stack.
        Stack<IngestionDocumentElement> elementsToProcess = new();

        // Push in reverse so that the first section is popped (and therefore visited) first.
        for (int sectionIndex = Sections.Count - 1; sectionIndex >= 0; sectionIndex--)
        {
            elementsToProcess.Push(Sections[sectionIndex]);
        }

        while (elementsToProcess.Count > 0)
        {
            IngestionDocumentElement currentElement = elementsToProcess.Pop();

            if (currentElement is not IngestionDocumentSection nestedSection)
            {
                yield return currentElement;
            }
            else
            {
                // Expand the section in place: its children are pushed in reverse so that
                // they are popped in their original order, ahead of the section's siblings.
                for (int i = nestedSection.Elements.Count - 1; i >= 0; i--)
                {
                    elementsToProcess.Push(nestedSection.Elements[i]);
                }
            }
        }
    }
}

0 commit comments

Comments
 (0)